MobileInternetTechnolody/Chiara/AnitomySharp/Parser.cs

431 lines
16 KiB
C#
Raw Normal View History

2024-07-04 15:27:19 +08:00
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// Class to classify <see cref="Token"/>s
/// </summary>
public class Parser
{
public bool IsEpisodeKeywordsFound { get; private set; }
public ParserHelper ParseHelper { get; }
public ParserNumber ParseNumber { get; }
public List<Element> Elements { get; }
public List<Token> Tokens { get; }
private Options Options { get; }
/// <summary>
/// Constructs a new token parser
/// </summary>
/// <param name="elements">the list where parsed elements will be added</param>
/// <param name="options">the parser options</param>
/// <param name="tokens">the list of tokens</param>
public Parser(List<Element> elements, Options options, List<Token> tokens)
{
Elements = elements;
Options = options;
Tokens = tokens;
ParseHelper = new ParserHelper(this);
ParseNumber = new ParserNumber(this);
}
/** Begins the parsing process */
public bool Parse()
{
SearchForKeywords();
SearchForIsolatedNumbers();
if (Options.ParseEpisodeNumber)
{
SearchForEpisodeNumber();
}
SearchForAnimeTitle();
if (Options.ParseReleaseGroup && Empty(ElementCategory.ElementReleaseGroup))
{
SearchForReleaseGroup();
}
if (Options.ParseEpisodeTitle && !Empty(ElementCategory.ElementEpisodeNumber))
{
SearchForEpisodeTitle();
}
ValidateElements();
return Empty(ElementCategory.ElementAnimeTitle);
}
/** Search for anime keywords. */
private void SearchForKeywords()
{
for (int i = 0; i < Tokens.Count; i++)
{
var token = Tokens[i];
if (token.Category != Token.TokenCategory.Unknown) continue;
string word = token.Content;
word = word.Trim(" -".ToCharArray());
if (string.IsNullOrEmpty(word)) continue;
// Don't bother if the word is a number that cannot be CRC
if (word.Length != 8 && StringHelper.IsNumericString(word)) continue;
string keyword = KeywordManager.Normalize(word);
var category = ElementCategory.ElementUnknown;
var options = new KeywordOptions();
if (KeywordManager.FindAndSet(keyword, ref category, ref options))
{
if (!Options.ParseReleaseGroup && category == ElementCategory.ElementReleaseGroup) continue;
if (!ParserHelper.IsElementCategorySearchable(category) || !options.Searchable) continue;
if (ParserHelper.IsElementCategorySingular(category) && !Empty(category)) continue;
switch (category)
{
case ElementCategory.ElementAnimeSeasonPrefix:
ParseHelper.CheckAndSetAnimeSeasonKeyword(token, i);
continue;
case ElementCategory.ElementEpisodePrefix when options.Valid:
ParseHelper.CheckExtentKeyword(ElementCategory.ElementEpisodeNumber, i, token);
continue;
case ElementCategory.ElementReleaseVersion:
word = word[1..];
break;
case ElementCategory.ElementVolumePrefix:
ParseHelper.CheckExtentKeyword(ElementCategory.ElementVolumeNumber, i, token);
continue;
}
}
else
{
if (Empty(ElementCategory.ElementFileChecksum) && ParserHelper.IsCrc32(word))
{
category = ElementCategory.ElementFileChecksum;
}
else if (Empty(ElementCategory.ElementVideoResolution) && ParserHelper.IsResolution(word))
{
category = ElementCategory.ElementVideoResolution;
}
}
if (category == ElementCategory.ElementUnknown) continue;
Elements.Add(new Element(category, word));
if (options.Identifiable)
{
token.Category = Token.TokenCategory.Identifier;
}
}
}
/** Search for episode number. */
private void SearchForEpisodeNumber()
{
// List all unknown tokens that contain a number
var tokens = new List<int>();
for (int i = 0; i < Tokens.Count; i++)
{
var token = Tokens[i];
if (token.Category == Token.TokenCategory.Unknown &&
ParserHelper.IndexOfFirstDigit(token.Content) != -1)
{
tokens.Add(i);
}
}
if (tokens.Count == 0) return;
IsEpisodeKeywordsFound = !Empty(ElementCategory.ElementEpisodeNumber);
// If a token matches a known episode pattern, it has to be the episode number
if (ParseNumber.SearchForEpisodePatterns(tokens)) return;
// We have previously found an episode number via keywords
if (!Empty(ElementCategory.ElementEpisodeNumber)) return;
// From now on, we're only interested in numeric tokens
tokens.RemoveAll(r => !StringHelper.IsNumericString(Tokens[r].Content));
// e.g. "01 (176)", "29 (04)"
if (ParseNumber.SearchForEquivalentNumbers(tokens)) return;
// e.g. " - 08"
if (ParseNumber.SearchForSeparatedNumbers(tokens)) return;
// "e.g. "[12]", "(2006)"
if (ParseNumber.SearchForIsolatedNumbers(tokens)) return;
// Consider using the last number as a last resort
ParseNumber.SearchForLastNumber(tokens);
}
/// <summary>
/// Search for anime title
/// </summary>
private void SearchForAnimeTitle()
{
bool enclosedTitle = false;
int tokenBegin = Token.FindToken(Tokens, 0, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
Token.TokenFlag.FlagUnknown);
// If that doesn't work, find the first unknown token in the second enclosed
// group, assuming that the first one is the release group
if (!Token.InListRange(tokenBegin, Tokens))
{
tokenBegin = 0;
enclosedTitle = true;
bool skippedPreviousGroup = false;
do
{
tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
if (!Token.InListRange(tokenBegin, Tokens)) break;
// Ignore groups that are composed of non-Latin characters
if (StringHelper.IsMostlyLatinString(Tokens[tokenBegin].Content) && skippedPreviousGroup)
{
break;
}
// Get the first unknown token of the next group
tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket);
tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
skippedPreviousGroup = true;
} while (Token.InListRange(tokenBegin, Tokens));
}
if (!Token.InListRange(tokenBegin, Tokens)) return;
// Continue until an identifier (or a bracket, if the title is enclosed) is found
int tokenEnd = Token.FindToken(
Tokens,
tokenBegin,
Tokens.Count,
Token.TokenFlag.FlagIdentifier,
enclosedTitle ? Token.TokenFlag.FlagBracket : Token.TokenFlag.FlagNone);
// If within the interval there's an open bracket without its matching pair,
// move the upper endpoint back to the bracket
if (!enclosedTitle)
{
int lastBracket = tokenEnd;
bool bracketOpen = false;
for (int i = tokenBegin; i < tokenEnd; i++)
{
if (Tokens[i].Category != Token.TokenCategory.Bracket) continue;
lastBracket = i;
bracketOpen = !bracketOpen;
}
if (bracketOpen) tokenEnd = lastBracket;
}
// If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"),
// move the upper endpoint back to the beginning of the group. We ignore
// parentheses in order to keep certain groups (e.g. "(TV)") intact.
if (!enclosedTitle)
{
int token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);
while (ParseHelper.IsTokenCategory(token, Token.TokenCategory.Bracket) && Tokens[token].Content[0] != ')')
{
token = Token.FindPrevToken(Tokens, token, Token.TokenFlag.FlagBracket);
if (!Token.InListRange(token, Tokens)) continue;
tokenEnd = token;
token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);
}
}
ParseHelper.BuildElement(ElementCategory.ElementAnimeTitle, false,
Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
}
/// <summary>
/// Search for release group
/// </summary>
private void SearchForReleaseGroup()
{
for (int tokenBegin = 0, tokenEnd = tokenBegin; tokenBegin < Tokens.Count;)
{
// Find the first enclosed unknown token
tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagEnclosed,
Token.TokenFlag.FlagUnknown);
if (!Token.InListRange(tokenBegin, Tokens)) return;
// Continue until a bracket or identifier is found
tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket,
Token.TokenFlag.FlagIdentifier);
if (!Token.InListRange(tokenEnd, Tokens) ||
Tokens[tokenEnd].Category != Token.TokenCategory.Bracket) continue;
// Ignore if it's not the first non-delimiter token in group
int prevToken = Token.FindPrevToken(Tokens, tokenBegin, Token.TokenFlag.FlagNotDelimiter);
if (Token.InListRange(prevToken, Tokens) &&
Tokens[prevToken].Category != Token.TokenCategory.Bracket) continue;
ParseHelper.BuildElement(ElementCategory.ElementReleaseGroup, true,
Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
return;
}
}
/// <summary>
/// Search for episode title
/// </summary>
private void SearchForEpisodeTitle()
{
int tokenBegin;
int tokenEnd = 0;
do
{
// Find the first non-enclosed unknown token
tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
Token.TokenFlag.FlagUnknown);
if (!Token.InListRange(tokenBegin, Tokens)) return;
// Continue until a bracket or identifier is found
tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket,
Token.TokenFlag.FlagIdentifier);
// Ignore if it's only a dash
if (tokenEnd - tokenBegin <= 2 && ParserHelper.IsDashCharacter(Tokens[tokenBegin].Content[0])) continue;
//if (tokenBegin.Pos == null || tokenEnd.Pos == null) continue;
ParseHelper.BuildElement(ElementCategory.ElementEpisodeTitle, false,
Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
return;
} while (Token.InListRange(tokenBegin, Tokens));
}
/// <summary>
/// Search for isolated numbers
/// </summary>
private void SearchForIsolatedNumbers()
{
for (int i = 0; i < Tokens.Count; i++)
{
var token = Tokens[i];
if (token.Category != Token.TokenCategory.Unknown || !StringHelper.IsNumericString(token.Content) ||
!ParseHelper.IsTokenIsolated(i))
{
continue;
}
int number = StringHelper.StringToInt(token.Content);
// Anime year
if (number >= ParserNumber.AnimeYearMin && number <= ParserNumber.AnimeYearMax)
{
if (Empty(ElementCategory.ElementAnimeYear))
{
Elements.Add(new Element(ElementCategory.ElementAnimeYear, token.Content));
token.Category = Token.TokenCategory.Identifier;
continue;
}
}
// Video resolution
if (number != 480 && number != 720 && number != 1080) continue;
// If these numbers are isolated, it's more likely for them to be the
// video resolution rather than the episode number. Some fansub groups use these without the "p" suffix.
if (!Empty(ElementCategory.ElementVideoResolution)) continue;
Elements.Add(new Element(ElementCategory.ElementVideoResolution, token.Content));
token.Category = Token.TokenCategory.Identifier;
}
}
/// <summary>
/// Validate Elements
/// </summary>
private void ValidateElements()
{
if (Empty(ElementCategory.ElementAnimeType) || Empty(ElementCategory.ElementEpisodeTitle))
{
return;
}
string episodeTitle = Get(ElementCategory.ElementEpisodeTitle);
for (int i = 0; i < Elements.Count;)
{
var el = Elements[i];
if (el.Category == ElementCategory.ElementAnimeType)
{
if (episodeTitle.Contains(el.Value))
{
if (episodeTitle.Length == el.Value.Length)
{
Elements.RemoveAll(element =>
element.Category == ElementCategory.ElementEpisodeTitle); // invalid episode title
}
else
{
string keyword = KeywordManager.Normalize(el.Value);
if (KeywordManager.Contains(ElementCategory.ElementAnimeType, keyword))
{
i = Erase(el); // invalid anime type
continue;
}
}
}
}
++i;
}
}
/// <summary>
/// Returns whether or not the parser contains this category
/// </summary>
/// <param name="category"></param>
/// <returns></returns>
private bool Empty(ElementCategory category)
{
return Elements.All(element => element.Category != category);
}
/// <summary>
/// Returns the value of a particular category
/// </summary>
/// <param name="category"></param>
/// <returns></returns>
private string Get(ElementCategory category)
{
var foundElement = Elements.Find(element => element.Category == category);
if (foundElement != null) return foundElement.Value;
Element e = new Element(category, "");
Elements.Add(e);
foundElement = e;
return foundElement.Value;
}
/// <summary>
/// Deletes the first element with the same <code>element.Category</code> and returns the deleted element's position.
/// </summary>
private int Erase(Element element)
{
int removedIdx = -1;
for (int i = 0; i < Elements.Count; i++)
{
var currentElement = Elements[i];
if (element.Category != currentElement.Category) continue;
removedIdx = i;
Elements.RemoveAt(i);
break;
}
return removedIdx;
}
}