/*
 * Copyright (c) 2014-2017, Eren Okka
 * Copyright (c) 2016-2017, Paul Miller
 * Copyright (c) 2017-2018, Tyler Bratton
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

namespace AnitomySharp;

/// <summary>
/// Classifies the <see cref="Token"/>s in <see cref="Tokens"/> and records the
/// results as <see cref="Element"/>s in <see cref="Elements"/>.
/// </summary>
public class Parser
{
    /// <summary>
    /// Set by the episode-number search to whether an episode number element
    /// already existed when that search started.
    /// </summary>
    public bool IsEpisodeKeywordsFound { get; private set; }

    /// <summary>Helper constructed with this parser instance.</summary>
    public ParserHelper ParseHelper { get; }

    /// <summary>Number-parsing helper constructed with this parser instance.</summary>
    public ParserNumber ParseNumber { get; }

    /// <summary>The list that parsed elements are added to.</summary>
    public List<Element> Elements { get; }

    /// <summary>The list of tokens being classified.</summary>
    public List<Token> Tokens { get; }

    /// <summary>The parser options.</summary>
    private Options Options { get; }

    /// <summary>
    /// Constructs a new token parser.
    /// </summary>
    /// <param name="elements">the list where parsed elements will be added</param>
    /// <param name="options">the parser options</param>
    /// <param name="tokens">the list of tokens</param>
    public Parser(List<Element> elements, Options options, List<Token> tokens)
    {
        Elements = elements;
        Options = options;
        Tokens = tokens;

        // The helpers receive this instance, so they are created only after the
        // properties above have been assigned.
        ParseHelper = new ParserHelper(this);
        ParseNumber = new ParserNumber(this);
    }

    /// <summary>
    /// Runs the parsing passes in order: keywords, isolated numbers, episode number
    /// (optional), anime title, release group (optional), episode title (optional),
    /// and finally element validation.
    /// </summary>
    /// <returns>
    /// <c>true</c> when no anime title element is present after parsing;
    /// otherwise <c>false</c>.
    /// </returns>
    public bool Parse()
    {
        SearchForKeywords();
        SearchForIsolatedNumbers();

        if (Options.ParseEpisodeNumber)
        {
            SearchForEpisodeNumber();
        }

        SearchForAnimeTitle();

        if (Options.ParseReleaseGroup && Empty(ElementCategory.ElementReleaseGroup))
        {
            SearchForReleaseGroup();
        }

        if (Options.ParseEpisodeTitle && !Empty(ElementCategory.ElementEpisodeNumber))
        {
            SearchForEpisodeTitle();
        }

        ValidateElements();

        // NOTE(review): this yields true when NO anime title was found, which reads
        // like an inverted success flag. Left unchanged because callers may depend
        // on the current polarity - confirm before flipping.
        return Empty(ElementCategory.ElementAnimeTitle);
    }

    /// <summary>
    /// Searches the unknown tokens for known keywords, CRC32 checksums and video
    /// resolutions, and adds the matching elements.
    /// </summary>
    private void SearchForKeywords()
    {
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category != Token.TokenCategory.Unknown) continue;

            string word = token.Content.Trim(' ', '-');
            if (string.IsNullOrEmpty(word)) continue;

            // Don't bother if the word is a number that cannot be a CRC
            if (word.Length != 8 && StringHelper.IsNumericString(word)) continue;

            string keyword = KeywordManager.Normalize(word);
            var category = ElementCategory.ElementUnknown;
            var options = new KeywordOptions();

            if (KeywordManager.FindAndSet(keyword, ref category, ref options))
            {
                if (!Options.ParseReleaseGroup && category == ElementCategory.ElementReleaseGroup) continue;
                if (!ParserHelper.IsElementCategorySearchable(category) || !options.Searchable) continue;
                if (ParserHelper.IsElementCategorySingular(category) && !Empty(category)) continue;

                // Every "continue" below continues the enclosing for loop, i.e. the
                // keyword itself is not added as an element.
                switch (category)
                {
                    case ElementCategory.ElementAnimeSeasonPrefix:
                        ParseHelper.CheckAndSetAnimeSeasonKeyword(token, i);
                        continue;
                    // NOTE(review): when options.Valid is false this case does not
                    // match, so the keyword falls through and is added below as an
                    // ElementEpisodePrefix element, unlike the season and volume
                    // prefixes which are never added - confirm this is intended.
                    case ElementCategory.ElementEpisodePrefix when options.Valid:
                        ParseHelper.CheckExtentKeyword(ElementCategory.ElementEpisodeNumber, i, token);
                        continue;
                    case ElementCategory.ElementReleaseVersion:
                        // Drop the first character of the keyword and keep the rest
                        word = word[1..];
                        break;
                    case ElementCategory.ElementVolumePrefix:
                        ParseHelper.CheckExtentKeyword(ElementCategory.ElementVolumeNumber, i, token);
                        continue;
                }
            }
            else
            {
                if (Empty(ElementCategory.ElementFileChecksum) && ParserHelper.IsCrc32(word))
                {
                    category = ElementCategory.ElementFileChecksum;
                }
                else if (Empty(ElementCategory.ElementVideoResolution) && ParserHelper.IsResolution(word))
                {
                    category = ElementCategory.ElementVideoResolution;
                }
            }

            if (category == ElementCategory.ElementUnknown) continue;

            Elements.Add(new Element(category, word));
            if (options.Identifiable)
            {
                token.Category = Token.TokenCategory.Identifier;
            }
        }
    }

    /// <summary>
    /// Searches for the episode number, trying progressively weaker heuristics
    /// until one of them succeeds.
    /// </summary>
    private void SearchForEpisodeNumber()
    {
        // List the indices of all unknown tokens that contain a number
        var tokens = new List<int>();
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category == Token.TokenCategory.Unknown &&
                ParserHelper.IndexOfFirstDigit(token.Content) != -1)
            {
                tokens.Add(i);
            }
        }

        if (tokens.Count == 0) return;

        IsEpisodeKeywordsFound = !Empty(ElementCategory.ElementEpisodeNumber);

        // If a token matches a known episode pattern, it has to be the episode number
        if (ParseNumber.SearchForEpisodePatterns(tokens)) return;

        // We have previously found an episode number via keywords
        if (!Empty(ElementCategory.ElementEpisodeNumber)) return;

        // From now on, we're only interested in numeric tokens
        tokens.RemoveAll(index => !StringHelper.IsNumericString(Tokens[index].Content));

        // e.g. "01 (176)", "29 (04)"
        if (ParseNumber.SearchForEquivalentNumbers(tokens)) return;

        // e.g. " - 08"
        if (ParseNumber.SearchForSeparatedNumbers(tokens)) return;

        // e.g. "[12]", "(2006)"
        if (ParseNumber.SearchForIsolatedNumbers(tokens)) return;

        // Consider using the last number as a last resort
        ParseNumber.SearchForLastNumber(tokens);
    }

    /// <summary>
    /// Searches for the anime title and builds the anime title element from the
    /// token range that was found.
    /// </summary>
    private void SearchForAnimeTitle()
    {
        bool enclosedTitle = false;

        // Find the first non-enclosed unknown token
        int tokenBegin = Token.FindToken(Tokens, 0, Tokens.Count,
            Token.TokenFlag.FlagNotEnclosed, Token.TokenFlag.FlagUnknown);

        // If that doesn't work, find the first unknown token in the second enclosed
        // group, assuming that the first one is the release group
        if (!Token.InListRange(tokenBegin, Tokens))
        {
            tokenBegin = 0;
            enclosedTitle = true;
            bool skippedPreviousGroup = false;

            do
            {
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
                if (!Token.InListRange(tokenBegin, Tokens)) break;

                // Ignore groups that are composed of non-Latin characters
                if (StringHelper.IsMostlyLatinString(Tokens[tokenBegin].Content) && skippedPreviousGroup)
                {
                    break;
                }

                // Get the first unknown token of the next group
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket);
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
                skippedPreviousGroup = true;
            } while (Token.InListRange(tokenBegin, Tokens));
        }

        if (!Token.InListRange(tokenBegin, Tokens)) return;

        // Continue until an identifier (or a bracket, if the title is enclosed) is found
        int tokenEnd = Token.FindToken(
            Tokens,
            tokenBegin,
            Tokens.Count,
            Token.TokenFlag.FlagIdentifier,
            enclosedTitle ? Token.TokenFlag.FlagBracket : Token.TokenFlag.FlagNone);

        // If within the interval there's an open bracket without its matching pair,
        // move the upper endpoint back to the bracket
        if (!enclosedTitle)
        {
            int lastBracket = tokenEnd;
            bool bracketOpen = false;
            for (int i = tokenBegin; i < tokenEnd; i++)
            {
                if (Tokens[i].Category != Token.TokenCategory.Bracket) continue;
                lastBracket = i;
                bracketOpen = !bracketOpen;
            }

            if (bracketOpen) tokenEnd = lastBracket;
        }

        // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"),
        // move the upper endpoint back to the beginning of the group. We ignore
        // parentheses in order to keep certain groups (e.g. "(TV)") intact.
        if (!enclosedTitle)
        {
            int token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);

            while (ParseHelper.IsTokenCategory(token, Token.TokenCategory.Bracket) &&
                   Tokens[token].Content[0] != ')')
            {
                token = Token.FindPrevToken(Tokens, token, Token.TokenFlag.FlagBracket);

                // When no earlier bracket exists, re-evaluate the loop condition with
                // the out-of-range index instead of moving the endpoint.
                if (!Token.InListRange(token, Tokens)) continue;

                tokenEnd = token;
                token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);
            }
        }

        ParseHelper.BuildElement(ElementCategory.ElementAnimeTitle, false,
            Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
    }

    /// <summary>
    /// Searches for the release group: the first run of enclosed unknown tokens that
    /// starts right after a bracket and is terminated by a bracket.
    /// </summary>
    private void SearchForReleaseGroup()
    {
        for (int tokenBegin = 0, tokenEnd = tokenBegin; tokenBegin < Tokens.Count;)
        {
            // Find the first enclosed unknown token
            tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count,
                Token.TokenFlag.FlagEnclosed, Token.TokenFlag.FlagUnknown);
            if (!Token.InListRange(tokenBegin, Tokens)) return;

            // Continue until a bracket or identifier is found
            tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count,
                Token.TokenFlag.FlagBracket, Token.TokenFlag.FlagIdentifier);
            if (!Token.InListRange(tokenEnd, Tokens) ||
                Tokens[tokenEnd].Category != Token.TokenCategory.Bracket)
            {
                continue;
            }

            // Ignore if it's not the first non-delimiter token in group
            int prevToken = Token.FindPrevToken(Tokens, tokenBegin, Token.TokenFlag.FlagNotDelimiter);
            if (Token.InListRange(prevToken, Tokens) &&
                Tokens[prevToken].Category != Token.TokenCategory.Bracket)
            {
                continue;
            }

            ParseHelper.BuildElement(ElementCategory.ElementReleaseGroup, true,
                Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
            return;
        }
    }

    /// <summary>
    /// Searches for the episode title: the first run of non-enclosed unknown tokens
    /// that is not just a dash.
    /// </summary>
    private void SearchForEpisodeTitle()
    {
        int tokenBegin;
        int tokenEnd = 0;

        do
        {
            // Find the first non-enclosed unknown token
            tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count,
                Token.TokenFlag.FlagNotEnclosed, Token.TokenFlag.FlagUnknown);
            if (!Token.InListRange(tokenBegin, Tokens)) return;

            // Continue until a bracket or identifier is found
            tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count,
                Token.TokenFlag.FlagBracket, Token.TokenFlag.FlagIdentifier);

            // Ignore if it's only a dash
            if (tokenEnd - tokenBegin <= 2 && ParserHelper.IsDashCharacter(Tokens[tokenBegin].Content[0]))
            {
                continue;
            }

            ParseHelper.BuildElement(ElementCategory.ElementEpisodeTitle, false,
                Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
            return;
        } while (Token.InListRange(tokenBegin, Tokens));
    }

    /// <summary>
    /// Searches for isolated numeric tokens and classifies them as the anime year
    /// or the video resolution where applicable.
    /// </summary>
    private void SearchForIsolatedNumbers()
    {
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category != Token.TokenCategory.Unknown ||
                !StringHelper.IsNumericString(token.Content) ||
                !ParseHelper.IsTokenIsolated(i))
            {
                continue;
            }

            int number = StringHelper.StringToInt(token.Content);

            // Anime year (only the first one found is recorded)
            if (number >= ParserNumber.AnimeYearMin && number <= ParserNumber.AnimeYearMax)
            {
                if (Empty(ElementCategory.ElementAnimeYear))
                {
                    Elements.Add(new Element(ElementCategory.ElementAnimeYear, token.Content));
                    token.Category = Token.TokenCategory.Identifier;
                    continue;
                }
            }

            // Video resolution
            if (number != 480 && number != 720 && number != 1080) continue;

            // If these numbers are isolated, it's more likely for them to be the
            // video resolution rather than the episode number. Some fansub groups
            // use these without the "p" suffix.
            if (!Empty(ElementCategory.ElementVideoResolution)) continue;

            Elements.Add(new Element(ElementCategory.ElementVideoResolution, token.Content));
            token.Category = Token.TokenCategory.Identifier;
        }
    }

    /// <summary>
    /// Validates the anime type elements against the episode title. When the episode
    /// title is exactly an anime type value, the episode title elements are removed;
    /// when the episode title merely contains an anime type value that is a known
    /// anime type keyword, that anime type element is removed.
    /// </summary>
    private void ValidateElements()
    {
        if (Empty(ElementCategory.ElementAnimeType) || Empty(ElementCategory.ElementEpisodeTitle))
        {
            return;
        }

        string episodeTitle = Get(ElementCategory.ElementEpisodeTitle);
        bool episodeTitleIsInvalid = false;

        for (int i = 0; i < Elements.Count;)
        {
            var element = Elements[i];

            if (element.Category == ElementCategory.ElementAnimeType &&
                episodeTitle.Contains(element.Value))
            {
                if (episodeTitle.Length == element.Value.Length)
                {
                    // Invalid episode title. The removal is deferred until after the
                    // loop so the indices being walked here do not shift.
                    episodeTitleIsInvalid = true;
                }
                else if (KeywordManager.Contains(ElementCategory.ElementAnimeType,
                             KeywordManager.Normalize(element.Value)))
                {
                    // Invalid anime type. Remove exactly this element (not merely the
                    // first element of the same category) and stay on the same index,
                    // which now holds the next element.
                    Elements.RemoveAt(i);
                    continue;
                }
            }

            ++i;
        }

        if (episodeTitleIsInvalid)
        {
            Elements.RemoveAll(e => e.Category == ElementCategory.ElementEpisodeTitle);
        }
    }

    /// <summary>
    /// Returns whether <see cref="Elements"/> contains no element of the given category.
    /// </summary>
    /// <param name="category">the category to look for</param>
    /// <returns><c>true</c> when no element has this category; otherwise <c>false</c>.</returns>
    private bool Empty(ElementCategory category)
    {
        return Elements.All(element => element.Category != category);
    }

    /// <summary>
    /// Returns the value of the first element of the given category.
    /// </summary>
    /// <param name="category">the category to look for</param>
    /// <returns>
    /// the value of the first matching element, or an empty string when there is
    /// none. <see cref="Elements"/> is never modified.
    /// </returns>
    private string Get(ElementCategory category)
    {
        var foundElement = Elements.Find(element => element.Category == category);
        return foundElement != null ? foundElement.Value : "";
    }
}