/*
 * Copyright (c) 2014-2017, Eren Okka
 * Copyright (c) 2016-2017, Paul Miller
 * Copyright (c) 2017-2018, Tyler Bratton
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

using System.Globalization;
using System.Text.RegularExpressions;

namespace AnitomySharp;

/// <summary>
/// A utility class to assist in number parsing.
/// </summary>
public partial class ParserNumber(Parser parser)
{
    public const int AnimeYearMin = 1900;
    public const int AnimeYearMax = 2100;
    private const int EpisodeNumberMax = AnimeYearMax - 1;
    private const int VolumeNumberMax = 50;

    // Anchors that force a pattern to match the whole input.
    private const string RegexMatchOnlyStart = @"\A(?:";
    private const string RegexMatchOnlyEnd = @")\z";

    /// <summary>
    /// Returns whether or not the number is a valid volume number,
    /// i.e. not greater than <see cref="VolumeNumberMax"/>.
    /// </summary>
    private static bool IsValidVolumeNumber(string number)
    {
        return StringHelper.StringToInt(number) <= VolumeNumberMax;
    }

    /// <summary>
    /// Returns whether or not the number is a valid episode number.
    /// Only the leading run of digits is considered (e.g. "12v2" is checked as "12").
    /// </summary>
    private static bool IsValidEpisodeNumber(string number)
    {
        // Measure the leading numeric portion without allocating intermediate strings.
        int digitCount = 0;
        while (digitCount < number.Length && char.IsDigit(number[digitCount]))
        {
            digitCount++;
        }

        if (digitCount == 0)
        {
            return false;
        }

        // Parsed as double so that very long digit runs do not overflow. TryParse with the
        // invariant culture returns false instead of throwing for digits that char.IsDigit
        // accepts but the parser does not (e.g. full-width digits).
        return double.TryParse(
                   number.AsSpan(0, digitCount),
                   NumberStyles.None,
                   CultureInfo.InvariantCulture,
                   out double value)
               && value <= EpisodeNumberMax;
    }

    /// <summary>
    /// Sets the alternative episode number.
    /// </summary>
    private void SetAlternativeEpisodeNumber(string number, Token token)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementEpisodeNumberAlt, number));
        token.Category = Token.TokenCategory.Identifier;
    }

    /// <summary>
    /// Sets the volume number.
    /// </summary>
    /// <param name="number">the number</param>
    /// <param name="token">the token which contains the volume number</param>
    /// <param name="validate">true if we should check if it's a valid number, false to disable verification</param>
    /// <returns>true if the volume number was set</returns>
    public bool SetVolumeNumber(string number, Token token, bool validate)
    {
        if (validate && !IsValidVolumeNumber(number))
        {
            return false;
        }

        parser.Elements.Add(new Element(ElementCategory.ElementVolumeNumber, number));
        token.Category = Token.TokenCategory.Identifier;
        return true;
    }

    /// <summary>
    /// Sets the anime episode number.
    /// </summary>
    /// <param name="number">the episode number</param>
    /// <param name="token">the token which contains the episode number</param>
    /// <param name="validate">true if we should check if it's a valid episode number; false to disable validation</param>
    /// <returns>true if the episode number was set</returns>
    public bool SetEpisodeNumber(string number, Token token, bool validate)
    {
        if (validate && !IsValidEpisodeNumber(number))
        {
            return false;
        }

        token.Category = Token.TokenCategory.Identifier;

        var category = ElementCategory.ElementEpisodeNumber;

        // When episode keywords were found, compare against the first episode number
        // already recorded: the smaller value stays the episode number and the larger
        // one becomes the alternative number.
        if (parser.IsEpisodeKeywordsFound)
        {
            foreach (var element in parser.Elements)
            {
                if (element.Category != ElementCategory.ElementEpisodeNumber)
                {
                    continue;
                }

                int comparison = StringHelper.StringToInt(number) - StringHelper.StringToInt(element.Value);
                switch (comparison)
                {
                    case > 0:
                        category = ElementCategory.ElementEpisodeNumberAlt;
                        break;
                    case < 0:
                        element.Category = ElementCategory.ElementEpisodeNumberAlt;
                        break;
                    default:
                        // Same number as the one already recorded; nothing to add.
                        return false;
                }

                // Only the first existing episode number is compared.
                break;
            }
        }

        parser.Elements.Add(new Element(category, number));
        return true;
    }

    /// <summary>
    /// Checks if a number follows the specified token
    /// </summary>
    /// <param name="category">the category to set if a number follows the token</param>
    /// <param name="token">the token</param>
    /// <returns>true if a number follows the token; false otherwise</returns>
    private bool NumberComesAfterPrefix(ElementCategory category, Token token)
    {
        int numberBegin = ParserHelper.IndexOfFirstDigit(token.Content);
        string prefix = StringHelper.SubstringWithCheck(token.Content, 0, numberBegin).ToUpperInvariant();
        if (!KeywordManager.Contains(category, prefix))
        {
            return false;
        }

        string number = StringHelper.SubstringWithCheck(
            token.Content, numberBegin, token.Content.Length - numberBegin);

        switch (category)
        {
            case ElementCategory.ElementEpisodePrefix:
                if (!MatchEpisodePatterns(number, token))
                {
                    SetEpisodeNumber(number, token, false);
                }

                return true;
            case ElementCategory.ElementVolumePrefix:
                if (!MatchVolumePatterns(number, token))
                {
                    SetVolumeNumber(number, token, false);
                }

                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Checks whether the number is followed by a separator ("&amp;" or "of") and another number,
    /// e.g. "8 &amp; 10", "01 of 24".
    /// </summary>
    /// <param name="token">the token</param>
    /// <param name="currentTokenIdx">the index of the token</param>
    /// <returns>true if the token precedes a separator followed by another number</returns>
    private bool NumberComesBeforeAnotherNumber(Token token, int currentTokenIdx)
    {
        int separatorToken = Token.FindNextToken(parser.Tokens, currentTokenIdx, Token.TokenFlag.FlagNotDelimiter);
        if (!Token.InListRange(separatorToken, parser.Tokens))
        {
            return false;
        }

        // AddOther: whether the number after the separator is itself an episode number
        // ("8 & 10") or not ("01 of 24").
        var separators = new[]
        {
            (Text: "&", AddOther: true),
            (Text: "of", AddOther: false)
        };

        foreach (var separator in separators)
        {
            if (parser.Tokens[separatorToken].Content != separator.Text)
            {
                continue;
            }

            int otherToken = Token.FindNextToken(parser.Tokens, separatorToken, Token.TokenFlag.FlagNotDelimiter);
            if (!Token.InListRange(otherToken, parser.Tokens)
                || !StringHelper.IsNumericString(parser.Tokens[otherToken].Content))
            {
                continue;
            }

            SetEpisodeNumber(token.Content, token, false);
            if (separator.AddOther)
            {
                SetEpisodeNumber(parser.Tokens[otherToken].Content, parser.Tokens[otherToken], false);
            }

            parser.Tokens[separatorToken].Category = Token.TokenCategory.Identifier;
            parser.Tokens[otherToken].Category = Token.TokenCategory.Identifier;
            return true;
        }

        return false;
    }

    // EPISODE MATCHERS

    /// <summary>
    /// Attempts to find an episode/season inside a word
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the word was matched to an episode/season number</returns>
    public bool MatchEpisodePatterns(string word, Token token)
    {
        // All patterns contain at least one non-numeric character
        if (StringHelper.IsNumericString(word))
        {
            return false;
        }

        word = word.Trim(" -".ToCharArray());

        // A word made only of spaces/dashes trims to nothing; indexing it would throw.
        if (word.Length == 0)
        {
            return false;
        }

        bool numericFront = char.IsDigit(word[0]);
        bool numericBack = char.IsDigit(word[^1]);

        if (numericFront && numericBack)
        {
            // e.g. "01v2"
            if (MatchSingleEpisodePattern(word, token))
            {
                return true;
            }

            // e.g. "01-02", "03-05v2"
            if (MatchMultiEpisodePattern(word, token))
            {
                return true;
            }

            // e.g. "07.5"
            if (MatchFractionalEpisodePattern(word, token))
            {
                return true;
            }
        }

        if (numericBack)
        {
            // e.g. "2x01", "S01E03", "S01-02xE001-150"
            if (MatchSeasonAndEpisodePattern(word, token))
            {
                return true;
            }

            // e.g. "#01", "#02-03v2"
            if (MatchNumberSignPattern(word, token))
            {
                return true;
            }
        }

        if (!numericFront)
        {
            // e.g. "ED1", "OP4a", "OVA2"
            return MatchTypeAndEpisodePattern(word, token);
        }

        // e.g. "4a", "111C"
        if (!numericBack && MatchPartialEpisodePattern(word, token))
        {
            return true;
        }

        // U+8A71 is used as counter for stories, episodes of TV series, etc.
        return MatchJapaneseCounterPattern(word, token);
    }

    /// <summary>
    /// Match a single episode pattern. e.g. "01v2".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchSingleEpisodePattern(string word, Token token)
    {
        var match = SingleEpisodeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        SetEpisodeNumber(match.Groups[1].Value, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[2].Value));
        return true;
    }

    /// <summary>
    /// Match a multi episode pattern. e.g. "01-02", "03-05v2".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchMultiEpisodePattern(string word, Token token)
    {
        var match = MultiEpisodeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        string lowerBound = match.Groups[1].Value;
        string upperBound = match.Groups[3].Value;

        // Avoid matching expressions such as "009-1" or "5-2"
        if (StringHelper.StringToInt(lowerBound) >= StringHelper.StringToInt(upperBound))
        {
            return false;
        }

        if (!SetEpisodeNumber(lowerBound, token, true))
        {
            return false;
        }

        SetEpisodeNumber(upperBound, token, true);

        if (!string.IsNullOrEmpty(match.Groups[2].Value))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[2].Value));
        }

        if (!string.IsNullOrEmpty(match.Groups[4].Value))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[4].Value));
        }

        return true;
    }

    /// <summary>
    /// Match season and episode patterns. e.g. "2x01", "S01E03", "S01-02xE001-150".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchSeasonAndEpisodePattern(string word, Token token)
    {
        var match = SeasonEpisodeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, match.Groups[1].Value));
        if (!string.IsNullOrEmpty(match.Groups[2].Value))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, match.Groups[2].Value));
        }

        SetEpisodeNumber(match.Groups[3].Value, token, false);
        if (!string.IsNullOrEmpty(match.Groups[4].Value))
        {
            SetEpisodeNumber(match.Groups[4].Value, token, false);
        }

        return true;
    }

    /// <summary>
    /// Match type and episode. e.g. "ED1", "OP4a", "OVA2".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchTypeAndEpisodePattern(string word, Token token)
    {
        int numberBegin = ParserHelper.IndexOfFirstDigit(word);
        string prefix = StringHelper.SubstringWithCheck(word, 0, numberBegin);

        var category = ElementCategory.ElementAnimeType;
        var options = new KeywordOptions();

        if (!KeywordManager.FindAndSet(KeywordManager.Normalize(prefix), ref category, ref options))
        {
            return false;
        }

        parser.Elements.Add(new Element(ElementCategory.ElementAnimeType, prefix));

        string number = word.Substring(numberBegin);
        if (!MatchEpisodePatterns(number, token) && !SetEpisodeNumber(number, token, true))
        {
            return false;
        }

        int foundIdx = parser.Tokens.IndexOf(token);
        if (foundIdx == -1)
        {
            return true;
        }

        // Split the token: the existing token keeps the number and a new token holding
        // the prefix is inserted in front of it. Done last so the token is only altered
        // once the match has succeeded.
        token.Content = number;
        parser.Tokens.Insert(
            foundIdx,
            new Token(
                options.Identifiable ? Token.TokenCategory.Identifier : Token.TokenCategory.Unknown,
                prefix,
                token.Enclosed));

        return true;
    }

    /// <summary>
    /// Match fractional episodes. e.g. "07.5"
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchFractionalEpisodePattern(string word, Token token)
    {
        // A null/empty word can never match "\d+\.5", so it is rejected up front.
        return !string.IsNullOrEmpty(word)
               && FractionalEpisodeRegex().IsMatch(word)
               && SetEpisodeNumber(word, token, true);
    }

    /// <summary>
    /// Match partial episodes. e.g. "4a", "111C".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchPartialEpisodePattern(string word, Token token)
    {
        if (string.IsNullOrEmpty(word))
        {
            return false;
        }

        // Index of the first non-digit character (word.Length if there is none).
        int suffixIdx = 0;
        while (suffixIdx < word.Length && char.IsDigit(word[suffixIdx]))
        {
            suffixIdx++;
        }

        // The suffix must be exactly one character long and one of A-C / a-c.
        return suffixIdx == word.Length - 1
               && IsValidSuffix(word[suffixIdx])
               && SetEpisodeNumber(word, token, true);

        static bool IsValidSuffix(char c) => c is >= 'A' and <= 'C' or >= 'a' and <= 'c';
    }

    /// <summary>
    /// Match episodes with number signs. e.g. "#01", "#02-03v2"
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchNumberSignPattern(string word, Token token)
    {
        if (string.IsNullOrEmpty(word) || word[0] != '#')
        {
            return false;
        }

        var match = NumberEpisodeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        if (!SetEpisodeNumber(match.Groups[1].Value, token, true))
        {
            return false;
        }

        if (!string.IsNullOrEmpty(match.Groups[2].Value))
        {
            SetEpisodeNumber(match.Groups[2].Value, token, false);
        }

        if (!string.IsNullOrEmpty(match.Groups[3].Value))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[3].Value));
        }

        return true;
    }

    /// <summary>
    /// Match Japanese patterns. e.g. U+8A71 is used as counter for stories, episodes of TV series, etc.
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchJapaneseCounterPattern(string word, Token token)
    {
        if (string.IsNullOrEmpty(word) || word[^1] != '\u8A71')
        {
            return false;
        }

        var match = JapaneseEpisodeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        SetEpisodeNumber(match.Groups[1].Value, token, false);
        return true;
    }

    // VOLUME MATCHES

    /// <summary>
    /// Attempts to find a volume number inside a word
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the word was matched to a volume number</returns>
    public bool MatchVolumePatterns(string word, Token token)
    {
        // All patterns contain at least one non-numeric character
        if (StringHelper.IsNumericString(word))
        {
            return false;
        }

        word = word.Trim(" -".ToCharArray());

        // A word made only of spaces/dashes trims to nothing; indexing it would throw.
        if (word.Length == 0)
        {
            return false;
        }

        bool numericFront = char.IsDigit(word[0]);
        bool numericBack = char.IsDigit(word[^1]);

        // e.g. "01v2" / e.g. "01-02", "03-05v2"
        return numericFront
               && numericBack
               && (MatchSingleVolumePattern(word, token) || MatchMultiVolumePattern(word, token));
    }

    /// <summary>
    /// Match single volume. e.g. "01v2"
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchSingleVolumePattern(string word, Token token)
    {
        if (string.IsNullOrEmpty(word))
        {
            return false;
        }

        var match = SingleVolumeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        SetVolumeNumber(match.Groups[1].Value, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[2].Value));
        return true;
    }

    /// <summary>
    /// Match multi-volume. e.g. "01-02", "03-05v2".
    /// </summary>
    /// <param name="word">the word</param>
    /// <param name="token">the token</param>
    /// <returns>true if the token matched</returns>
    private bool MatchMultiVolumePattern(string word, Token token)
    {
        if (string.IsNullOrEmpty(word))
        {
            return false;
        }

        var match = MultiVolumeRegex().Match(word);
        if (!match.Success)
        {
            return false;
        }

        string lowerBound = match.Groups[1].Value;
        string upperBound = match.Groups[2].Value;

        // The range must be strictly ascending.
        if (StringHelper.StringToInt(lowerBound) >= StringHelper.StringToInt(upperBound))
        {
            return false;
        }

        if (!SetVolumeNumber(lowerBound, token, true))
        {
            return false;
        }

        SetVolumeNumber(upperBound, token, false);

        // Record the release version only when the optional "vN" suffix was captured.
        if (!string.IsNullOrEmpty(match.Groups[3].Value))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, match.Groups[3].Value));
        }

        return true;
    }

    // SEARCH

    /// <summary>
    /// Searches for isolated numbers in a list of tokens.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <returns>true if an isolated number was found</returns>
    public bool SearchForIsolatedNumbers(IEnumerable<int> tokens)
    {
        return tokens
            .Where(it => parser.Tokens[it].Enclosed && parser.ParseHelper.IsTokenIsolated(it))
            .Any(it => SetEpisodeNumber(parser.Tokens[it].Content, parser.Tokens[it], true));
    }

    /// <summary>
    /// Searches for separated numbers in a list of tokens.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <returns>true if a separated number was found</returns>
    public bool SearchForSeparatedNumbers(List<int> tokens)
    {
        foreach (int it in tokens)
        {
            int previousToken = Token.FindPrevToken(parser.Tokens, it, Token.TokenFlag.FlagNotDelimiter);

            // See if the number has a preceding "-" separator
            if (!parser.ParseHelper.IsTokenCategory(previousToken, Token.TokenCategory.Unknown)
                || !ParserHelper.IsDashCharacter(parser.Tokens[previousToken].Content[0]))
            {
                continue;
            }

            if (!SetEpisodeNumber(parser.Tokens[it].Content, parser.Tokens[it], true))
            {
                continue;
            }

            parser.Tokens[previousToken].Category = Token.TokenCategory.Identifier;
            return true;
        }

        return false;
    }

    /// <summary>
    /// Searches for episode patterns in a list of tokens.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <returns>true if an episode number was found</returns>
    public bool SearchForEpisodePatterns(List<int> tokens)
    {
        foreach (int it in tokens)
        {
            var current = parser.Tokens[it];
            bool numericFront = current.Content.Length > 0 && char.IsDigit(current.Content[0]);

            if (!numericFront)
            {
                // e.g. "EP.1", "Vol.1"
                if (NumberComesAfterPrefix(ElementCategory.ElementEpisodePrefix, current))
                {
                    return true;
                }

                // A volume was recorded; keep looking for an episode in the remaining tokens.
                if (NumberComesAfterPrefix(ElementCategory.ElementVolumePrefix, current))
                {
                    continue;
                }
            }
            else
            {
                // e.g. "8 & 10", "01 of 24"
                if (NumberComesBeforeAnotherNumber(current, it))
                {
                    return true;
                }
            }

            // Look for other patterns
            if (MatchEpisodePatterns(current.Content, current))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Searches for equivalent number in a list of tokens. e.g. 08(114)
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <returns>true if an equivalent number was found</returns>
    public bool SearchForEquivalentNumbers(List<int> tokens)
    {
        foreach (int it in tokens)
        {
            // The first number must NOT be isolated (it is directly followed by the
            // bracketed one) and must be a valid episode number.
            if (parser.ParseHelper.IsTokenIsolated(it) || !IsValidEpisodeNumber(parser.Tokens[it].Content))
            {
                continue;
            }

            // Find the first enclosed, non-delimiter token
            int nextToken = Token.FindNextToken(parser.Tokens, it, Token.TokenFlag.FlagNotDelimiter);
            if (!parser.ParseHelper.IsTokenCategory(nextToken, Token.TokenCategory.Bracket))
            {
                continue;
            }

            nextToken = Token.FindNextToken(
                parser.Tokens, nextToken, Token.TokenFlag.FlagEnclosed, Token.TokenFlag.FlagNotDelimiter);
            if (!parser.ParseHelper.IsTokenCategory(nextToken, Token.TokenCategory.Unknown))
            {
                continue;
            }

            // Check if it's an isolated number
            if (!parser.ParseHelper.IsTokenIsolated(nextToken)
                || !StringHelper.IsNumericString(parser.Tokens[nextToken].Content)
                || !IsValidEpisodeNumber(parser.Tokens[nextToken].Content))
            {
                continue;
            }

            // The smaller value is the episode number, the larger one the alternative.
            var list = new List<Token> { parser.Tokens[it], parser.Tokens[nextToken] };
            list.Sort((o1, o2) => StringHelper.StringToInt(o1.Content) - StringHelper.StringToInt(o2.Content));
            SetEpisodeNumber(list[0].Content, list[0], false);
            SetAlternativeEpisodeNumber(list[1].Content, list[1]);
            return true;
        }

        return false;
    }

    /// <summary>
    /// Searches for the last number token in a list of tokens
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <returns>true if the last number token was found</returns>
    public bool SearchForLastNumber(List<int> tokens)
    {
        for (int i = tokens.Count - 1; i >= 0; i--)
        {
            int it = tokens[i];

            // Assuming that episode number always comes after the title,
            // the first token cannot be what we're looking for
            if (it == 0)
            {
                continue;
            }

            if (parser.Tokens[it].Enclosed)
            {
                continue;
            }

            // Ignore if it's the first non-enclosed, non-delimiter token
            if (parser.Tokens.Take(it).All(r => r.Enclosed || r.Category == Token.TokenCategory.Delimiter))
            {
                continue;
            }

            // Numbers directly after "Movie" or "Part" are skipped.
            int previousToken = Token.FindPrevToken(parser.Tokens, it, Token.TokenFlag.FlagNotDelimiter);
            if (parser.ParseHelper.IsTokenCategory(previousToken, Token.TokenCategory.Unknown))
            {
                string previousContent = parser.Tokens[previousToken].Content;
                if (previousContent.Equals("Movie", StringComparison.InvariantCultureIgnoreCase)
                    || previousContent.Equals("Part", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }
            }

            // We'll use this number after all
            if (SetEpisodeNumber(parser.Tokens[it].Content, parser.Tokens[it], true))
            {
                return true;
            }
        }

        return false;
    }

    // e.g. "01v2": group 1 = episode, group 2 = release version
    [GeneratedRegex(@"\A(?:(\d{1,3})[vV](\d))\z")]
    private static partial Regex SingleEpisodeRegex();

    // e.g. "01-02", "03-05v2": groups 1/3 = bounds, groups 2/4 = release versions
    [GeneratedRegex(@"\A(?:(\d{1,3})(?:[vV](\d))?[-~&+](\d{1,3})(?:[vV](\d))?)\z")]
    private static partial Regex MultiEpisodeRegex();

    // e.g. "2x01", "S01E03", "S01-02xE001-150": groups 1/2 = seasons, groups 3/4 = episodes
    // NOTE(review): inside "[ ._-x]" the sequence "_-x" is a character RANGE ('_' through 'x'),
    // not the three literals '_', '-', 'x' - confirm whether that is intended before changing it.
    [GeneratedRegex(@"\A(?:S?(\d{1,2})(?:-S?(\d{1,2}))?(?:x|[ ._-x]?E)(\d{1,3})(?:-E?(\d{1,3}))?)\z")]
    private static partial Regex SeasonEpisodeRegex();

    // e.g. "07.5"
    [GeneratedRegex(RegexMatchOnlyStart + @"\d+\.5" + RegexMatchOnlyEnd)]
    private static partial Regex FractionalEpisodeRegex();

    // e.g. "#01", "#02-03v2": groups 1/2 = episodes, group 3 = release version
    [GeneratedRegex(@"\A(?:#(\d{1,3})(?:[-~&+](\d{1,3}))?(?:[vV](\d))?)\z")]
    private static partial Regex NumberEpisodeRegex();

    // Digits followed by the counter character U+8A71
    [GeneratedRegex(@"\A(?:(\d{1,3})話)\z")]
    private static partial Regex JapaneseEpisodeRegex();

    // e.g. "01v2": group 1 = volume, group 2 = release version
    [GeneratedRegex(@"\A(?:(\d{1,2})[vV](\d))\z")]
    private static partial Regex SingleVolumeRegex();

    // e.g. "01-02", "03-05v2": groups 1/2 = bounds, group 3 = release version
    [GeneratedRegex(@"\A(?:(\d{1,2})[-~&+](\d{1,2})(?:[vV](\d))?)\z")]
    private static partial Regex MultiVolumeRegex();
}