init: repo

This commit is contained in:
2024-07-04 15:27:19 +08:00
commit a90302f7ba
293 changed files with 18994 additions and 0 deletions

View File

@@ -0,0 +1,96 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// A library capable of parsing Anime filenames.
/// This code is a C++ to C# port of <see href="https://github.com/erengy/anitomy">Anitomy</see>,
/// using the already existing Java port <see href="https://github.com/Vorror/anitomyJ">AnitomyJ</see> as a reference.
/// </summary>
public class Anitomy
{
    // Only static entry points are exposed; instances are never created.
    private Anitomy() {}

    /// <summary>
    /// Parses an anime <paramref name="filename"/> into its constituent elements.
    /// </summary>
    /// <param name="filename">the anime file name</param>
    /// <param name="options">the options to parse with, use Parse(filename) to use default options</param>
    /// <returns>the list of parsed elements</returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null</exception>
    public static IEnumerable<Element> Parse(string filename, Options options)
    {
        ArgumentNullException.ThrowIfNull(options);

        List<Element> elements = [];
        List<Token> tokens = [];

        // Strip a recognized extension first so it never reaches the tokenizer.
        if (options.ParseFileExtension)
        {
            string extension = "";
            if (RemoveExtensionFromFilename(ref filename, ref extension))
            {
                elements.Add(new Element(ElementCategory.ElementFileExtension, extension));
            }
        }

        if (string.IsNullOrEmpty(filename))
        {
            return elements;
        }

        elements.Add(new Element(ElementCategory.ElementFileName, filename));

        // If tokenizing fails, return what has been collected so far.
        bool isTokenized = new Tokenizer(filename, elements, options, tokens).Tokenize();
        if (!isTokenized)
        {
            return elements;
        }

        new Parser(elements, options, tokens).Parse();
        return elements;
    }

    /// <summary>
    /// Parses an anime <paramref name="filename"/> into its constituent elements using default options.
    /// </summary>
    /// <param name="filename">the anime file name</param>
    /// <returns>the list of parsed elements</returns>
    public static IEnumerable<Element> Parse(string filename)
    {
        return Parse(filename, new Options());
    }

    /// <summary>
    /// Removes the extension from the <paramref name="filename"/>.
    /// Both refs are only written when the method returns true.
    /// </summary>
    /// <param name="filename">the ref that will be updated with the new filename</param>
    /// <param name="extension">the ref that will be updated with the file extension</param>
    /// <returns>if the extension was successfully separated from the filename</returns>
    private static bool RemoveExtensionFromFilename(ref string filename, ref string extension)
    {
        if (string.IsNullOrEmpty(filename))
        {
            return false;
        }

        int position = filename.LastIndexOf('.');
        if (position == -1)
        {
            return false;
        }

        // Candidate = text after the last dot; at most 4 alphanumeric characters.
        string candidate = filename[(position + 1)..];
        if (candidate.Length > 4 || !candidate.All(char.IsLetterOrDigit))
        {
            return false;
        }

        // Only extensions registered with the keyword manager are stripped.
        string keyword = KeywordManager.Normalize(candidate);
        if (!KeywordManager.Contains(ElementCategory.ElementFileExtension, keyword))
        {
            return false;
        }

        extension = candidate;
        filename = filename[..position];
        return true;
    }
}

View File

@@ -0,0 +1,9 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings shared by every source file in this project. -->
<PropertyGroup>
<!-- Compile against .NET 8. -->
<TargetFramework>net8.0</TargetFramework>
<!-- SDK-provided global usings are on, which is why the sources use List<T>, LINQ, etc. without using directives. -->
<ImplicitUsings>enable</ImplicitUsings>
<!-- Nullable reference type analysis is on for the whole project. -->
<Nullable>enable</Nullable>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,90 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// An <see cref="Element"/> represents an identified Anime <see cref="Token"/>.
/// A single filename may contain multiple of the same
/// token (e.g. <see cref="ElementCategory.ElementEpisodeNumber"/>).
/// </summary>
public class Element
{
    /// <summary>The category this element was classified as.</summary>
    public ElementCategory Category { get; set; }

    /// <summary>The text of the element.</summary>
    public string Value { get; }

    /// <summary>
    /// Constructs a new Element
    /// </summary>
    /// <param name="category">the category of the element</param>
    /// <param name="value">the element's value</param>
    public Element(ElementCategory category, string value)
    {
        Category = category;
        Value = value;
    }

    /// <summary>
    /// Hash derived from <see cref="Value"/> only. <see cref="Category"/> is settable, so it is
    /// left out to keep the hash stable while an element sits in a hashed collection. Equal
    /// elements always share a <see cref="Value"/>, so this stays consistent with <see cref="Equals"/>.
    /// </summary>
    public override int GetHashCode()
    {
        unchecked
        {
            return -1926371015 + Value.GetHashCode();
        }
    }

    /// <summary>
    /// Two elements are equal when they have the same runtime type, category and value.
    /// </summary>
    public override bool Equals(object? obj)
    {
        if (ReferenceEquals(this, obj))
        {
            return true;
        }

        if (obj is null || GetType() != obj.GetType())
        {
            return false;
        }

        var other = (Element) obj;
        // Value must take part in equality: the hash is built from it, and
        // equal objects are required to produce equal hash codes.
        return Category == other.Category && string.Equals(Value, other.Value, StringComparison.Ordinal);
    }

    public override string ToString()
    {
        return $"Element{{category={Category}, value='{Value}'}}";
    }
}
/// <summary>
/// The categories an <see cref="Element"/> can be classified as.
/// </summary>
public enum ElementCategory
{
// Season number; added by ParserHelper.CheckAndSetAnimeSeasonKeyword.
ElementAnimeSeason,
// Season keyword such as "SEASON" / "SAISON".
ElementAnimeSeasonPrefix,
// Built by Parser.SearchForAnimeTitle.
ElementAnimeTitle,
// Keywords such as "MOVIE", "OVA", "TV", "OP", "ED".
ElementAnimeType,
// Isolated number within ParserNumber.AnimeYearMin..AnimeYearMax.
ElementAnimeYear,
// Audio channel/codec/language keywords such as "AAC", "FLAC", "DUALAUDIO".
ElementAudioTerm,
// Device keywords such as "IPOD", "PS3", "ANDROID".
ElementDeviceCompatibility,
ElementEpisodeNumber,
ElementEpisodeNumberAlt,
// Keywords such as "EP", "EPISODE", "FOLGE".
ElementEpisodePrefix,
// Built by Parser.SearchForEpisodeTitle.
ElementEpisodeTitle,
// 8-character hexadecimal token (see ParserHelper.IsCrc32).
ElementFileChecksum,
// Recognized extension stripped from the filename by Anitomy.Parse.
ElementFileExtension,
// The filename handed to the tokenizer (after any extension was stripped).
ElementFileName,
// Keywords such as "ENG", "JAP", "VOSTFR".
ElementLanguage,
// Miscellaneous keywords such as "UNCENSORED", "WIDESCREEN".
ElementOther,
ElementReleaseGroup,
// Keywords such as "BATCH", "COMPLETE", "END".
ElementReleaseInformation,
// Keywords "V0".."V4"; Parser.SearchForKeywords stores the value without the leading letter.
ElementReleaseVersion,
// Keywords such as "BD", "DVD", "HDTV", "WEBRIP".
ElementSource,
// Keywords such as "SUB", "DUB", "HARDSUB", "RAW".
ElementSubtitles,
// e.g. "720p", "1280x720", or an isolated 480/720/1080.
ElementVideoResolution,
// Frame rate / codec / format / quality keywords such as "H264", "10BIT", "HD".
ElementVideoTerm,
ElementVolumeNumber,
// Keywords "VOL", "VOL.", "VOLUME".
ElementVolumePrefix,
// "Not classified"; also passed to KeywordManager.FindAndSet to accept a keyword of any category.
ElementUnknown
}

View File

@@ -0,0 +1,279 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// A class to manage the list of known anime keywords. This class is analogous to <code>keyword.cpp</code> of Anitomy, and <code>KeywordManager.java</code> of AnitomyJ
/// </summary>
public static class KeywordManager
{
    // All keywords except file extensions, keyed by their normalized (upper-case) text.
    private static readonly Dictionary<string, Keyword> s_keys = new();
    // File-extension keywords live in their own table so they can overlap with s_keys (e.g. "TS", "AVI").
    private static readonly Dictionary<string, Keyword> s_extensions = new();
    // Case-sensitive literals that PeekAndAdd looks for before regular tokenizing.
    private static readonly List<Tuple<ElementCategory, List<string>>> s_peekEntries;

    static KeywordManager()
    {
        // KeywordOptions arguments are (identifiable, searchable, valid).
        var optionsDefault = new KeywordOptions();
        var optionsInvalid = new KeywordOptions(true, true, false);
        var optionsUnidentifiable = new KeywordOptions(false, true, true);
        var optionsUnidentifiableInvalid = new KeywordOptions(false, true, false);
        var optionsUnidentifiableUnsearchable = new KeywordOptions(false, false, true);

        Add(ElementCategory.ElementAnimeSeasonPrefix,
            optionsUnidentifiable,
            ["SAISON", "SEASON"]);

        Add(ElementCategory.ElementAnimeType,
            optionsUnidentifiable,
            ["GEKIJOUBAN", "MOVIE", "OAD", "OAV", "ONA", "OVA", "SPECIAL", "SPECIALS", "TV"]);
        Add(ElementCategory.ElementAnimeType,
            optionsUnidentifiableUnsearchable,
            ["SP"]); // e.g. "Yumeiro Patissiere SP Professional"
        Add(ElementCategory.ElementAnimeType,
            optionsUnidentifiableInvalid,
            ["ED", "ENDING", "NCED", "NCOP", "OP", "OPENING", "PREVIEW", "PV"]);

        Add(ElementCategory.ElementAudioTerm,
            optionsDefault,
            [
                // Audio channels
                "2.0CH", "2CH", "5.1", "5.1CH", "DTS", "DTS-ES", "DTS5.1",
                "TRUEHD5.1",
                // Audio codec
                "AAC", "AACX2", "AACX3", "AACX4", "AC3", "EAC3", "E-AC-3",
                "FLAC", "FLACX2", "FLACX3", "FLACX4", "LOSSLESS", "MP3", "OGG", "VORBIS",
                // Audio language
                "DUALAUDIO", "DUAL AUDIO"
            ]);

        Add(ElementCategory.ElementDeviceCompatibility,
            optionsDefault,
            ["IPAD3", "IPHONE5", "IPOD", "PS3", "XBOX", "XBOX360"]);
        Add(ElementCategory.ElementDeviceCompatibility,
            optionsUnidentifiable,
            ["ANDROID"]);

        Add(ElementCategory.ElementEpisodePrefix,
            optionsDefault,
            ["EP", "EP.", "EPS", "EPS.", "EPISODE", "EPISODE.", "EPISODES", "CAPITULO", "EPISODIO", "FOLGE"]);
        // Single-letter episode keywords are not valid tokens.
        // "\u7B2C" is the single character U+7B2C; the previous literal "\\x7B2C" was the
        // six-character text \x7B2C and therefore could never match a token.
        Add(ElementCategory.ElementEpisodePrefix,
            optionsInvalid,
            ["E", "\u7B2C"]);

        Add(ElementCategory.ElementFileExtension,
            optionsDefault,
            ["3GP", "AVI", "DIVX", "FLV", "M2TS", "MKV", "MOV", "MP4", "MPG", "OGM", "RM", "RMVB", "TS", "WEBM", "WMV"]);
        Add(ElementCategory.ElementFileExtension,
            optionsInvalid,
            ["AAC", "AIFF", "FLAC", "M4A", "MP3", "MKA", "OGG", "WAV", "WMA", "7Z", "RAR", "ZIP", "ASS", "SRT"]);

        // "ESPANOL" added; the truncated "ESPANO" is kept so existing matches do not change.
        Add(ElementCategory.ElementLanguage,
            optionsDefault,
            ["ENG", "ENGLISH", "ESPANO", "ESPANOL", "JAP", "PT-BR", "SPANISH", "VOSTFR"]);
        Add(ElementCategory.ElementLanguage,
            optionsUnidentifiable,
            ["ESP", "ITA"]); // e.g. "Tokyo ESP", "Bokura ga Ita"

        Add(ElementCategory.ElementOther,
            optionsDefault,
            ["REMASTER", "REMASTERED", "UNCENSORED", "UNCUT", "TS", "VFR", "WIDESCREEN", "WS"]);

        Add(ElementCategory.ElementReleaseGroup,
            optionsDefault,
            ["THORA"]);

        Add(ElementCategory.ElementReleaseInformation,
            optionsDefault,
            ["BATCH", "COMPLETE", "PATCH", "REMUX"]);
        Add(ElementCategory.ElementReleaseInformation,
            optionsUnidentifiable,
            ["END", "FINAL"]); // e.g. "The End of Evangelion", "Final Approach"

        Add(ElementCategory.ElementReleaseVersion,
            optionsDefault,
            ["V0", "V1", "V2", "V3", "V4"]);

        Add(ElementCategory.ElementSource,
            optionsDefault,
            ["BD", "BDRIP", "BLURAY", "BLU-RAY", "DVD", "DVD5", "DVD9", "DVD-R2J", "DVDRIP", "DVD-RIP", "R2DVD", "R2J", "R2JDVD", "R2JDVDRIP", "HDTV", "HDTVRIP", "TVRIP", "TV-RIP", "WEBCAST", "WEBRIP"]);

        Add(ElementCategory.ElementSubtitles,
            optionsDefault,
            ["ASS", "BIG5", "DUB", "DUBBED", "HARDSUB", "HARDSUBS", "RAW", "SOFTSUB", "SOFTSUBS", "SUB", "SUBBED", "SUBTITLED"]);

        Add(ElementCategory.ElementVideoTerm,
            optionsDefault,
            [
                // Frame rate
                "23.976FPS", "24FPS", "29.97FPS", "30FPS", "60FPS", "120FPS",
                // Video codec
                "8BIT", "8-BIT", "10BIT", "10BITS", "10-BIT", "10-BITS",
                "HI10", "HI10P", "HI444", "HI444P", "HI444PP",
                "H264", "H265", "H.264", "H.265", "X264", "X265", "X.264",
                "AVC", "HEVC", "HEVC2", "DIVX", "DIVX5", "DIVX6", "XVID",
                // Video format
                "AVI", "RMVB", "WMV", "WMV3", "WMV9",
                // Video quality
                "HQ", "LQ",
                // Video resolution
                "HD", "SD"
            ]);

        Add(ElementCategory.ElementVolumePrefix,
            optionsDefault,
            ["VOL", "VOL.", "VOLUME"]);

        s_peekEntries = new List<Tuple<ElementCategory, List<string>>>
        {
            Tuple.Create(ElementCategory.ElementAudioTerm, new List<string> { "Dual Audio" }),
            Tuple.Create(ElementCategory.ElementVideoTerm, new List<string> { "H264", "H.264", "h264", "h.264" }),
            Tuple.Create(ElementCategory.ElementVideoResolution, new List<string> { "480p", "720p", "1080p" }),
            Tuple.Create(ElementCategory.ElementSource, new List<string> { "Blu-Ray" })
        };
    }

    /// <summary>
    /// Returns <paramref name="word"/> upper-cased with the invariant culture; null or empty input is returned unchanged.
    /// </summary>
    public static string Normalize(string word)
    {
        return string.IsNullOrEmpty(word) ? word : word.ToUpperInvariant();
    }

    /// <summary>
    /// Returns whether <paramref name="keyword"/> is registered under exactly <paramref name="category"/>.
    /// </summary>
    /// <param name="category">the category to look in</param>
    /// <param name="keyword">the keyword to look up; the lookup is case-sensitive, so callers pass normalized text</param>
    public static bool Contains(ElementCategory category, string keyword)
    {
        var keys = GetKeywordContainer(category);
        return keys.TryGetValue(keyword, out var foundEntry) && foundEntry.Category == category;
    }

    /// <summary>
    /// Finds a particular <code>keyword</code>. If found sets <code>category</code> and <code>options</code> to the found search result.
    /// </summary>
    /// <param name="keyword">the keyword to search for</param>
    /// <param name="category">the reference that will be set/changed to the found keyword category</param>
    /// <param name="options">the reference that will be set/changed to the found keyword options</param>
    /// <returns>if the keyword was found</returns>
    public static bool FindAndSet(string keyword, ref ElementCategory category, ref KeywordOptions options)
    {
        Dictionary<string, Keyword> keys = GetKeywordContainer(category);
        if (!keys.TryGetValue(keyword, out var foundEntry))
        {
            return false;
        }

        if (category == ElementCategory.ElementUnknown)
        {
            // Caller accepts any category: report the one that matched.
            category = foundEntry.Category;
        }
        else if (foundEntry.Category != category)
        {
            // Caller asked for a specific category and the keyword belongs to another one.
            return false;
        }

        options = foundEntry.Options;
        return true;
    }

    /// <summary>
    /// Given a particular <code>filename</code> and <code>range</code> attempt to preidentify the token before we attempt the main parsing logic
    /// </summary>
    /// <remarks>
    /// Matching is ordinal and case-sensitive, and only the first occurrence of each keyword
    /// inside the range is reported.
    /// </remarks>
    /// <param name="filename">the filename</param>
    /// <param name="range">the search range</param>
    /// <param name="elements">elements array that any pre-identified elements will be added to</param>
    /// <param name="preIdentifiedTokens">elements array that any pre-identified token ranges will be added to</param>
    public static void PeekAndAdd(string filename, TokenRange range, List<Element> elements, List<TokenRange> preIdentifiedTokens)
    {
        // Clamp the end of the range to the end of the filename.
        int end = Math.Min(range.Offset + range.Size, filename.Length);
        string search = filename.Substring(range.Offset, end - range.Offset);

        foreach (var entry in s_peekEntries)
        {
            foreach (string keyword in entry.Item2)
            {
                // Ordinal: these are technical literals, and the recorded range length
                // (keyword.Length) is only correct for an exact character-by-character match.
                int foundIdx = search.IndexOf(keyword, StringComparison.Ordinal);
                if (foundIdx == -1) continue;

                foundIdx += range.Offset; // convert to an offset into the full filename
                elements.Add(new Element(entry.Item1, keyword));
                preIdentifiedTokens.Add(new TokenRange(foundIdx, keyword.Length));
            }
        }
    }

    // Private API

    /// <summary>Returns the appropriate keyword container.</summary>
    private static Dictionary<string, Keyword> GetKeywordContainer(ElementCategory category)
    {
        return category == ElementCategory.ElementFileExtension ? s_extensions : s_keys;
    }

    /// <summary>
    /// Adds a <code>category</code>, <code>options</code>, and <code>keywords</code> to the internal keywords list.
    /// Null/empty keywords are skipped, and a keyword that is already registered keeps its first registration.
    /// </summary>
    private static void Add(ElementCategory category, KeywordOptions options, IEnumerable<string> keywords)
    {
        var keys = GetKeywordContainer(category);
        var entry = new Keyword(category, options);
        foreach (string key in keywords)
        {
            if (string.IsNullOrEmpty(key)) continue;
            keys.TryAdd(key, entry);
        }
    }
}
/// <summary>
/// Immutable flags attached to a keyword.
/// </summary>
/// <param name="identifiable">if the token is identifiable</param>
/// <param name="searchable">if the token is searchable</param>
/// <param name="valid">if the token is valid</param>
public class KeywordOptions(bool identifiable, bool searchable, bool valid)
{
    /// <summary>
    /// Constructs keyword options with every flag set to true.
    /// </summary>
    public KeywordOptions() : this(true, true, true) {}

    public bool Identifiable { get; } = identifiable;

    public bool Searchable { get; } = searchable;

    public bool Valid { get; } = valid;
}
/// <summary>
/// Pairs an element category with the options that apply to one keyword.
/// </summary>
public struct Keyword
{
    public readonly ElementCategory Category;
    public readonly KeywordOptions Options;

    /// <summary>
    /// Constructs a new Keyword
    /// </summary>
    /// <param name="category">the category of the keyword</param>
    /// <param name="options">the keyword's options</param>
    public Keyword(ElementCategory category, KeywordOptions options)
        => (Category, Options) = (category, options);
}

View File

@@ -0,0 +1,28 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// Read-only configuration for an AnitomySharp parse run.
/// </summary>
public class Options
{
    /// <summary>
    /// Creates a set of options; every argument is optional.
    /// </summary>
    /// <param name="delimiters">value for <see cref="AllowedDelimiters"/></param>
    /// <param name="episode">value for <see cref="ParseEpisodeNumber"/></param>
    /// <param name="title">value for <see cref="ParseEpisodeTitle"/></param>
    /// <param name="extension">value for <see cref="ParseFileExtension"/></param>
    /// <param name="group">value for <see cref="ParseReleaseGroup"/></param>
    public Options(
        string delimiters = " _.&+,|",
        bool episode = true,
        bool title = true,
        bool extension = true,
        bool group = true)
    {
        AllowedDelimiters = delimiters;
        ParseEpisodeNumber = episode;
        ParseEpisodeTitle = title;
        ParseFileExtension = extension;
        ParseReleaseGroup = group;
    }

    public string AllowedDelimiters { get; }

    public bool ParseEpisodeNumber { get; }

    public bool ParseEpisodeTitle { get; }

    public bool ParseFileExtension { get; }

    public bool ParseReleaseGroup { get; }
}

View File

@@ -0,0 +1,430 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// Class to classify <see cref="Token"/>s
/// </summary>
public class Parser
{
    /// <summary>Set by the episode-number search: true when an episode number element already existed when that search started.</summary>
    public bool IsEpisodeKeywordsFound { get; private set; }
    public ParserHelper ParseHelper { get; }
    public ParserNumber ParseNumber { get; }
    public List<Element> Elements { get; }
    public List<Token> Tokens { get; }
    private Options Options { get; }

    /// <summary>
    /// Constructs a new token parser
    /// </summary>
    /// <param name="elements">the list where parsed elements will be added</param>
    /// <param name="options">the parser options</param>
    /// <param name="tokens">the list of tokens</param>
    public Parser(List<Element> elements, Options options, List<Token> tokens)
    {
        Elements = elements;
        Options = options;
        Tokens = tokens;
        ParseHelper = new ParserHelper(this);
        ParseNumber = new ParserNumber(this);
    }

    /// <summary>
    /// Begins the parsing process. Results are appended to <see cref="Elements"/>.
    /// </summary>
    /// <returns>
    /// true when NO <see cref="ElementCategory.ElementAnimeTitle"/> element exists after parsing.
    /// NOTE(review): this reads as inverted for a success flag; confirm the intended meaning
    /// before relying on the return value.
    /// </returns>
    public bool Parse()
    {
        SearchForKeywords();
        SearchForIsolatedNumbers();

        if (Options.ParseEpisodeNumber)
        {
            SearchForEpisodeNumber();
        }

        SearchForAnimeTitle();

        if (Options.ParseReleaseGroup && Empty(ElementCategory.ElementReleaseGroup))
        {
            SearchForReleaseGroup();
        }

        if (Options.ParseEpisodeTitle && !Empty(ElementCategory.ElementEpisodeNumber))
        {
            SearchForEpisodeTitle();
        }

        ValidateElements();
        return Empty(ElementCategory.ElementAnimeTitle);
    }

    /// <summary>
    /// Search for anime keywords.
    /// </summary>
    private void SearchForKeywords()
    {
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category != Token.TokenCategory.Unknown) continue;

            string word = token.Content;
            word = word.Trim(' ', '-');
            if (string.IsNullOrEmpty(word)) continue;

            // Don't bother if the word is a number that cannot be CRC
            if (word.Length != 8 && StringHelper.IsNumericString(word)) continue;

            string keyword = KeywordManager.Normalize(word);
            var category = ElementCategory.ElementUnknown;
            var options = new KeywordOptions();

            if (KeywordManager.FindAndSet(keyword, ref category, ref options))
            {
                if (!Options.ParseReleaseGroup && category == ElementCategory.ElementReleaseGroup) continue;
                if (!ParserHelper.IsElementCategorySearchable(category) || !options.Searchable) continue;
                if (ParserHelper.IsElementCategorySingular(category) && !Empty(category)) continue;

                switch (category)
                {
                    case ElementCategory.ElementAnimeSeasonPrefix:
                        ParseHelper.CheckAndSetAnimeSeasonKeyword(token, i);
                        continue;
                    // An episode prefix whose options are not valid does not match this case;
                    // it falls out of the switch and is recorded as a plain element below.
                    case ElementCategory.ElementEpisodePrefix when options.Valid:
                        ParseHelper.CheckExtentKeyword(ElementCategory.ElementEpisodeNumber, i, token);
                        continue;
                    case ElementCategory.ElementReleaseVersion:
                        word = word[1..]; // number without the leading letter
                        break;
                    case ElementCategory.ElementVolumePrefix:
                        ParseHelper.CheckExtentKeyword(ElementCategory.ElementVolumeNumber, i, token);
                        continue;
                }
            }
            else
            {
                // Not a known keyword: it may still be a checksum or a resolution.
                if (Empty(ElementCategory.ElementFileChecksum) && ParserHelper.IsCrc32(word))
                {
                    category = ElementCategory.ElementFileChecksum;
                }
                else if (Empty(ElementCategory.ElementVideoResolution) && ParserHelper.IsResolution(word))
                {
                    category = ElementCategory.ElementVideoResolution;
                }
            }

            if (category == ElementCategory.ElementUnknown) continue;

            Elements.Add(new Element(category, word));
            if (options.Identifiable)
            {
                token.Category = Token.TokenCategory.Identifier;
            }
        }
    }

    /// <summary>
    /// Search for episode number.
    /// </summary>
    private void SearchForEpisodeNumber()
    {
        // List all unknown tokens that contain a number
        var tokens = new List<int>();
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category == Token.TokenCategory.Unknown &&
                ParserHelper.IndexOfFirstDigit(token.Content) != -1)
            {
                tokens.Add(i);
            }
        }

        if (tokens.Count == 0) return;

        IsEpisodeKeywordsFound = !Empty(ElementCategory.ElementEpisodeNumber);

        // If a token matches a known episode pattern, it has to be the episode number
        if (ParseNumber.SearchForEpisodePatterns(tokens)) return;

        // We have previously found an episode number via keywords
        if (!Empty(ElementCategory.ElementEpisodeNumber)) return;

        // From now on, we're only interested in numeric tokens
        tokens.RemoveAll(r => !StringHelper.IsNumericString(Tokens[r].Content));

        // e.g. "01 (176)", "29 (04)"
        if (ParseNumber.SearchForEquivalentNumbers(tokens)) return;

        // e.g. " - 08"
        if (ParseNumber.SearchForSeparatedNumbers(tokens)) return;

        // e.g. "[12]", "(2006)"
        if (ParseNumber.SearchForIsolatedNumbers(tokens)) return;

        // Consider using the last number as a last resort
        ParseNumber.SearchForLastNumber(tokens);
    }

    /// <summary>
    /// Search for anime title
    /// </summary>
    private void SearchForAnimeTitle()
    {
        bool enclosedTitle = false;

        // Find the first non-enclosed unknown token
        int tokenBegin = Token.FindToken(Tokens, 0, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
            Token.TokenFlag.FlagUnknown);

        // If that doesn't work, find the first unknown token in the second enclosed
        // group, assuming that the first one is the release group
        if (!Token.InListRange(tokenBegin, Tokens))
        {
            tokenBegin = 0;
            enclosedTitle = true;
            bool skippedPreviousGroup = false;

            do
            {
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
                if (!Token.InListRange(tokenBegin, Tokens)) break;

                // Ignore groups that are composed of non-Latin characters
                if (StringHelper.IsMostlyLatinString(Tokens[tokenBegin].Content) && skippedPreviousGroup)
                {
                    break;
                }

                // Get the first unknown token of the next group
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket);
                tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown);
                skippedPreviousGroup = true;
            } while (Token.InListRange(tokenBegin, Tokens));
        }

        if (!Token.InListRange(tokenBegin, Tokens)) return;

        // Continue until an identifier (or a bracket, if the title is enclosed) is found
        int tokenEnd = Token.FindToken(
            Tokens,
            tokenBegin,
            Tokens.Count,
            Token.TokenFlag.FlagIdentifier,
            enclosedTitle ? Token.TokenFlag.FlagBracket : Token.TokenFlag.FlagNone);

        // If within the interval there's an open bracket without its matching pair,
        // move the upper endpoint back to the bracket
        if (!enclosedTitle)
        {
            int lastBracket = tokenEnd;
            bool bracketOpen = false;
            for (int i = tokenBegin; i < tokenEnd; i++)
            {
                if (Tokens[i].Category != Token.TokenCategory.Bracket) continue;
                lastBracket = i;
                bracketOpen = !bracketOpen;
            }

            if (bracketOpen) tokenEnd = lastBracket;
        }

        // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"),
        // move the upper endpoint back to the beginning of the group. We ignore
        // parentheses in order to keep certain groups (e.g. "(TV)") intact.
        if (!enclosedTitle)
        {
            int token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);
            while (ParseHelper.IsTokenCategory(token, Token.TokenCategory.Bracket) && Tokens[token].Content[0] != ')')
            {
                token = Token.FindPrevToken(Tokens, token, Token.TokenFlag.FlagBracket);
                // An out-of-range index makes IsTokenCategory fail, which ends the loop.
                if (!Token.InListRange(token, Tokens)) continue;
                tokenEnd = token;
                token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter);
            }
        }

        ParseHelper.BuildElement(ElementCategory.ElementAnimeTitle, false,
            Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
    }

    /// <summary>
    /// Search for release group
    /// </summary>
    private void SearchForReleaseGroup()
    {
        for (int tokenBegin = 0, tokenEnd = tokenBegin; tokenBegin < Tokens.Count;)
        {
            // Find the first enclosed unknown token
            tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagEnclosed,
                Token.TokenFlag.FlagUnknown);
            if (!Token.InListRange(tokenBegin, Tokens)) return;

            // Continue until a bracket or identifier is found
            tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket,
                Token.TokenFlag.FlagIdentifier);
            if (!Token.InListRange(tokenEnd, Tokens) ||
                Tokens[tokenEnd].Category != Token.TokenCategory.Bracket) continue;

            // Ignore if it's not the first non-delimiter token in group
            int prevToken = Token.FindPrevToken(Tokens, tokenBegin, Token.TokenFlag.FlagNotDelimiter);
            if (Token.InListRange(prevToken, Tokens) &&
                Tokens[prevToken].Category != Token.TokenCategory.Bracket) continue;

            ParseHelper.BuildElement(ElementCategory.ElementReleaseGroup, true,
                Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
            return;
        }
    }

    /// <summary>
    /// Search for episode title
    /// </summary>
    private void SearchForEpisodeTitle()
    {
        int tokenBegin;
        int tokenEnd = 0;

        do
        {
            // Find the first non-enclosed unknown token
            tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
                Token.TokenFlag.FlagUnknown);
            if (!Token.InListRange(tokenBegin, Tokens)) return;

            // Continue until a bracket or identifier is found
            tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket,
                Token.TokenFlag.FlagIdentifier);

            // Ignore if it's only a dash
            if (tokenEnd - tokenBegin <= 2 && ParserHelper.IsDashCharacter(Tokens[tokenBegin].Content[0])) continue;

            ParseHelper.BuildElement(ElementCategory.ElementEpisodeTitle, false,
                Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin));
            return;
        } while (Token.InListRange(tokenBegin, Tokens));
    }

    /// <summary>
    /// Search for isolated numbers
    /// </summary>
    private void SearchForIsolatedNumbers()
    {
        for (int i = 0; i < Tokens.Count; i++)
        {
            var token = Tokens[i];
            if (token.Category != Token.TokenCategory.Unknown || !StringHelper.IsNumericString(token.Content) ||
                !ParseHelper.IsTokenIsolated(i))
            {
                continue;
            }

            int number = StringHelper.StringToInt(token.Content);

            // Anime year
            if (number >= ParserNumber.AnimeYearMin && number <= ParserNumber.AnimeYearMax)
            {
                if (Empty(ElementCategory.ElementAnimeYear))
                {
                    Elements.Add(new Element(ElementCategory.ElementAnimeYear, token.Content));
                    token.Category = Token.TokenCategory.Identifier;
                    continue;
                }
            }

            // Video resolution
            if (number != 480 && number != 720 && number != 1080) continue;

            // If these numbers are isolated, it's more likely for them to be the
            // video resolution rather than the episode number. Some fansub groups use these without the "p" suffix.
            if (!Empty(ElementCategory.ElementVideoResolution)) continue;

            Elements.Add(new Element(ElementCategory.ElementVideoResolution, token.Content));
            token.Category = Token.TokenCategory.Identifier;
        }
    }

    /// <summary>
    /// Validate Elements: resolves clashes between anime-type elements and the episode title.
    /// If an anime type's text IS the whole episode title, the episode title is dropped.
    /// If it is only part of the title and is a registered anime-type keyword, that anime-type element is dropped.
    /// </summary>
    private void ValidateElements()
    {
        if (Empty(ElementCategory.ElementAnimeType) || Empty(ElementCategory.ElementEpisodeTitle))
        {
            return;
        }

        string episodeTitle = Get(ElementCategory.ElementEpisodeTitle);

        for (int i = 0; i < Elements.Count;)
        {
            var el = Elements[i];

            if (el.Category == ElementCategory.ElementAnimeType && episodeTitle.Contains(el.Value))
            {
                if (episodeTitle.Length == el.Value.Length)
                {
                    // invalid episode title
                    // Removing entries that sit before index i shifts the current element
                    // left, so i is adjusted to keep pointing at it.
                    int removedBefore = 0;
                    for (int j = 0; j < i; j++)
                    {
                        if (Elements[j].Category == ElementCategory.ElementEpisodeTitle) removedBefore++;
                    }

                    Elements.RemoveAll(element =>
                        element.Category == ElementCategory.ElementEpisodeTitle);
                    i -= removedBefore;
                }
                else
                {
                    string keyword = KeywordManager.Normalize(el.Value);
                    if (KeywordManager.Contains(ElementCategory.ElementAnimeType, keyword))
                    {
                        // invalid anime type
                        // Remove this exact element (not merely the first one of the same
                        // category); index i now refers to the next element.
                        Elements.RemoveAt(i);
                        continue;
                    }
                }
            }

            ++i;
        }
    }

    /// <summary>
    /// Returns true when <see cref="Elements"/> holds NO element of the given category.
    /// </summary>
    /// <param name="category">the category to look for</param>
    /// <returns>true if no element has this category; false if at least one does</returns>
    private bool Empty(ElementCategory category)
    {
        return Elements.All(element => element.Category != category);
    }

    /// <summary>
    /// Returns the value of the first element of a particular category.
    /// </summary>
    /// <param name="category">the category to look for</param>
    /// <returns>the element's value, or an empty string when no such element exists</returns>
    private string Get(ElementCategory category)
    {
        // A lookup must not modify the result list, so nothing is added when the category is absent.
        return Elements.Find(element => element.Category == category)?.Value ?? string.Empty;
    }
}

View File

@@ -0,0 +1,281 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
using System.Text;
namespace AnitomySharp;
/// <summary>
/// Utility class to assist in the parsing.
/// </summary>
public class ParserHelper(Parser parser)
{
// Characters IsResolution accepts between width and height: 'x', 'X' and U+00D7.
private static readonly System.Buffers.SearchValues<char> s_myChars = System.Buffers.SearchValues.Create("xX\u00D7");
// '-' plus U+2010..U+2015; the set IsDashCharacter tests against.
private const string Dashes = "-\u2010\u2011\u2012\u2013\u2014\u2015";
// Same characters as Dashes with a leading space.
// NOTE(review): not referenced in the part of the file reviewed here — presumably used further down; verify.
private const string DashesWithSpace = " -\u2010\u2011\u2012\u2013\u2014\u2015";
// Maps an ordinal ("1st", "First", ...) to its digit string; read by GetNumberFromOrdinal.
// Keys use the dictionary's default string comparer, so lookups are case-sensitive.
private static readonly Dictionary<string, string> s_ordinals = new()
{
{"1st", "1"}, {"First", "1"},
{"2nd", "2"}, {"Second", "2"},
{"3rd", "3"}, {"Third", "3"},
{"4th", "4"}, {"Fourth", "4"},
{"5th", "5"}, {"Fifth", "5"},
{"6th", "6"}, {"Sixth", "6"},
{"7th", "7"}, {"Seventh", "7"},
{"8th", "8"}, {"Eighth", "8"},
{"9th", "9"}, {"Ninth", "9"}
};
/// <summary>
/// Checks that <code>result</code> is a valid index into the parser's token list and that
/// the token stored there has the given <code>category</code>.
/// </summary>
public bool IsTokenCategory(int result, Token.TokenCategory category)
{
    if (!Token.InListRange(result, parser.Tokens))
    {
        return false;
    }

    return parser.Tokens[result].Category == category;
}
/// <summary>
/// Tells whether <code>str</code> looks like a CRC32 checksum: exactly eight characters, all hexadecimal.
/// </summary>
public static bool IsCrc32(string str)
{
    if (str.Length != 8)
    {
        return false;
    }

    return StringHelper.IsHexadecimalString(str);
}
/// <summary>
/// Tells whether <code>c</code> is one of the characters listed in <code>Dashes</code>.
/// </summary>
public static bool IsDashCharacter(char c)
{
    return Dashes.IndexOf(c) >= 0;
}
/// <summary>
/// Converts an ordinal (e.g. "2nd", "Second") to its digit string.
/// Returns an empty string for null/empty input or an ordinal that is not in the table.
/// </summary>
private static string GetNumberFromOrdinal(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    return s_ordinals.TryGetValue(str, out string? number) ? number : "";
}
/// <summary>
/// Finds the position of the first digit character in <code>str</code>.
/// Returns -1 when <code>str</code> is null, empty, or contains no digit.
/// </summary>
public static int IndexOfFirstDigit(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return -1;
    }

    int index = 0;
    foreach (char ch in str)
    {
        if (char.IsDigit(ch))
        {
            return index;
        }

        index++;
    }

    return -1;
}
/// <summary>
/// Tells whether <code>str</code> is a resolution in one of two shapes:
/// digits, one separator ('x', 'X' or U+00D7), digits — with at least three characters on each side;
/// or three to five digits followed by 'p' / 'P'.
/// </summary>
public static bool IsResolution(string str)
{
    const int minWidthSize = 3;
    const int minHeightSize = 3;

    // Anything shorter than "###p" cannot be a resolution.
    if (string.IsNullOrEmpty(str) || str.Length < minHeightSize + 1)
    {
        return false;
    }

    if (str.Length >= minWidthSize + 1 + minHeightSize)
    {
        // Long enough for the "###x###" shape: locate the first separator.
        int separator = str.AsSpan().IndexOfAny(s_myChars);

        // Also covers "not found" (-1), which is below the minimum width.
        if (separator < minWidthSize || separator > str.Length - (minHeightSize + 1))
        {
            return false;
        }

        // Every character other than the separator must be a digit.
        for (int i = 0; i < str.Length; i++)
        {
            if (i != separator && !char.IsDigit(str[i]))
            {
                return false;
            }
        }

        return true;
    }

    // Remaining lengths: the "###p" shape.
    if (str[^1] is not ('p' or 'P'))
    {
        return false;
    }

    return str[..^1].All(char.IsDigit);
}
/// <summary>
/// Tells whether keywords of the given <code>category</code> take part in the keyword search.
/// </summary>
public static bool IsElementCategorySearchable(ElementCategory category)
{
    switch (category)
    {
        case ElementCategory.ElementAnimeSeasonPrefix:
        case ElementCategory.ElementAnimeType:
        case ElementCategory.ElementAudioTerm:
        case ElementCategory.ElementDeviceCompatibility:
        case ElementCategory.ElementEpisodePrefix:
        case ElementCategory.ElementFileChecksum:
        case ElementCategory.ElementLanguage:
        case ElementCategory.ElementOther:
        case ElementCategory.ElementReleaseGroup:
        case ElementCategory.ElementReleaseInformation:
        case ElementCategory.ElementReleaseVersion:
        case ElementCategory.ElementSource:
        case ElementCategory.ElementSubtitles:
        case ElementCategory.ElementVideoResolution:
        case ElementCategory.ElementVideoTerm:
        case ElementCategory.ElementVolumePrefix:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// Returns whether the <code>category</code> is singular. The keyword search skips a keyword
/// whose category is singular and already has an element.
/// </summary>
/// <param name="category">the category to test</param>
/// <returns>false for the categories listed below, which may occur several times; true for every other category</returns>
public static bool IsElementCategorySingular(ElementCategory category)
{
    return category switch
    {
        ElementCategory.ElementAnimeSeason or ElementCategory.ElementAnimeType or ElementCategory.ElementAudioTerm
            or ElementCategory.ElementDeviceCompatibility or ElementCategory.ElementEpisodeNumber
            or ElementCategory.ElementLanguage or ElementCategory.ElementOther
            or ElementCategory.ElementReleaseInformation or ElementCategory.ElementSource
            or ElementCategory.ElementVideoTerm => false,
        // Everything not listed above is singular. This arm previously returned false as well,
        // which made the method constant and the caller's duplicate check dead code.
        _ => true
    };
}
/// <summary>
/// Returns whether or not the token at <code>pos</code> is isolated, meaning that its nearest
/// non-delimiter neighbours on both sides are bracket tokens.
/// </summary>
/// <param name="pos">the index of the token inside the parser's token list</param>
/// <returns>true if both the previous and the next non-delimiter tokens are brackets</returns>
public bool IsTokenIsolated(int pos)
{
    int before = Token.FindPrevToken(parser.Tokens, pos, Token.TokenFlag.FlagNotDelimiter);
    if (IsTokenCategory(before, Token.TokenCategory.Bracket))
    {
        // Only look to the right once the left side is known to be a bracket.
        int after = Token.FindNextToken(parser.Tokens, pos, Token.TokenFlag.FlagNotDelimiter);
        return IsTokenCategory(after, Token.TokenCategory.Bracket);
    }

    return false;
}
/// <summary>
/// Tries to resolve the season number that belongs to a season keyword token.
/// The previous non-delimiter token is tried first as an ordinal, then the next one as a plain number.
/// </summary>
/// <param name="token">the season keyword token</param>
/// <param name="currentTokenPos">the index of <code>token</code> in the parser's token list</param>
/// <returns>true if a season element was added; false otherwise</returns>
public bool CheckAndSetAnimeSeasonKeyword(Token token, int currentTokenPos)
{
    var tokens = parser.Tokens;

    // Ordinal in front of the keyword.
    int before = Token.FindPrevToken(tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (Token.InListRange(before, tokens))
    {
        var ordinalToken = tokens[before];
        string season = GetNumberFromOrdinal(ordinalToken.Content);
        if (!string.IsNullOrEmpty(season))
        {
            RegisterSeason(ordinalToken, token, season);
            return true;
        }
    }

    // Plain number behind the keyword.
    int after = Token.FindNextToken(tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (Token.InListRange(after, tokens))
    {
        var numberToken = tokens[after];
        if (StringHelper.IsNumericString(numberToken.Content))
        {
            RegisterSeason(token, numberToken, numberToken.Content);
            return true;
        }
    }

    return false;

    // Adds the season element and marks both participating tokens as identified.
    void RegisterSeason(Token first, Token second, string content)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, content));
        first.Category = Token.TokenCategory.Identifier;
        second.Category = Token.TokenCategory.Identifier;
    }
}
/// <summary>
/// Finds the volume/episode number that follows a prefix keyword (i.e. Vol.4).
/// The next non-delimiter token must be an unknown token that starts with a digit.
/// </summary>
/// <param name="category">the category we're searching for; only episode and volume numbers trigger number extraction</param>
/// <param name="currentTokenPos">the current token position</param>
/// <param name="token">the keyword token; marked as an identifier on success</param>
/// <returns>true if a digit-leading unknown token follows the keyword; false otherwise</returns>
public bool CheckExtentKeyword(ElementCategory category, int currentTokenPos, Token token)
{
    int numberIndex = Token.FindNextToken(parser.Tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (!IsTokenCategory(numberIndex, Token.TokenCategory.Unknown))
    {
        return false;
    }

    var numberToken = parser.Tokens[numberIndex];
    if (IndexOfFirstDigit(numberToken.Content) != 0)
    {
        return false;
    }

    // Try the richer patterns first; fall back to taking the content verbatim, without validation.
    if (category == ElementCategory.ElementEpisodeNumber)
    {
        if (!parser.ParseNumber.MatchEpisodePatterns(numberToken.Content, numberToken))
        {
            parser.ParseNumber.SetEpisodeNumber(numberToken.Content, numberToken, false);
        }
    }
    else if (category == ElementCategory.ElementVolumeNumber)
    {
        if (!parser.ParseNumber.MatchVolumePatterns(numberToken.Content, numberToken))
        {
            parser.ParseNumber.SetVolumeNumber(numberToken.Content, numberToken, false);
        }
    }

    token.Category = Token.TokenCategory.Identifier;
    return true;
}
/// <summary>
/// Concatenates the given <code>tokens</code> into a single element value and adds it to the parser's
/// elements under <code>category</code>. Unknown tokens that are consumed are marked as identifiers.
/// </summary>
/// <param name="category">the category of the element to create</param>
/// <param name="keepDelimiters">true to copy the first character of each delimiter token verbatim; false to
/// keep only "," and "&amp;", turn every other delimiter into a space and trim the result</param>
/// <param name="tokens">the tokens that make up the element</param>
public void BuildElement(ElementCategory category, bool keepDelimiters, List<Token> tokens)
{
    var builder = new StringBuilder();

    for (int i = 0; i < tokens.Count; i++)
    {
        var current = tokens[i];

        if (current.Category == Token.TokenCategory.Unknown)
        {
            builder.Append(current.Content);
            current.Category = Token.TokenCategory.Identifier;
        }
        else if (current.Category == Token.TokenCategory.Bracket)
        {
            builder.Append(current.Content);
        }
        else if (current.Category == Token.TokenCategory.Delimiter)
        {
            // Only the first character of a delimiter token is considered.
            string delimiter = string.IsNullOrEmpty(current.Content) ? "" : current.Content[0].ToString();

            if (keepDelimiters)
            {
                builder.Append(delimiter);
            }
            else if (Token.InListRange(i, tokens))
            {
                builder.Append(delimiter is "," or "&" ? delimiter : " ");
            }
        }
        // Tokens of any other category contribute nothing.
    }

    string value = builder.ToString();
    if (!keepDelimiters)
    {
        value = value.Trim(DashesWithSpace.ToCharArray());
    }

    if (!string.IsNullOrEmpty(value))
    {
        parser.Elements.Add(new Element(category, value));
    }
}
}

View File

@@ -0,0 +1,681 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
using System.Text.RegularExpressions;
namespace AnitomySharp;
/// <summary>
/// A utility class to assist in number parsing.
/// </summary>
public partial class ParserNumber(Parser parser)
{
// Inclusive-looking year bounds. AnimeYearMin is not referenced inside this class;
// presumably both are consumed by year detection elsewhere in the parser (TODO confirm).
public const int AnimeYearMin = 1900;
public const int AnimeYearMax = 2100;
// Largest value IsValidEpisodeNumber accepts; derived from AnimeYearMax.
private const int EpisodeNumberMax = AnimeYearMax - 1;
// Largest value IsValidVolumeNumber accepts.
private const int VolumeNumberMax = 50;
// Prefix/suffix that wrap a pattern in a non-capturing group anchored at the very start (\A)
// and very end (\z) of the input, so the pattern must match the whole string.
private const string RegexMatchOnlyStart = @"\A(?:";
private const string RegexMatchOnlyEnd = @")\z";
/// <summary>
/// Returns whether or not the <code>number</code> is an acceptable volume number,
/// i.e. its integer value does not exceed the volume maximum.
/// </summary>
/// <param name="number">the textual number; text that cannot be converted counts as 0</param>
private static bool IsValidVolumeNumber(string number)
{
    int volume = StringHelper.StringToInt(number);
    return volume <= VolumeNumberMax;
}
/// <summary>
/// Returns whether or not the <code>number</code> is a valid episode number.
/// Only the leading run of digits is considered (e.g. "12" out of "12v2").
/// </summary>
/// <param name="number">the textual episode number; may carry a non-numeric suffix</param>
/// <returns>
/// true if the leading digits form a number not greater than the episode maximum; false when there are
/// no leading digits, when the input is null/empty, or when the digits cannot be parsed
/// </returns>
private static bool IsValidEpisodeNumber(string number)
{
    if (string.IsNullOrEmpty(number))
    {
        return false;
    }

    // Measure the numeric prefix instead of rebuilding it one character at a time.
    int digitCount = 0;
    while (digitCount < number.Length && char.IsDigit(number[digitCount]))
    {
        digitCount++;
    }

    if (digitCount == 0)
    {
        return false;
    }

    // char.IsDigit also accepts non-ASCII decimal digits (e.g. full-width digits), which the numeric
    // parser rejects. TryParse turns that case into "not valid" rather than a FormatException.
    return double.TryParse(
               number.AsSpan(0, digitCount),
               System.Globalization.NumberStyles.None,
               System.Globalization.CultureInfo.InvariantCulture,
               out double value)
           && value <= EpisodeNumberMax;
}
/// <summary>
/// Records <code>number</code> as an alternative episode number and marks the token as identified.
/// </summary>
/// <param name="number">the alternative episode number</param>
/// <param name="token">the token the number was taken from</param>
private void SetAlternativeEpisodeNumber(string number, Token token)
{
    var alternative = new Element(ElementCategory.ElementEpisodeNumberAlt, number);
    parser.Elements.Add(alternative);
    token.Category = Token.TokenCategory.Identifier;
}
/// <summary>
/// Records a volume number element and marks the token as identified.
/// </summary>
/// <param name="number">the volume number</param>
/// <param name="token">the token which contains the volume number</param>
/// <param name="validate">true to reject numbers above the volume maximum; false to accept the number as is</param>
/// <returns>true if the volume number was set; false if validation rejected it</returns>
public bool SetVolumeNumber(string number, Token token, bool validate)
{
    bool accepted = !validate || IsValidVolumeNumber(number);
    if (accepted)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementVolumeNumber, number));
        token.Category = Token.TokenCategory.Identifier;
    }

    return accepted;
}
/// <summary>
/// Records an episode number element and marks the token as identified.
/// When episode keywords were found and an episode number already exists, the smaller of the two numbers
/// stays the episode number and the larger becomes the alternative episode number.
/// </summary>
/// <param name="number">the episode number</param>
/// <param name="token">the token which contains the episode number</param>
/// <param name="validate">true if we should check if it's a valid episode number; false to disable validation</param>
/// <returns>
/// true if an element was added; false if validation failed or the number equals the existing episode
/// number (in the latter case the token has already been marked as identified)
/// </returns>
public bool SetEpisodeNumber(string number, Token token, bool validate)
{
    if (validate && !IsValidEpisodeNumber(number))
    {
        return false;
    }

    token.Category = Token.TokenCategory.Identifier;
    var targetCategory = ElementCategory.ElementEpisodeNumber;

    if (parser.IsEpisodeKeywordsFound)
    {
        // Only the first existing episode number takes part in the comparison.
        var existing = parser.Elements.FirstOrDefault(e => e.Category == ElementCategory.ElementEpisodeNumber);
        if (existing is not null)
        {
            int difference = StringHelper.StringToInt(number) - StringHelper.StringToInt(existing.Value);
            if (difference == 0)
            {
                // Same number again: nothing to add.
                return false;
            }

            if (difference > 0)
            {
                targetCategory = ElementCategory.ElementEpisodeNumberAlt;
            }
            else
            {
                existing.Category = ElementCategory.ElementEpisodeNumberAlt;
            }
        }
    }

    parser.Elements.Add(new Element(targetCategory, number));
    return true;
}
/// <summary>
/// Checks whether the <code>token</code> consists of a known prefix keyword followed by a number
/// (everything from the first digit on), and records that number.
/// </summary>
/// <param name="category">the prefix category to look the keyword up in; only the episode and volume prefix categories are handled</param>
/// <param name="token">the token</param>
/// <returns>true if the prefix is a keyword of a handled category; false otherwise</returns>
private bool NumberComesAfterPrefix(ElementCategory category, Token token)
{
    string content = token.Content;
    int digitsStart = ParserHelper.IndexOfFirstDigit(content);

    // Keywords are looked up in upper case.
    string keyword = StringHelper.SubstringWithCheck(content, 0, digitsStart).ToUpperInvariant();
    if (!KeywordManager.Contains(category, keyword))
    {
        return false;
    }

    string digits = StringHelper.SubstringWithCheck(content, digitsStart, content.Length - digitsStart);

    if (category == ElementCategory.ElementEpisodePrefix)
    {
        if (!MatchEpisodePatterns(digits, token))
        {
            SetEpisodeNumber(digits, token, false);
        }

        return true;
    }

    if (category == ElementCategory.ElementVolumePrefix)
    {
        if (!MatchVolumePatterns(digits, token))
        {
            SetVolumeNumber(digits, token, false);
        }

        return true;
    }

    return false;
}
/// <summary>
/// Checks whether the number token is followed by a known separator ("&amp;" or "of") and another number,
/// e.g. "8 &amp; 10" or "01 of 24". With "&amp;" both numbers are recorded as episode numbers,
/// with "of" only the first one.
/// </summary>
/// <param name="token">the token holding the first number</param>
/// <param name="currentTokenIdx">the index of the token</param>
/// <returns>true if the separator and a following numeric token were found; false otherwise</returns>
private bool NumberComesBeforeAnotherNumber(Token token, int currentTokenIdx)
{
    var tokens = parser.Tokens;

    int separatorIdx = Token.FindNextToken(tokens, currentTokenIdx, Token.TokenFlag.FlagNotDelimiter);
    if (!Token.InListRange(separatorIdx, tokens))
    {
        return false;
    }

    // Separator text, and whether the number after it is an episode number as well.
    (string Text, bool SecondIsEpisode)[] knownSeparators = [("&", true), ("of", false)];

    foreach (var (text, secondIsEpisode) in knownSeparators)
    {
        if (tokens[separatorIdx].Content != text)
        {
            continue;
        }

        int otherIdx = Token.FindNextToken(tokens, separatorIdx, Token.TokenFlag.FlagNotDelimiter);
        bool otherIsNumber = Token.InListRange(otherIdx, tokens)
                             && StringHelper.IsNumericString(tokens[otherIdx].Content);
        if (!otherIsNumber)
        {
            continue;
        }

        SetEpisodeNumber(token.Content, token, false);
        if (secondIsEpisode)
        {
            SetEpisodeNumber(tokens[otherIdx].Content, tokens[otherIdx], false);
        }

        // Both the separator and the second number are consumed either way.
        tokens[separatorIdx].Category = Token.TokenCategory.Identifier;
        tokens[otherIdx].Category = Token.TokenCategory.Identifier;
        return true;
    }

    return false;
}
// EPISODE MATCHERS
/// <summary>
/// Attempts to find an episode/season inside a <code>word</code> by trying the known episode
/// patterns in a fixed order.
/// </summary>
/// <param name="word">the word; leading and trailing spaces and dashes are ignored</param>
/// <param name="token">the token</param>
/// <returns>
/// true if the word was matched to an episode/season number; false if no pattern matched, if the
/// word is purely numeric, or if nothing remains after trimming
/// </returns>
public bool MatchEpisodePatterns(string word, Token token)
{
    // All patterns contain at least one non-numeric character
    if (StringHelper.IsNumericString(word))
    {
        return false;
    }

    word = word.Trim(' ', '-');

    // A word made only of spaces/dashes is empty now; indexing it below would throw.
    if (word.Length == 0)
    {
        return false;
    }

    bool numericFront = char.IsDigit(word[0]);
    bool numericBack = char.IsDigit(word[^1]);

    if (numericFront && numericBack)
    {
        // e.g. "01v2"
        if (MatchSingleEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "01-02", "03-05v2"
        if (MatchMultiEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "07.5"
        if (MatchFractionalEpisodePattern(word, token))
        {
            return true;
        }
    }

    if (numericBack)
    {
        // e.g. "2x01", "S01E03", "S01-02xE001-150"
        if (MatchSeasonAndEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "#01", "#02-03v2"
        if (MatchNumberSignPattern(word, token))
        {
            return true;
        }
    }

    // e.g. "ED1", "OP4a", "OVA2"
    if (!numericFront && MatchTypeAndEpisodePattern(word, token))
    {
        return true;
    }

    // e.g. "4a", "111C"
    if (numericFront && !numericBack && MatchPartialEpisodePattern(word, token))
    {
        return true;
    }

    // U+8A71 is used as counter for stories, episodes of TV series, etc.
    return numericFront && MatchJapaneseCounterPattern(word, token);
}
/// <summary>
/// Matches an episode number with a release version, e.g. "01v2", and records both.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSingleEpisodePattern(string word, Token token)
{
    var result = SingleEpisodeRegex().Match(word);
    if (result.Success)
    {
        // Group 1 is the episode, group 2 the version digit.
        var groups = result.Groups;
        SetEpisodeNumber(groups[1].Value, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, groups[2].Value));
    }

    return result.Success;
}
/// <summary>
/// Matches an episode range, e.g. "01-02", "03-05v2", and records both bounds plus any release versions.
/// The range is only accepted when the first number is strictly smaller than the second.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchMultiEpisodePattern(string word, Token token)
{
    var result = MultiEpisodeRegex().Match(word);
    if (!result.Success)
    {
        return false;
    }

    var groups = result.Groups;
    string first = groups[1].Value;
    string last = groups[3].Value;

    bool ascending = StringHelper.StringToInt(first) < StringHelper.StringToInt(last);
    if (!ascending || !SetEpisodeNumber(first, token, true))
    {
        return false;
    }

    SetEpisodeNumber(last, token, true);

    // Groups 2 and 4 are the optional version digits of the first and last episode.
    foreach (string version in new[] { groups[2].Value, groups[4].Value })
    {
        if (!string.IsNullOrEmpty(version))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
        }
    }

    return true;
}
/// <summary>
/// Matches combined season and episode notations, e.g. "2x01", "S01E03", "S01-02xE001-150",
/// and records every season and episode number found.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSeasonAndEpisodePattern(string word, Token token)
{
    var result = SeasonEpisodeRegex().Match(word);
    if (!result.Success)
    {
        return false;
    }

    var groups = result.Groups;

    // Groups 1-2: first and optional second season.
    AddSeason(groups[1].Value);
    if (!string.IsNullOrEmpty(groups[2].Value))
    {
        AddSeason(groups[2].Value);
    }

    // Groups 3-4: first and optional second episode.
    SetEpisodeNumber(groups[3].Value, token, false);
    if (!string.IsNullOrEmpty(groups[4].Value))
    {
        SetEpisodeNumber(groups[4].Value, token, false);
    }

    return true;

    void AddSeason(string season) =>
        parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, season));
}
/// <summary>
/// Matches an anime type keyword immediately followed by a number, e.g. "ED1", "OP4a", "OVA2".
/// On success the token keeps only the number, and a new token holding the type prefix is inserted in front of it.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchTypeAndEpisodePattern(string word, Token token)
{
    int digitsStart = ParserHelper.IndexOfFirstDigit(word);
    string typeName = StringHelper.SubstringWithCheck(word, 0, digitsStart);

    var category = ElementCategory.ElementAnimeType;
    var options = new KeywordOptions();
    bool knownType = KeywordManager.FindAndSet(KeywordManager.Normalize(typeName), ref category, ref options);
    if (!knownType)
    {
        return false;
    }

    parser.Elements.Add(new Element(ElementCategory.ElementAnimeType, typeName));

    // The remainder is either a richer episode pattern or a plain, validated episode number.
    string episode = word.Substring(digitsStart);
    bool numbered = MatchEpisodePatterns(episode, token) || SetEpisodeNumber(episode, token, true);
    if (!numbered)
    {
        return false;
    }

    int position = parser.Tokens.IndexOf(token);
    if (position != -1)
    {
        // Split the token: the number stays in place, the type prefix becomes its own token before it.
        token.Content = episode;
        var prefixCategory = options.Identifiable ? Token.TokenCategory.Identifier : Token.TokenCategory.Unknown;
        parser.Tokens.Insert(position, new Token(prefixCategory, typeName, token.Enclosed));
    }

    return true;
}
/// <summary>
/// Matches half episodes, e.g. "07.5": one or more digits followed by ".5".
/// </summary>
/// <param name="word">the word; null is treated as an empty word</param>
/// <param name="token">the token</param>
/// <returns>true if the word matched and passed episode number validation</returns>
private bool MatchFractionalEpisodePattern(string word, Token token)
{
    string candidate = string.IsNullOrEmpty(word) ? "" : word;

    const string regexPattern = RegexMatchOnlyStart + @"\d+\.5" + RegexMatchOnlyEnd;
    if (!Regex.IsMatch(candidate, regexPattern))
    {
        return false;
    }

    return SetEpisodeNumber(candidate, token, true);
}
/// <summary>
/// Matches partial episodes, e.g. "4a", "111C": digits followed by exactly one letter from A-C (either case).
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the word has that shape and passed episode number validation</returns>
private bool MatchPartialEpisodePattern(string word, Token token)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    // Skip the leading digits; whatever is left is the suffix.
    int suffixStart = 0;
    while (suffixStart < word.Length && char.IsDigit(word[suffixStart]))
    {
        suffixStart++;
    }

    if (word.Length - suffixStart != 1)
    {
        return false;
    }

    char suffix = word[suffixStart];
    bool validSuffix = suffix is (>= 'A' and <= 'C') or (>= 'a' and <= 'c');
    return validSuffix && SetEpisodeNumber(word, token, true);
}
/// <summary>
/// Matches episodes introduced by a number sign, e.g. "#01", "#02-03v2".
/// </summary>
/// <param name="word">the word; anything not starting with '#' never matches</param>
/// <param name="token">the token</param>
/// <returns>true if the word matched and the first number passed episode number validation</returns>
private bool MatchNumberSignPattern(string word, Token token)
{
    // Words without a leading '#' are replaced by an empty string, which the pattern rejects.
    string candidate = !string.IsNullOrEmpty(word) && word[0] == '#' ? word : "";

    var result = NumberEpisodeRegex().Match(candidate);
    if (!result.Success)
    {
        return false;
    }

    var groups = result.Groups;
    if (!SetEpisodeNumber(groups[1].Value, token, true))
    {
        return false;
    }

    // Group 2: optional second episode of a range (taken without validation).
    string second = groups[2].Value;
    if (!string.IsNullOrEmpty(second))
    {
        SetEpisodeNumber(second, token, false);
    }

    // Group 3: optional release version.
    string version = groups[3].Value;
    if (!string.IsNullOrEmpty(version))
    {
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
    }

    return true;
}
/// <summary>
/// Matches a number followed by the Japanese counter U+8A71, which is used as counter for stories,
/// episodes of TV series, etc.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchJapaneseCounterPattern(string word, Token token)
{
    // Cheap pre-check before running the regex.
    bool endsWithCounter = !string.IsNullOrEmpty(word) && word[^1] == '\u8A71';
    if (!endsWithCounter)
    {
        return false;
    }

    var result = JapaneseEpisodeRegex().Match(word);
    if (result.Success)
    {
        SetEpisodeNumber(result.Groups[1].Value, token, false);
    }

    return result.Success;
}
// VOLUME MATCHERS
/// <summary>
/// Attempts to find a volume number inside a <code>word</code> by trying the known volume patterns.
/// </summary>
/// <param name="word">the word; leading and trailing spaces and dashes are ignored</param>
/// <param name="token">the token</param>
/// <returns>
/// true if the word was matched to a volume number (or volume range); false if no pattern matched,
/// if the word is purely numeric, or if nothing remains after trimming
/// </returns>
public bool MatchVolumePatterns(string word, Token token)
{
    // All patterns contain at least one non-numeric character
    if (StringHelper.IsNumericString(word))
    {
        return false;
    }

    word = word.Trim(' ', '-');

    // A word made only of spaces/dashes is empty now; indexing it below would throw.
    if (word.Length == 0)
    {
        return false;
    }

    bool numericFront = char.IsDigit(word[0]);
    bool numericBack = char.IsDigit(word[^1]);

    // Both volume patterns start and end with a digit.
    if (!numericFront || !numericBack)
    {
        return false;
    }

    // e.g. "01v2" e.g. "01-02", "03-05v2"
    return MatchSingleVolumePattern(word, token) || MatchMultiVolumePattern(word, token);
}
/// <summary>
/// Matches a volume number with a release version, e.g. "01v2", and records both.
/// </summary>
/// <param name="word">the word; null is treated as an empty word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSingleVolumePattern(string word, Token token)
{
    string candidate = string.IsNullOrEmpty(word) ? "" : word;

    var result = SingleVolumeRegex().Match(candidate);
    if (result.Success)
    {
        // Group 1 is the volume, group 2 the version digit.
        var groups = result.Groups;
        SetVolumeNumber(groups[1].Value, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, groups[2].Value));
    }

    return result.Success;
}
/// <summary>
/// Matches a volume range, e.g. "01-02", "03-05v2", and records both bounds plus the release
/// version when one is present. The range is only accepted when the first number is strictly
/// smaller than the second.
/// </summary>
/// <param name="word">the word; null is treated as an empty word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched and the lower bound passed volume number validation</returns>
private bool MatchMultiVolumePattern(string word, Token token)
{
    var match = MultiVolumeRegex().Match(word ?? "");
    if (!match.Success)
    {
        return false;
    }

    string lowerBound = match.Groups[1].Value;
    string upperBound = match.Groups[2].Value;

    // Reject descending or degenerate ranges.
    if (StringHelper.StringToInt(lowerBound) >= StringHelper.StringToInt(upperBound))
    {
        return false;
    }

    if (!SetVolumeNumber(lowerBound, token, true))
    {
        return false;
    }

    SetVolumeNumber(upperBound, token, false);

    // Group 3 is the optional "vN" suffix. Record it only when it matched; the check used to be
    // inverted, adding an empty version for "01-02" and dropping the real one for "03-05v2".
    string version = match.Groups[3].Value;
    if (!string.IsNullOrEmpty(version))
    {
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
    }

    return true;
}
// SEARCH
/// <summary>
/// Searches the given token indices for an enclosed, isolated token and records the first one
/// that passes episode number validation as the episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if an isolated number was found</returns>
public bool SearchForIsolatedNumbers(IEnumerable<int> tokens)
{
    foreach (int index in tokens)
    {
        bool candidate = parser.Tokens[index].Enclosed && parser.ParseHelper.IsTokenIsolated(index);
        if (!candidate)
        {
            continue;
        }

        if (SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Searches the given token indices for a number that is preceded by a dash separator token
/// and records the first valid one as the episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if a separated number was found</returns>
public bool SearchForSeparatedNumbers(List<int> tokens)
{
    foreach (int index in tokens)
    {
        int dashIndex = Token.FindPrevToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);

        // The previous non-delimiter token must be an unknown token starting with a dash character.
        bool precededByDash = parser.ParseHelper.IsTokenCategory(dashIndex, Token.TokenCategory.Unknown)
                              && ParserHelper.IsDashCharacter(parser.Tokens[dashIndex].Content[0]);

        if (precededByDash && SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true))
        {
            parser.Tokens[dashIndex].Category = Token.TokenCategory.Identifier;
            return true;
        }
    }

    return false;
}
/// <summary>
/// Searches the given token indices for episode patterns: prefixed numbers, number pairs,
/// and the word-level episode patterns.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if an episode number was found</returns>
public bool SearchForEpisodePatterns(List<int> tokens)
{
    foreach (int index in tokens)
    {
        var current = parser.Tokens[index];
        bool startsWithDigit = current.Content.Length > 0 && char.IsDigit(current.Content[0]);

        if (startsWithDigit)
        {
            // e.g. "8 & 10", "01 of 24"
            if (NumberComesBeforeAnotherNumber(current, index))
            {
                return true;
            }
        }
        else
        {
            // e.g. "EP.1"
            if (NumberComesAfterPrefix(ElementCategory.ElementEpisodePrefix, current))
            {
                return true;
            }

            // e.g. "Vol.1": a volume was recorded, keep looking for the episode in later tokens.
            if (NumberComesAfterPrefix(ElementCategory.ElementVolumePrefix, current))
            {
                continue;
            }
        }

        // Look for other patterns
        if (MatchEpisodePatterns(current.Content, current))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Searches the given token indices for a number followed by an equivalent, bracketed number, e.g. 08(114).
/// The smaller of the two becomes the episode number, the larger the alternative episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if an equivalent number was found</returns>
public bool SearchForEquivalentNumbers(List<int> tokens)
{
    foreach (int index in tokens)
    {
        var helper = parser.ParseHelper;

        // The first number must NOT be isolated itself, and must be a valid episode number.
        if (helper.IsTokenIsolated(index) || !IsValidEpisodeNumber(parser.Tokens[index].Content))
        {
            continue;
        }

        // The next non-delimiter token has to be a bracket ...
        int bracketIdx = Token.FindNextToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);
        if (!helper.IsTokenCategory(bracketIdx, Token.TokenCategory.Bracket))
        {
            continue;
        }

        // ... followed by an enclosed, non-delimiter token of unknown category.
        int partnerIdx = Token.FindNextToken(parser.Tokens, bracketIdx, Token.TokenFlag.FlagEnclosed,
            Token.TokenFlag.FlagNotDelimiter);
        if (!helper.IsTokenCategory(partnerIdx, Token.TokenCategory.Unknown))
        {
            continue;
        }

        // The bracketed token must be an isolated, purely numeric, valid episode number.
        bool partnerUsable = helper.IsTokenIsolated(partnerIdx)
                             && StringHelper.IsNumericString(parser.Tokens[partnerIdx].Content)
                             && IsValidEpisodeNumber(parser.Tokens[partnerIdx].Content);
        if (!partnerUsable)
        {
            continue;
        }

        List<Token> pair = [parser.Tokens[index], parser.Tokens[partnerIdx]];
        pair.Sort((left, right) => StringHelper.StringToInt(left.Content) - StringHelper.StringToInt(right.Content));

        SetEpisodeNumber(pair[0].Content, pair[0], false);
        SetAlternativeEpisodeNumber(pair[1].Content, pair[1]);
        return true;
    }

    return false;
}
/// <summary>
/// Walks the given token indices from last to first and records the first acceptable number
/// as the episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if the last number token was found</returns>
public bool SearchForLastNumber(List<int> tokens)
{
    int cursor = tokens.Count;
    while (--cursor >= 0)
    {
        int index = tokens[cursor];

        // Assuming that episode number always comes after the title,
        // the first token cannot be what we're looking for. Enclosed tokens are skipped as well.
        if (index == 0 || parser.Tokens[index].Enclosed)
        {
            continue;
        }

        // Ignore if it's the first non-enclosed, non-delimiter token
        bool nothingUsableBefore = parser.Tokens
            .Take(index)
            .All(t => t.Enclosed || t.Category == Token.TokenCategory.Delimiter);
        if (nothingUsableBefore)
        {
            continue;
        }

        // A number right after "Movie" or "Part" is not an episode number.
        int previousIdx = Token.FindPrevToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);
        if (parser.ParseHelper.IsTokenCategory(previousIdx, Token.TokenCategory.Unknown))
        {
            string previousText = parser.Tokens[previousIdx].Content;
            if (previousText.Equals("Movie", StringComparison.InvariantCultureIgnoreCase)
                || previousText.Equals("Part", StringComparison.InvariantCultureIgnoreCase))
            {
                continue;
            }
        }

        // We'll use this number after all
        if (SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true))
        {
            return true;
        }
    }

    return false;
}
// Source-generated regexes used by the matchers above. Every pattern is wrapped in \A(?: ... )\z and
// therefore has to match the whole input. No RegexOptions are passed, so matching is case-sensitive
// except where a character class lists both cases explicitly.

// Episode with a release version, e.g. "01v2". Group 1: episode (1-3 digits); group 2: version digit.
[GeneratedRegex(@"\A(?:(\d{1,3})[vV](\d))\z")]
private static partial Regex SingleEpisodeRegex();
// Episode range, e.g. "01-02", "03-05v2". Groups 1/3: first/last episode; groups 2/4: their optional version digits.
[GeneratedRegex(@"\A(?:(\d{1,3})(?:[vV](\d))?[-~&+](\d{1,3})(?:[vV](\d))?)\z")]
private static partial Regex MultiEpisodeRegex();
// Season plus episode, e.g. "2x01", "S01E03". Groups 1/2: season and optional second season;
// groups 3/4: episode and optional second episode.
// NOTE(review): inside "[ ._-x]" the part "_-x" is a character RANGE ('_' through 'x'), not the three
// literals '_', '-' and 'x' - confirm that this is intended.
[GeneratedRegex(@"\A(?:S?(\d{1,2})(?:-S?(\d{1,2}))?(?:x|[ ._-x]?E)(\d{1,3})(?:-E?(\d{1,3}))?)\z")]
private static partial Regex SeasonEpisodeRegex();
// Number-sign episode, e.g. "#01", "#02-03v2". Group 1: episode; group 2: optional second episode;
// group 3: optional version digit.
[GeneratedRegex(@"\A(?:#(\d{1,3})(?:[-~&+](\d{1,3}))?(?:[vV](\d))?)\z")]
private static partial Regex NumberEpisodeRegex();
// Number followed by the Japanese counter character. Group 1: episode.
[GeneratedRegex(@"\A(?:(\d{1,3})話)\z")]
private static partial Regex JapaneseEpisodeRegex();
// Volume with a release version, e.g. "01v2". Group 1: volume (1-2 digits); group 2: version digit.
[GeneratedRegex(@"\A(?:(\d{1,2})[vV](\d))\z")]
private static partial Regex SingleVolumeRegex();
// Volume range, e.g. "01-02", "03-05v2". Groups 1/2: first/last volume; group 3: optional version digit.
[GeneratedRegex(@"\A(?:(\d{1,2})[-~&+](\d{1,2})(?:[vV](\d))?)\z")]
private static partial Regex MultiVolumeRegex();
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// A string helper class that is analogous to <code>string.cpp</code> of the original Anitomy, and <code>StringHelper.java</code> of AnitomyJ.
/// </summary>
public static class StringHelper
{
    /// <summary>
    /// Returns whether or not the character is an ASCII letter or an ASCII digit.
    /// </summary>
    public static bool IsAlphanumericChar(char c)
    {
        return c is >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z';
    }

    /// <summary>
    /// Returns whether or not the character is a hex character (0-9, A-F, a-f).
    /// </summary>
    private static bool IsHexadecimalChar(char c)
    {
        return c is >= '0' and <= '9' or >= 'A' and <= 'F' or >= 'a' and <= 'f';
    }

    /// <summary>
    /// Returns whether or not the character is a latin character
    /// </summary>
    private static bool IsLatinChar(char c)
    {
        // We're just checking until the end of the Latin Extended-B block,
        // rather than all the blocks that belong to the Latin script.
        return c <= '\u024F';
    }

    /// <summary>
    /// Returns whether or not the <code>str</code> is a non-empty string made of hex characters only.
    /// </summary>
    public static bool IsHexadecimalString(string str)
    {
        return !string.IsNullOrEmpty(str) && str.All(IsHexadecimalChar);
    }

    /// <summary>
    /// Returns whether or not at least half of the characters of <code>str</code> are latin characters.
    /// </summary>
    /// <param name="str">the text to test</param>
    /// <returns>true if the share of latin characters is 50% or more; false for null or empty input</returns>
    public static bool IsMostlyLatinString(string str)
    {
        // Nothing to measure; this also keeps the division below away from a zero length.
        if (string.IsNullOrEmpty(str))
        {
            return false;
        }

        // The ratio must be taken over the full length. The divisor used to be 1.0 for every
        // non-empty string, so a single latin character was enough to return true.
        int latinCount = str.Count(c => IsLatinChar(c));
        return latinCount / (double)str.Length >= 0.5;
    }

    /// <summary>
    /// Returns whether or not the <code>str</code> consists of digit characters only.
    /// Note that an empty string yields true, because no character fails the check.
    /// </summary>
    public static bool IsNumericString(string str)
    {
        return str.All(char.IsDigit);
    }

    /// <summary>
    /// Returns the int value of the <code>str</code>; 0 otherwise.
    /// </summary>
    /// <param name="str">the text to convert; null, non-numeric and out-of-range text all yield 0</param>
    public static int StringToInt(string str)
    {
        // TryParse reports failure without throwing, so no exception is raised (or printed to
        // the console) for the expected case of non-numeric input.
        return int.TryParse(
            str,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int value)
            ? value
            : 0;
    }

    /// <summary>
    /// Returns the substring starting at <code>start</code> with at most <code>count</code> characters;
    /// a <code>count</code> that reaches past the end of <code>str</code> is shortened to the end.
    /// </summary>
    /// <param name="str">the source string</param>
    /// <param name="start">the zero-based start index</param>
    /// <param name="count">the maximum number of characters to take</param>
    /// <returns>the requested substring</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// thrown by <see cref="string.Substring(int, int)"/> when <code>start</code> or the resulting
    /// <code>count</code> is negative, or <code>start</code> lies past the end of <code>str</code>
    /// </exception>
    public static string SubstringWithCheck(string str, int start, int count)
    {
        if (start + count > str.Length)
        {
            count = str.Length - start;
        }

        return str.Substring(start, count);
    }
}

View File

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// An anime filename is tokenized into individual <see cref="Token"/>s. This class represents an individual token.
/// </summary>
public class Token
{
/// <summary>
/// The category of the token. Each category has a matching positive and negative
/// <see cref="TokenFlag"/> that <see cref="CheckTokenFlags"/> tests against.
/// </summary>
public enum TokenCategory
{
// First member, hence the default value of the enum.
Unknown,
// Tested by FlagBracket / FlagNotBracket.
Bracket,
// Tested by FlagDelimiter / FlagNotDelimiter.
Delimiter,
// Tested by FlagIdentifier / FlagNotIdentifier.
Identifier,
// Tested by FlagNotValid (token is Invalid) / FlagValid (token is anything else).
Invalid
}
/// <summary>
/// TokenFlag, used for searching specific token categories. This allows granular searching of TokenCategories.
/// This is a plain enum rather than a bit field: several flags are combined by passing them together
/// as a list/params array to the Find* methods.
/// </summary>
public enum TokenFlag
{
// None
FlagNone,
// Categories: each pair asks for "is this category" / "is not this category".
FlagBracket,
FlagNotBracket,
FlagDelimiter,
FlagNotDelimiter,
FlagIdentifier,
FlagNotIdentifier,
FlagUnknown,
FlagNotUnknown,
// FlagValid matches every category except Invalid; FlagNotValid matches Invalid only.
FlagValid,
FlagNotValid,
// Enclosed (Meaning that it is enclosed in some bracket (e.g. [ ] ))
FlagEnclosed,
FlagNotEnclosed
}
/// <summary>
/// Set of token category flags. <see cref="CheckTokenFlags"/> only performs its category check
/// when at least one of these flags was requested.
/// </summary>
private static readonly List<TokenFlag> s_flagMaskCategories =
[
TokenFlag.FlagBracket, TokenFlag.FlagNotBracket,
TokenFlag.FlagDelimiter, TokenFlag.FlagNotDelimiter,
TokenFlag.FlagIdentifier, TokenFlag.FlagNotIdentifier,
TokenFlag.FlagUnknown, TokenFlag.FlagNotUnknown,
TokenFlag.FlagValid, TokenFlag.FlagNotValid
];
/// <summary>
/// Set of token enclosed flags. <see cref="CheckTokenFlags"/> only performs its enclosure check
/// when at least one of these flags was requested.
/// </summary>
private static readonly List<TokenFlag> s_flagMaskEnclosed = [TokenFlag.FlagEnclosed, TokenFlag.FlagNotEnclosed];
/// <summary>The current category of the token; mutable, so a token can be re-categorized after construction.</summary>
public TokenCategory Category { get; set; }
/// <summary>The text of the token; mutable.</summary>
public string Content { get; set; }
/// <summary>Whether the token is enclosed in brackets; fixed at construction time.</summary>
public bool Enclosed { get; }
/// <summary>
/// Creates a token from its three parts.
/// </summary>
/// <param name="category">the token category</param>
/// <param name="content">the token content</param>
/// <param name="enclosed">whether or not the token is enclosed in braces</param>
public Token(TokenCategory category, string content, bool enclosed)
{
    (Category, Content, Enclosed) = (category, content, enclosed);
}
/// <summary>
/// Validates a token against the <code>flags</code>. The <code>flags</code> is used as a search parameter.
/// The enclosure requirement (if any) must hold, and if any category flag is present,
/// at least one of the requested category conditions must hold.
/// </summary>
/// <param name="token">the token</param>
/// <param name="flags">the flags the token must conform against</param>
/// <returns>true if the token conforms to the set of <code>flags</code>; false otherwise</returns>
private static bool CheckTokenFlags(Token token, ICollection<TokenFlag> flags)
{
    // Enclosure: only checked when one of the enclosed flags was requested.
    bool enclosureRequested = flags.Any(f => s_flagMaskEnclosed.Contains(f));
    if (enclosureRequested && Has(TokenFlag.FlagEnclosed) != token.Enclosed)
    {
        // Not enclosed correctly (e.g. enclosed when we're looking for non-enclosed).
        return false;
    }

    // Category: without any category flag the token is accepted as is.
    bool categoryRequested = flags.Any(f => s_flagMaskCategories.Contains(f));
    if (!categoryRequested)
    {
        return true;
    }

    // The first satisfied pair wins; later pairs are not evaluated.
    return Satisfies(TokenFlag.FlagBracket, TokenFlag.FlagNotBracket, TokenCategory.Bracket)
           || Satisfies(TokenFlag.FlagDelimiter, TokenFlag.FlagNotDelimiter, TokenCategory.Delimiter)
           || Satisfies(TokenFlag.FlagIdentifier, TokenFlag.FlagNotIdentifier, TokenCategory.Identifier)
           || Satisfies(TokenFlag.FlagUnknown, TokenFlag.FlagNotUnknown, TokenCategory.Unknown)
           || Satisfies(TokenFlag.FlagNotValid, TokenFlag.FlagValid, TokenCategory.Invalid);

    // The positive flag takes precedence over the negative one when both are present.
    bool Satisfies(TokenFlag positive, TokenFlag negative, TokenCategory category)
    {
        if (Has(positive))
        {
            return token.Category == category;
        }

        return Has(negative) && token.Category != category;
    }

    // Simple alias to check if flag is a part of the set
    bool Has(TokenFlag flag) => flags.Contains(flag);
}
/// <summary>
/// Given a list of <code>tokens</code>, searches forward from <code>begin</code> for a token that matches
/// the list of <code>flags</code>. Indices at or beyond <code>end</code> are not examined.
/// </summary>
/// <param name="tokens">the list of tokens</param>
/// <param name="begin">the search starting position (inclusive).</param>
/// <param name="end">the search ending position (exclusive); also handed to the base search unchanged.</param>
/// <param name="flags">the search flags</param>
/// <returns>the search result, as produced by the base search</returns>
public static int FindToken(List<Token> tokens, int begin, int end, params TokenFlag[] flags)
{
    // Stop at `end` (clamped to the list size so indexing stays in range). The continuation test used
    // to look at the list size only, so the scan ran past `end` whenever `end` was below tokens.Count.
    int limit = Math.Min(end, tokens.Count);
    return FindTokenBase(tokens, begin, end, i => i < limit, i => i + 1, flags);
}
/// <summary>
/// Searches <paramref name="tokens"/> forward for the first token after position <paramref name="first"/>
/// that matches <paramref name="flags"/>.
/// </summary>
/// <param name="tokens">the list of tokens</param>
/// <param name="first">the position to search after; the token at this position is not examined.</param>
/// <param name="flags">the search flags</param>
/// <returns>the index of the matching token, or <c>tokens.Count</c> when there is none</returns>
public static int FindNextToken(List<Token> tokens, int first, params TokenFlag[] flags)
{
    int start = first + 1;
    int notFound = tokens.Count;
    return FindTokenBase(tokens, start, notFound, InBounds, Advance, flags);

    bool InBounds(int index) => index < tokens.Count;

    static int Advance(int index) => index + 1;
}
/// <summary>
/// Searches <paramref name="tokens"/> backward for the closest token before position <paramref name="begin"/>
/// that matches <paramref name="flags"/>.
/// </summary>
/// <param name="tokens">the list of tokens</param>
/// <param name="begin">the position to search before; the token at this position is not examined.</param>
/// <param name="flags">the search flags</param>
/// <returns>the index of the matching token, or <c>-1</c> when there is none</returns>
public static int FindPrevToken(List<Token> tokens, int begin, params TokenFlag[] flags)
{
    const int NotFound = -1;
    int start = begin - 1;
    return FindTokenBase(tokens, start, NotFound, InBounds, StepBack, flags);

    static bool InBounds(int index) => index >= 0;

    static int StepBack(int index) => index - 1;
}
/// <summary>
/// Shared search loop: walks <paramref name="tokens"/> from <paramref name="begin"/> and returns the index of the
/// first token accepted by <see cref="CheckTokenFlags"/>.
/// </summary>
/// <param name="tokens">the list of the tokens to search</param>
/// <param name="begin">the index of the first token to examine.</param>
/// <param name="end">the value returned when no token matches; it does not bound the loop by itself.</param>
/// <param name="shouldContinue">given the current index, returns whether the token at that index should be examined</param>
/// <param name="next">given the current index, returns the next index to examine</param>
/// <param name="flags">the flags that each token is validated against</param>
/// <returns>the index of the first matching token, or <paramref name="end"/> when the walk finishes without a match</returns>
private static int FindTokenBase(
    List<Token> tokens,
    int begin,
    int end,
    Func<int, bool> shouldContinue,
    Func<int, int> next,
    params TokenFlag[] flags)
{
    // CheckTokenFlags works on a collection, so copy the params array once up front.
    var wanted = new List<TokenFlag>(flags);

    int index = begin;
    while (shouldContinue(index))
    {
        if (CheckTokenFlags(tokens[index], wanted))
        {
            return index;
        }

        index = next(index);
    }

    return end;
}
/// <summary>
/// Tells whether <paramref name="pos"/> is a valid index into <paramref name="list"/>.
/// </summary>
/// <param name="pos">the index to test</param>
/// <param name="list">the list the index refers to</param>
/// <returns>true if <paramref name="pos"/> is at least 0 and below <c>list.Count</c>; false otherwise</returns>
public static bool InListRange(int pos, List<Token> list) => pos >= 0 && pos < list.Count;
/// <summary>
/// Compares this token with another object by value.
/// </summary>
/// <param name="o">the object to compare with</param>
/// <returns>
/// true if <paramref name="o"/> is this very instance, or a <see cref="Token"/> with the same enclosed state,
/// category and content; false otherwise
/// </returns>
public override bool Equals(object? o)
{
    if (ReferenceEquals(this, o))
    {
        return true;
    }

    return o is Token other
        && Enclosed == other.Enclosed
        && Category == other.Category
        && Equals(Content, other.Content);
}
/// <summary>
/// Computes a hash code from the category, content and enclosed state, the same members
/// <see cref="Equals(object?)"/> compares.
/// </summary>
/// <returns>the hash code of this token</returns>
public override int GetHashCode()
{
    // The multiply/add chain is meant to wrap around. Make that explicit so the method
    // keeps working (instead of throwing OverflowException) when the project is compiled
    // with arithmetic overflow checking enabled. The produced values are unchanged.
    unchecked
    {
        int hashCode = -1776802967;
        hashCode = hashCode * -1521134295 + Category.GetHashCode();
        hashCode = hashCode * -1521134295 + EqualityComparer<string>.Default.GetHashCode(Content);
        hashCode = hashCode * -1521134295 + Enclosed.GetHashCode();
        return hashCode;
    }
}
/// <summary>
/// Formats the token's category, content and enclosed state for diagnostics.
/// </summary>
/// <returns>a string of the form <c>Token{category=..., content='...', enclosed=...}</c></returns>
public override string ToString() =>
    $"Token{{category={Category}, content='{Content}', enclosed={Enclosed}}}";
}

View File

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
namespace AnitomySharp;
/// <summary>
/// A span of characters described by a start offset and a length.
/// Both fields are publicly writable.
/// </summary>
public struct TokenRange
{
    /// <summary>The index of the first character of the range.</summary>
    public int Offset;

    /// <summary>The number of characters in the range.</summary>
    public int Size;

    /// <summary>
    /// Creates a range starting at <paramref name="offset"/> that spans <paramref name="size"/> characters.
    /// </summary>
    /// <param name="offset">the index of the first character</param>
    /// <param name="size">the number of characters</param>
    public TokenRange(int offset, int size)
    {
        Offset = offset;
        Size = size;
    }
}

View File

@@ -0,0 +1,322 @@
/*
* Copyright (c) 2014-2017, Eren Okka
* Copyright (c) 2016-2017, Paul Miller
* Copyright (c) 2017-2018, Tyler Bratton
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
using System.Text;
namespace AnitomySharp;
/// <summary>
/// A class that will tokenize an anime filename.
/// </summary>
public class Tokenizer
{
/// <summary>The filename being tokenized; token contents are substrings of it.</summary>
private readonly string _filename;
/// <summary>The element list handed to the keyword lookup while searching for known identifiers.</summary>
private readonly List<Element> _elements;
/// <summary>The parser options; supplies the set of allowed delimiter characters.</summary>
private readonly Options _options;
/// <summary>The output list that receives every token produced by this tokenizer.</summary>
private readonly List<Token> _tokens;
/// <summary>
/// The recognized bracket pairs, each as (opening bracket, closing bracket).
/// Every entry is a single character.
/// </summary>
private static readonly List<Tuple<string, string>> s_brackets =
[
    new Tuple<string, string>("(", ")"), // U+0028-U+0029 Parenthesis
    new Tuple<string, string>("[", "]"), // U+005B-U+005D Square bracket
    new Tuple<string, string>("{", "}"), // U+007B-U+007D Curly bracket
    new Tuple<string, string>("\u300C", "\u300D"), // Corner bracket
    new Tuple<string, string>("\u300E", "\u300F"), // White corner bracket (closer is U+300F, not the opener again)
    new Tuple<string, string>("\u3010", "\u3011"), // Black lenticular bracket
    new Tuple<string, string>("\uFF08", "\uFF09") // Fullwidth parenthesis
];
/// <summary>
/// Creates a tokenizer for a single filename. Nothing is tokenized until <see cref="Tokenize"/> is called.
/// </summary>
/// <param name="filename">the filename to split into tokens</param>
/// <param name="elements">the list of elements where pre-identified tokens will be added</param>
/// <param name="options">the parser options</param>
/// <param name="tokens">the list that receives the produced tokens</param>
public Tokenizer(string filename, List<Element> elements, Options options, List<Token> tokens)
{
    (_filename, _elements, _options, _tokens) = (filename, elements, options, tokens);
}
/// <summary>
/// Tokenizes the filename into the token list supplied to the constructor.
/// </summary>
/// <returns>true if the token list contains at least one token afterwards; false otherwise</returns>
public bool Tokenize()
{
    TokenizeByBrackets();

    bool hasTokens = _tokens.Count != 0;
    return hasTokens;
}
/// <summary>
/// Appends a token whose content is the part of the filename covered by <paramref name="range"/>.
/// </summary>
/// <param name="category">the token category</param>
/// <param name="enclosed">whether or not the token is enclosed in brackets</param>
/// <param name="range">the part of the filename that becomes the token content</param>
private void AddToken(Token.TokenCategory category, bool enclosed, TokenRange range)
{
    string content = StringHelper.SubstringWithCheck(_filename, range.Offset, range.Size);
    var token = new Token(category, content, enclosed);
    _tokens.Add(token);
}
/// <summary>
/// Collects the distinct delimiter characters that occur in <paramref name="range"/>, in order of first appearance.
/// A character counts as a delimiter when it is not alphanumeric and is listed in the options' allowed delimiters.
/// </summary>
/// <param name="range">the part of the filename to inspect; it is clamped to the end of the filename</param>
/// <returns>the distinct delimiters found, or an empty string when there are none</returns>
private string GetDelimiters(TokenRange range)
{
    var delimiters = new StringBuilder();
    // Tracks what has been collected already, so de-duplication does not need to
    // rebuild the accumulated string for every character.
    var seen = new HashSet<char>();
    int end = Math.Min(_filename.Length, range.Offset + range.Size);
    for (int i = range.Offset; i < end; i++)
    {
        char c = _filename[i];
        if (StringHelper.IsAlphanumericChar(c)) continue;
        if (!_options.AllowedDelimiters.Contains(c.ToString())) continue;
        if (seen.Add(c))
        {
            delimiters.Append(c);
        }
    }
    return delimiters.ToString();
}
/// <summary>
/// Splits the filename at brackets. The text between brackets is passed on to
/// <see cref="TokenizeByPreIdentified"/>, and every bracket character becomes its own bracket token.
/// Text after an opening bracket is treated as enclosed until the matching closing bracket is found.
/// </summary>
private void TokenizeByBrackets()
{
    string matchingBracket = string.Empty;
    bool isBracketOpen = false;
    for (int i = 0; i < _filename.Length; )
    {
        // Outside a bracket look for any opening bracket; inside one look for its recorded closer.
        int foundIdx = isBracketOpen
            ? _filename.IndexOf(matchingBracket, i, StringComparison.Ordinal)
            : FindFirstBracket(i, _filename.Length);

        // The text run ends at the bracket, or at the end of the filename when there is no bracket.
        // The size is measured from i in both cases so the range never extends past the filename.
        int runEnd = foundIdx == -1 ? _filename.Length : foundIdx;
        var range = new TokenRange(i, runEnd - i);
        if (range.Size > 0)
        {
            // Check if our range contains any known anime identifiers
            TokenizeByPreIdentified(isBracketOpen, range);
        }

        if (foundIdx == -1)
        {
            break;
        }

        // mark as bracket
        AddToken(Token.TokenCategory.Bracket, true, new TokenRange(foundIdx, 1));
        isBracketOpen = !isBracketOpen;
        i = foundIdx + 1;
    }
    return;

    // Finds the first opening bracket in [start, end) and records its closing counterpart
    // in matchingBracket. Returns -1 when the span contains no opening bracket.
    int FindFirstBracket(int start, int end)
    {
        for (int index = start; index < end; index++)
        {
            foreach (var bracket in s_brackets)
            {
                // Every opening bracket is a single character.
                if (_filename[index] != bracket.Item1[0]) continue;
                matchingBracket = bracket.Item2;
                return index;
            }
        }
        return -1;
    }
}
/// <summary>
/// Tokenizes <paramref name="range"/> around known anime identifiers. Each pre-identified span reported by
/// <c>KeywordManager.PeekAndAdd</c> becomes an identifier token; the text before, between and after those spans
/// is passed to <see cref="TokenizeByDelimiters"/>.
/// </summary>
/// <param name="enclosed">whether or not the current <code>range</code> is enclosed in brackets</param>
/// <param name="range">the token range</param>
private void TokenizeByPreIdentified(bool enclosed, TokenRange range)
{
    var preIdentifiedTokens = new List<TokenRange>();
    // Find known anime identifiers
    KeywordManager.PeekAndAdd(_filename, range, _elements, preIdentifiedTokens);
    int offset = range.Offset;
    // The not-yet-tokenized text that precedes the current offset.
    var subRange = new TokenRange(range.Offset, 0);
    while (offset < range.Offset + range.Size)
    {
        foreach (var preIdentifiedToken in preIdentifiedTokens)
        {
            if (offset != preIdentifiedToken.Offset) continue;
            // Flush the plain text collected before the identifier.
            if (subRange.Size > 0)
            {
                TokenizeByDelimiters(enclosed, subRange);
            }
            AddToken(Token.TokenCategory.Identifier, enclosed, preIdentifiedToken);
            subRange.Offset = preIdentifiedToken.Offset + preIdentifiedToken.Size;
            offset = subRange.Offset - 1; // It's going to be incremented below
            // Only one identifier can start at a given offset. Stop here so the remaining
            // entries are not compared against the rewound offset while subRange.Size is stale.
            break;
        }
        subRange.Size = ++offset - subRange.Offset;
    }
    // Either there was no preidentified token range, or we're now about to process the tail of our current range
    if (subRange.Size > 0)
    {
        TokenizeByDelimiters(enclosed, subRange);
    }
}
/// <summary>
/// Tokenize by delimiters allowed in <see cref="Options"/>.AllowedDelimiters.
/// Text between delimiters becomes unknown tokens and each delimiter character becomes its own delimiter token.
/// When the range contains no delimiter, the whole range becomes a single unknown token.
/// </summary>
/// <param name="enclosed">whether or not the current <code>range</code> is enclosed in brackets</param>
/// <param name="range">the token range</param>
private void TokenizeByDelimiters(bool enclosed, TokenRange range)
{
    string delimiters = GetDelimiters(range);
    if (string.IsNullOrEmpty(delimiters))
    {
        AddToken(Token.TokenCategory.Unknown, enclosed, range);
        return;
    }

    char[] delimiterChars = delimiters.ToCharArray();
    // Never scan past the end of the filename, even if the range claims to extend further.
    // Otherwise a delimiter in the last position is followed by one more iteration that
    // adds a token for a span starting at the end of the filename.
    int end = Math.Min(range.Offset + range.Size, _filename.Length);
    for (int i = range.Offset; i < end;)
    {
        // Position of the next delimiter, or `end` when the rest of the range has none.
        int found = _filename.IndexOfAny(delimiterChars, i, end - i);
        if (found == -1)
        {
            found = end;
        }

        var subRange = new TokenRange(i, found - i);
        if (subRange.Size > 0)
        {
            AddToken(Token.TokenCategory.Unknown, enclosed, subRange);
        }

        if (found == end)
        {
            break;
        }

        AddToken(Token.TokenCategory.Delimiter, enclosed, new TokenRange(found, 1));
        i = found + 1;
    }
    ValidateDelimiterTokens();
}
/// <summary>
/// Validates tokens (make sure certain words delimited by certain tokens aren't split).
/// Walks every delimiter token in the list and, where the surrounding tokens indicate that the delimiter is
/// really part of a word, merges it (and possibly its neighbours) into the preceding token. Merged tokens are
/// marked invalid and removed at the end.
/// </summary>
private void ValidateDelimiterTokens()
{
    for (int i = 0; i < _tokens.Count; i++)
    {
        var token = _tokens[i];
        if (token.Category != Token.TokenCategory.Delimiter) continue;
        char delimiter = token.Content[0];
        // Nearest neighbours that are still valid. prevToken is -1 and nextToken is
        // _tokens.Count when there is no such neighbour.
        int prevToken = Token.FindPrevToken(_tokens, i, Token.TokenFlag.FlagValid);
        int nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid);
        // Check for single-character tokens to prevent splitting group names,
        // keywords, episode numbers, etc.
        if (delimiter != ' ' && delimiter != '_')
        {
            // Single character token
            if (IsSingleCharacterToken(prevToken))
            {
                AppendTokenTo(token, _tokens[prevToken]);
                while (IsUnknownToken(nextToken))
                {
                    AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);
                    // Everything merged so far is invalid now, so searching from i again
                    // lands on the first token that has not been merged yet.
                    nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid);
                    if (!IsDelimiterToken(nextToken) || _tokens[nextToken].Content[0] != delimiter) continue;
                    AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);
                    nextToken = Token.FindNextToken(_tokens, nextToken, Token.TokenFlag.FlagValid);
                }
                continue;
            }
            // Merging needs a preceding token to merge into. When the delimiter is the first
            // valid token (prevToken == -1) there is none, so leave the tokens untouched
            // instead of indexing the list out of range.
            if (IsSingleCharacterToken(nextToken) && Token.InListRange(prevToken, _tokens))
            {
                AppendTokenTo(token, _tokens[prevToken]);
                AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);
                continue;
            }
        }
        // Check for adjacent delimiters
        if (IsUnknownToken(prevToken) && IsDelimiterToken(nextToken))
        {
            char nextDelimiter = _tokens[nextToken].Content[0];
            if (delimiter != nextDelimiter && delimiter != ',')
            {
                if (nextDelimiter is ' ' or '_')
                {
                    AppendTokenTo(token, _tokens[prevToken]);
                }
            }
        }
        else if (IsDelimiterToken(prevToken) && IsDelimiterToken(nextToken))
        {
            char prevDelimiter = _tokens[prevToken].Content[0];
            char nextDelimiter = _tokens[nextToken].Content[0];
            if (prevDelimiter == nextDelimiter && prevDelimiter != delimiter)
            {
                token.Category = Token.TokenCategory.Unknown; // e.g. "&" in "_&_"
            }
        }
        // Check for other special cases
        if (delimiter != '&' && delimiter != '+') continue;
        if (!IsUnknownToken(prevToken) || !IsUnknownToken(nextToken)) continue;
        if (!StringHelper.IsNumericString(_tokens[prevToken].Content)
            || !StringHelper.IsNumericString(_tokens[nextToken].Content)) continue;
        AppendTokenTo(token, _tokens[prevToken]);
        AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); // e.g. 01+02
    }
    // Remove invalid tokens
    _tokens.RemoveAll(token => token.Category == Token.TokenCategory.Invalid);
    return;

    // Moves the content of src to the end of dest and marks src for removal.
    void AppendTokenTo(Token src, Token dest)
    {
        dest.Content += src.Content;
        src.Category = Token.TokenCategory.Invalid;
    }

    // True when `it` is a valid index and the token there is of the Unknown category.
    bool IsUnknownToken(int it)
    {
        return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Unknown;
    }

    // True for an Unknown token that is exactly one character long and is not '-'.
    bool IsSingleCharacterToken(int it)
    {
        return IsUnknownToken(it) && _tokens[it].Content.Length == 1 && _tokens[it].Content[0] != '-';
    }

    // True when `it` is a valid index and the token there is of the Delimiter category.
    bool IsDelimiterToken(int it)
    {
        return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Delimiter;
    }
}
}