init: repo
This commit is contained in:
96
Chiara/AnitomySharp/Anitomy.cs
Normal file
96
Chiara/AnitomySharp/Anitomy.cs
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// A library capable of parsing anime filenames.
/// This code is a C++ to C# port of <see href="https://github.com/erengy/anitomy">Anitomy</see>,
/// using the already existing Java port <see href="https://github.com/Vorror/anitomyJ">AnitomyJ</see> as a reference.
/// </summary>
public class Anitomy
{
    /// <summary>Longest extension (in characters, dot excluded) that is still treated as a file extension.</summary>
    private const int MaxExtensionLength = 4;

    private Anitomy() {}

    /// <summary>
    /// Parses an anime <paramref name="filename"/> into its constituent elements.
    /// </summary>
    /// <param name="filename">the anime file name; a null or empty name yields an empty result</param>
    /// <param name="options">the options to parse with, use <see cref="Parse(string)"/> to use default options</param>
    /// <returns>the list of parsed elements</returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c></exception>
    public static IEnumerable<Element> Parse(string filename, Options options)
    {
        ArgumentNullException.ThrowIfNull(options);

        List<Element> elements = [];
        List<Token> tokens = [];

        // Strip a known extension first so it is not tokenized as part of the name.
        if (options.ParseFileExtension
            && RemoveExtensionFromFilename(ref filename, out string extension))
        {
            elements.Add(new Element(ElementCategory.ElementFileExtension, extension));
        }

        if (string.IsNullOrEmpty(filename))
        {
            return elements;
        }

        elements.Add(new Element(ElementCategory.ElementFileName, filename));

        bool isTokenized = new Tokenizer(filename, elements, options, tokens).Tokenize();
        if (!isTokenized)
        {
            return elements;
        }

        new Parser(elements, options, tokens).Parse();
        return elements;
    }

    /// <summary>
    /// Parses an anime <paramref name="filename"/> into its constituent elements using the default options.
    /// </summary>
    /// <param name="filename">the anime file name</param>
    /// <returns>the list of parsed elements</returns>
    public static IEnumerable<Element> Parse(string filename)
    {
        return Parse(filename, new Options());
    }

    /// <summary>
    /// Removes the extension from the <paramref name="filename"/> when the text after the last dot
    /// is a registered <see cref="ElementCategory.ElementFileExtension"/> keyword.
    /// </summary>
    /// <param name="filename">the ref that will be updated with the new filename; untouched on failure</param>
    /// <param name="extension">receives the file extension on success, an empty string otherwise</param>
    /// <returns>if the extension was successfully separated from the filename</returns>
    private static bool RemoveExtensionFromFilename(ref string filename, out string extension)
    {
        extension = string.Empty;

        if (string.IsNullOrEmpty(filename))
        {
            return false;
        }

        int dotIndex = filename.LastIndexOf('.');
        if (dotIndex == -1)
        {
            return false;
        }

        string candidate = filename[(dotIndex + 1)..];
        if (candidate.Length > MaxExtensionLength || !candidate.All(char.IsLetterOrDigit))
        {
            return false;
        }

        string keyword = KeywordManager.Normalize(candidate);
        if (!KeywordManager.Contains(ElementCategory.ElementFileExtension, keyword))
        {
            return false;
        }

        // Only publish results once every check has passed.
        extension = candidate;
        filename = filename[..dotIndex];
        return true;
    }
}
|
9
Chiara/AnitomySharp/AnitomySharp.csproj
Normal file
9
Chiara/AnitomySharp/AnitomySharp.csproj
Normal file
@@ -0,0 +1,9 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
</Project>
|
90
Chiara/AnitomySharp/Element.cs
Normal file
90
Chiara/AnitomySharp/Element.cs
Normal file
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// An <see cref="Element"/> represents an identified Anime <see cref="Token"/>.
/// A single filename may contain multiple of the same
/// token (e.g. <see cref="ElementCategory.ElementEpisodeNumber"/>).
/// </summary>
public class Element
{
    /// <summary>The category this element was classified as.</summary>
    public ElementCategory Category { get; set; }

    /// <summary>The text of the element.</summary>
    public string Value { get; }

    /// <summary>
    /// Constructs a new Element
    /// </summary>
    /// <param name="category">the category of the element</param>
    /// <param name="value">the element's value</param>
    public Element(ElementCategory category, string value)
    {
        Category = category;
        Value = value;
    }

    /// <summary>
    /// Hash code derived from the same members that <see cref="Equals(object?)"/> compares, so that
    /// equal elements always hash alike. Note that <see cref="Category"/> is mutable: changing it
    /// while the element is stored in a hash-based collection changes its hash code.
    /// </summary>
    public override int GetHashCode()
    {
        return HashCode.Combine(Category, Value);
    }

    /// <summary>
    /// Two elements are equal when they have the same runtime type, the same
    /// <see cref="Category"/> and the same <see cref="Value"/> (ordinal comparison).
    /// </summary>
    /// <param name="obj">the object to compare with</param>
    /// <returns><c>true</c> if <paramref name="obj"/> is an equal element</returns>
    public override bool Equals(object? obj)
    {
        if (ReferenceEquals(this, obj))
        {
            return true;
        }

        if (obj is null || GetType() != obj.GetType())
        {
            return false;
        }

        var other = (Element) obj;
        return Category == other.Category
               && string.Equals(Value, other.Value, StringComparison.Ordinal);
    }

    /// <summary>Returns a debugging representation containing the category and the value.</summary>
    public override string ToString()
    {
        return $"Element{{category={Category}, value='{Value}'}}";
    }
}

/// <summary>
/// Element categories. The declaration order defines the underlying numeric values.
/// </summary>
public enum ElementCategory
{
    ElementAnimeSeason,
    ElementAnimeSeasonPrefix,
    ElementAnimeTitle,
    ElementAnimeType,
    ElementAnimeYear,
    ElementAudioTerm,
    ElementDeviceCompatibility,
    ElementEpisodeNumber,
    ElementEpisodeNumberAlt,
    ElementEpisodePrefix,
    ElementEpisodeTitle,
    ElementFileChecksum,
    ElementFileExtension,
    ElementFileName,
    ElementLanguage,
    ElementOther,
    ElementReleaseGroup,
    ElementReleaseInformation,
    ElementReleaseVersion,
    ElementSource,
    ElementSubtitles,
    ElementVideoResolution,
    ElementVideoTerm,
    ElementVolumeNumber,
    ElementVolumePrefix,
    ElementUnknown
}
|
279
Chiara/AnitomySharp/Keyword.cs
Normal file
279
Chiara/AnitomySharp/Keyword.cs
Normal file
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// Manages the list of known anime keywords. This class is analogous to <c>keyword.cpp</c> of Anitomy,
/// and <c>KeywordManager.java</c> of AnitomyJ.
/// </summary>
public static class KeywordManager
{
    /// <summary>Keywords of every category except <see cref="ElementCategory.ElementFileExtension"/>.</summary>
    private static readonly Dictionary<string, Keyword> s_keys = new();

    /// <summary>Keywords of the <see cref="ElementCategory.ElementFileExtension"/> category.</summary>
    private static readonly Dictionary<string, Keyword> s_extensions = new();

    /// <summary>Case-sensitive phrases that <see cref="PeekAndAdd"/> looks for in the raw filename.</summary>
    private static readonly List<Tuple<ElementCategory, List<string>>> s_peekEntries;

    static KeywordManager()
    {
        var optionsDefault = new KeywordOptions();
        var optionsInvalid = new KeywordOptions(identifiable: true, searchable: true, valid: false);
        var optionsUnidentifiable = new KeywordOptions(identifiable: false, searchable: true, valid: true);
        var optionsUnidentifiableInvalid = new KeywordOptions(identifiable: false, searchable: true, valid: false);
        var optionsUnidentifiableUnsearchable = new KeywordOptions(identifiable: false, searchable: false, valid: true);

        Add(ElementCategory.ElementAnimeSeasonPrefix, optionsUnidentifiable,
            ["SAISON", "SEASON"]);

        Add(ElementCategory.ElementAnimeType, optionsUnidentifiable,
            ["GEKIJOUBAN", "MOVIE", "OAD", "OAV", "ONA", "OVA", "SPECIAL", "SPECIALS", "TV"]);

        // e.g. "Yumeiro Patissiere SP Professional"
        Add(ElementCategory.ElementAnimeType, optionsUnidentifiableUnsearchable,
            ["SP"]);

        Add(ElementCategory.ElementAnimeType, optionsUnidentifiableInvalid,
            ["ED", "ENDING", "NCED", "NCOP", "OP", "OPENING", "PREVIEW", "PV"]);

        Add(ElementCategory.ElementAudioTerm, optionsDefault,
        [
            // Audio channels
            "2.0CH", "2CH", "5.1", "5.1CH", "DTS", "DTS-ES", "DTS5.1",
            "TRUEHD5.1",
            // Audio codec
            "AAC", "AACX2", "AACX3", "AACX4", "AC3", "EAC3", "E-AC-3",
            "FLAC", "FLACX2", "FLACX3", "FLACX4", "LOSSLESS", "MP3", "OGG", "VORBIS",
            // Audio language
            "DUALAUDIO", "DUAL AUDIO"
        ]);

        Add(ElementCategory.ElementDeviceCompatibility, optionsDefault,
            ["IPAD3", "IPHONE5", "IPOD", "PS3", "XBOX", "XBOX360"]);

        Add(ElementCategory.ElementDeviceCompatibility, optionsUnidentifiable,
            ["ANDROID"]);

        Add(ElementCategory.ElementEpisodePrefix, optionsDefault,
            ["EP", "EP.", "EPS", "EPS.", "EPISODE", "EPISODE.", "EPISODES", "CAPITULO", "EPISODIO", "FOLGE"]);

        // Single-letter episode keywords are not valid tokens.
        // "\u7B2C" is the single character U+7B2C; the former "\\x7B2C" was the literal text
        // backslash-x-7-B-2-C and therefore never matched a token.
        Add(ElementCategory.ElementEpisodePrefix, optionsInvalid,
            ["E", "\u7B2C"]);

        Add(ElementCategory.ElementFileExtension, optionsDefault,
            ["3GP", "AVI", "DIVX", "FLV", "M2TS", "MKV", "MOV", "MP4", "MPG", "OGM", "RM", "RMVB", "TS", "WEBM", "WMV"]);

        Add(ElementCategory.ElementFileExtension, optionsInvalid,
            ["AAC", "AIFF", "FLAC", "M4A", "MP3", "MKA", "OGG", "WAV", "WMA", "7Z", "RAR", "ZIP", "ASS", "SRT"]);

        Add(ElementCategory.ElementLanguage, optionsDefault,
            ["ENG", "ENGLISH", "ESPANOL", "JAP", "PT-BR", "SPANISH", "VOSTFR"]);

        // e.g. "Tokyo ESP", "Bokura ga Ita"
        Add(ElementCategory.ElementLanguage, optionsUnidentifiable,
            ["ESP", "ITA"]);

        Add(ElementCategory.ElementOther, optionsDefault,
            ["REMASTER", "REMASTERED", "UNCENSORED", "UNCUT", "TS", "VFR", "WIDESCREEN", "WS"]);

        Add(ElementCategory.ElementReleaseGroup, optionsDefault,
            ["THORA"]);

        Add(ElementCategory.ElementReleaseInformation, optionsDefault,
            ["BATCH", "COMPLETE", "PATCH", "REMUX"]);

        // e.g. "The End of Evangelion", "Final Approach"
        Add(ElementCategory.ElementReleaseInformation, optionsUnidentifiable,
            ["END", "FINAL"]);

        Add(ElementCategory.ElementReleaseVersion, optionsDefault,
            ["V0", "V1", "V2", "V3", "V4"]);

        Add(ElementCategory.ElementSource, optionsDefault,
        [
            "BD", "BDRIP", "BLURAY", "BLU-RAY", "DVD", "DVD5", "DVD9", "DVD-R2J", "DVDRIP", "DVD-RIP",
            "R2DVD", "R2J", "R2JDVD", "R2JDVDRIP", "HDTV", "HDTVRIP", "TVRIP", "TV-RIP", "WEBCAST", "WEBRIP"
        ]);

        Add(ElementCategory.ElementSubtitles, optionsDefault,
        [
            "ASS", "BIG5", "DUB", "DUBBED", "HARDSUB", "HARDSUBS", "RAW", "SOFTSUB", "SOFTSUBS",
            "SUB", "SUBBED", "SUBTITLED"
        ]);

        Add(ElementCategory.ElementVideoTerm, optionsDefault,
        [
            // Frame rate
            "23.976FPS", "24FPS", "29.97FPS", "30FPS", "60FPS", "120FPS",
            // Video codec
            "8BIT", "8-BIT", "10BIT", "10BITS", "10-BIT", "10-BITS",
            "HI10", "HI10P", "HI444", "HI444P", "HI444PP",
            "H264", "H265", "H.264", "H.265", "X264", "X265", "X.264",
            "AVC", "HEVC", "HEVC2", "DIVX", "DIVX5", "DIVX6", "XVID",
            // Video format
            "AVI", "RMVB", "WMV", "WMV3", "WMV9",
            // Video quality
            "HQ", "LQ",
            // Video resolution
            "HD", "SD"
        ]);

        Add(ElementCategory.ElementVolumePrefix, optionsDefault,
            ["VOL", "VOL.", "VOLUME"]);

        s_peekEntries = new List<Tuple<ElementCategory, List<string>>>
        {
            Tuple.Create(ElementCategory.ElementAudioTerm, new List<string> { "Dual Audio" }),
            Tuple.Create(ElementCategory.ElementVideoTerm, new List<string> { "H264", "H.264", "h264", "h.264" }),
            Tuple.Create(ElementCategory.ElementVideoResolution, new List<string> { "480p", "720p", "1080p" }),
            Tuple.Create(ElementCategory.ElementSource, new List<string> { "Blu-Ray" })
        };
    }

    /// <summary>
    /// Converts <paramref name="word"/> to the upper-case (invariant culture) form in which keywords are registered.
    /// </summary>
    /// <param name="word">the word to normalize</param>
    /// <returns>the upper-cased word, or <paramref name="word"/> itself when it is null or empty</returns>
    public static string Normalize(string word)
    {
        return string.IsNullOrEmpty(word) ? word : word.ToUpperInvariant();
    }

    /// <summary>
    /// Checks whether <paramref name="keyword"/> is registered under <paramref name="category"/>.
    /// The keyword is looked up as given; callers are expected to pass it through <see cref="Normalize"/> first.
    /// </summary>
    /// <param name="category">the category the keyword must belong to</param>
    /// <param name="keyword">the keyword to look up</param>
    /// <returns>if the keyword exists in that category</returns>
    public static bool Contains(ElementCategory category, string keyword)
    {
        return GetKeywordContainer(category).TryGetValue(keyword, out var foundEntry)
               && foundEntry.Category == category;
    }

    /// <summary>
    /// Finds a particular <c>keyword</c>. If found sets <c>category</c> and <c>options</c> to the found search result.
    /// </summary>
    /// <param name="keyword">the keyword to search for</param>
    /// <param name="category">
    /// the reference that will be set/changed to the found keyword category; when it is not
    /// <see cref="ElementCategory.ElementUnknown"/> on entry, the keyword must belong to exactly that category
    /// </param>
    /// <param name="options">the reference that will be set/changed to the found keyword options</param>
    /// <returns>if the keyword was found</returns>
    public static bool FindAndSet(string keyword, ref ElementCategory category, ref KeywordOptions options)
    {
        Dictionary<string, Keyword> keys = GetKeywordContainer(category);
        if (!keys.TryGetValue(keyword, out var foundEntry))
        {
            return false;
        }

        if (category == ElementCategory.ElementUnknown)
        {
            category = foundEntry.Category;
        }
        else if (foundEntry.Category != category)
        {
            return false;
        }

        options = foundEntry.Options;
        return true;
    }

    /// <summary>
    /// Given a particular <c>filename</c> and <c>range</c> attempt to pre-identify tokens before the main parsing logic runs.
    /// </summary>
    /// <param name="filename">the filename</param>
    /// <param name="range">the search range; it is clamped to the end of <paramref name="filename"/></param>
    /// <param name="elements">elements array that any pre-identified elements will be added to</param>
    /// <param name="preIdentifiedTokens">elements array that any pre-identified token ranges will be added to</param>
    public static void PeekAndAdd(string filename, TokenRange range, List<Element> elements, List<TokenRange> preIdentifiedTokens)
    {
        int searchEnd = Math.Min(range.Offset + range.Size, filename.Length);
        string search = filename.Substring(range.Offset, searchEnd - range.Offset);

        foreach (var entry in s_peekEntries)
        {
            foreach (string keyword in entry.Item2)
            {
                // Ordinal: the phrases are fixed technical terms, and the resulting index is used
                // as a character offset, so culture-specific matching rules must not apply.
                int foundIdx = search.IndexOf(keyword, StringComparison.Ordinal);
                if (foundIdx == -1)
                {
                    continue;
                }

                elements.Add(new Element(entry.Item1, keyword));
                preIdentifiedTokens.Add(new TokenRange(foundIdx + range.Offset, keyword.Length));
            }
        }
    }

    // Private API

    /// <summary>Returns the appropriate keyword container.</summary>
    private static Dictionary<string, Keyword> GetKeywordContainer(ElementCategory category)
    {
        return category == ElementCategory.ElementFileExtension ? s_extensions : s_keys;
    }

    /// <summary>
    /// Adds a <c>category</c>, <c>options</c>, and <c>keywords</c> to the internal keywords list.
    /// Null/empty keywords are skipped and an already registered keyword keeps its first registration.
    /// </summary>
    private static void Add(ElementCategory category, KeywordOptions options, IEnumerable<string> keywords)
    {
        var keys = GetKeywordContainer(category);
        foreach (string key in keywords)
        {
            if (!string.IsNullOrEmpty(key))
            {
                keys.TryAdd(key, new Keyword(category, options));
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// The three flags attached to a registered keyword.
/// </summary>
public class KeywordOptions
{
    /// <summary>Whether the token is identifiable.</summary>
    public bool Identifiable { get; }

    /// <summary>Whether the token is searchable.</summary>
    public bool Searchable { get; }

    /// <summary>Whether the token is valid.</summary>
    public bool Valid { get; }

    /// <summary>
    /// Creates options with every flag switched on.
    /// </summary>
    public KeywordOptions()
    {
        Identifiable = true;
        Searchable = true;
        Valid = true;
    }

    /// <summary>
    /// Creates options with explicitly chosen flags.
    /// </summary>
    /// <param name="identifiable">if the token is identifiable</param>
    /// <param name="searchable">if the token is searchable</param>
    /// <param name="valid">if the token is valid</param>
    public KeywordOptions(bool identifiable, bool searchable, bool valid) =>
        (Identifiable, Searchable, Valid) = (identifiable, searchable, valid);
}
|
||||
|
||||
/// <summary>
/// Pairs an element category with the options under which a keyword was registered.
/// </summary>
public struct Keyword
{
    /// <summary>The category the keyword belongs to.</summary>
    public readonly ElementCategory Category;

    /// <summary>The options attached to the keyword.</summary>
    public readonly KeywordOptions Options;

    /// <summary>
    /// Creates a keyword entry.
    /// </summary>
    /// <param name="category">the category of the keyword</param>
    /// <param name="options">the keyword's options</param>
    public Keyword(ElementCategory category, KeywordOptions options) =>
        (Category, Options) = (category, options);
}
|
28
Chiara/AnitomySharp/Options.cs
Normal file
28
Chiara/AnitomySharp/Options.cs
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// Immutable set of switches controlling an AnitomySharp parse run.
/// </summary>
public class Options
{
    /// <summary>
    /// Creates a new set of options; every argument is optional.
    /// </summary>
    /// <param name="delimiters">value for <see cref="AllowedDelimiters"/></param>
    /// <param name="episode">value for <see cref="ParseEpisodeNumber"/></param>
    /// <param name="title">value for <see cref="ParseEpisodeTitle"/></param>
    /// <param name="extension">value for <see cref="ParseFileExtension"/></param>
    /// <param name="group">value for <see cref="ParseReleaseGroup"/></param>
    public Options(
        string delimiters = " _.&+,|",
        bool episode = true,
        bool title = true,
        bool extension = true,
        bool group = true)
    {
        AllowedDelimiters = delimiters;
        ParseEpisodeNumber = episode;
        ParseEpisodeTitle = title;
        ParseFileExtension = extension;
        ParseReleaseGroup = group;
    }

    /// <summary>The delimiter characters passed to the constructor.</summary>
    public string AllowedDelimiters { get; }

    /// <summary>Whether the episode number search is run.</summary>
    public bool ParseEpisodeNumber { get; }

    /// <summary>Whether the episode title search is run.</summary>
    public bool ParseEpisodeTitle { get; }

    /// <summary>Whether the file extension is split off and reported.</summary>
    public bool ParseFileExtension { get; }

    /// <summary>Whether the release group search is run.</summary>
    public bool ParseReleaseGroup { get; }
}
|
430
Chiara/AnitomySharp/Parser.cs
Normal file
430
Chiara/AnitomySharp/Parser.cs
Normal file
@@ -0,0 +1,430 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
|
||||
/// Class to classify <see cref="Token"/>s
|
||||
/// </summary>
|
||||
public class Parser
|
||||
{
|
||||
public bool IsEpisodeKeywordsFound { get; private set; }
|
||||
public ParserHelper ParseHelper { get; }
|
||||
public ParserNumber ParseNumber { get; }
|
||||
public List<Element> Elements { get; }
|
||||
public List<Token> Tokens { get; }
|
||||
private Options Options { get; }
|
||||
|
||||
/// <summary>
/// Creates a parser that works on an existing token list and writes its findings to <paramref name="elements"/>.
/// </summary>
/// <param name="elements">the list where parsed elements will be added</param>
/// <param name="options">the parser options</param>
/// <param name="tokens">the list of tokens</param>
public Parser(List<Element> elements, Options options, List<Token> tokens)
{
    // Store the state first: both helpers are handed this instance.
    Tokens = tokens;
    Options = options;
    Elements = elements;

    ParseHelper = new ParserHelper(this);
    ParseNumber = new ParserNumber(this);
}
|
||||
|
||||
/// <summary>
/// Runs the search passes over the tokens in a fixed order: keywords, isolated numbers,
/// episode number (optional), anime title, release group (optional, only when none was found
/// yet), episode title (optional, only when an episode number exists), and finally validation.
/// </summary>
/// <returns><c>true</c> if an anime title element was found; otherwise <c>false</c></returns>
public bool Parse()
{
    SearchForKeywords();
    SearchForIsolatedNumbers();

    if (Options.ParseEpisodeNumber)
    {
        SearchForEpisodeNumber();
    }

    SearchForAnimeTitle();

    if (Options.ParseReleaseGroup && Empty(ElementCategory.ElementReleaseGroup))
    {
        SearchForReleaseGroup();
    }

    if (Options.ParseEpisodeTitle && !Empty(ElementCategory.ElementEpisodeNumber))
    {
        SearchForEpisodeTitle();
    }

    ValidateElements();

    // Success means a title exists, i.e. the title category is NOT empty.
    return !Empty(ElementCategory.ElementAnimeTitle);
}
|
||||
|
||||
/// <summary>
/// Walks over all unknown tokens and turns those that are registered keywords, CRC32 checksums
/// or video resolutions into elements.
/// </summary>
private void SearchForKeywords()
{
    for (int index = 0; index < Tokens.Count; index++)
    {
        var current = Tokens[index];
        if (current.Category != Token.TokenCategory.Unknown)
        {
            continue;
        }

        string text = current.Content.Trim(' ', '-');
        if (string.IsNullOrEmpty(text))
        {
            continue;
        }

        // A purely numeric word is only worth a look when it has the 8 characters of a CRC
        if (text.Length != 8 && StringHelper.IsNumericString(text))
        {
            continue;
        }

        string normalized = KeywordManager.Normalize(text);
        var category = ElementCategory.ElementUnknown;
        var keywordOptions = new KeywordOptions();

        if (KeywordManager.FindAndSet(normalized, ref category, ref keywordOptions))
        {
            if (!Options.ParseReleaseGroup && category == ElementCategory.ElementReleaseGroup)
            {
                continue;
            }

            if (!(ParserHelper.IsElementCategorySearchable(category) && keywordOptions.Searchable))
            {
                continue;
            }

            if (ParserHelper.IsElementCategorySingular(category) && !Empty(category))
            {
                continue;
            }

            // Prefix keywords are delegated to the helper and never stored themselves
            if (category == ElementCategory.ElementAnimeSeasonPrefix)
            {
                ParseHelper.CheckAndSetAnimeSeasonKeyword(current, index);
                continue;
            }

            if (category == ElementCategory.ElementEpisodePrefix && keywordOptions.Valid)
            {
                ParseHelper.CheckExtentKeyword(ElementCategory.ElementEpisodeNumber, index, current);
                continue;
            }

            if (category == ElementCategory.ElementVolumePrefix)
            {
                ParseHelper.CheckExtentKeyword(ElementCategory.ElementVolumeNumber, index, current);
                continue;
            }

            if (category == ElementCategory.ElementReleaseVersion)
            {
                // Drop the first character of the version keyword (e.g. "V2" is stored as "2")
                text = text[1..];
            }
        }
        else if (Empty(ElementCategory.ElementFileChecksum) && ParserHelper.IsCrc32(text))
        {
            category = ElementCategory.ElementFileChecksum;
        }
        else if (Empty(ElementCategory.ElementVideoResolution) && ParserHelper.IsResolution(text))
        {
            category = ElementCategory.ElementVideoResolution;
        }

        if (category == ElementCategory.ElementUnknown)
        {
            continue;
        }

        Elements.Add(new Element(category, text));

        if (keywordOptions.Identifiable)
        {
            current.Category = Token.TokenCategory.Identifier;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Tries a sequence of increasingly speculative strategies to find the episode number and
/// stops at the first one that reports success.
/// </summary>
private void SearchForEpisodeNumber()
{
    // Indices of all unknown tokens that contain at least one digit
    List<int> candidates = Enumerable.Range(0, Tokens.Count)
        .Where(index => Tokens[index].Category == Token.TokenCategory.Unknown
                        && ParserHelper.IndexOfFirstDigit(Tokens[index].Content) != -1)
        .ToList();

    if (candidates.Count == 0)
    {
        return;
    }

    IsEpisodeKeywordsFound = !Empty(ElementCategory.ElementEpisodeNumber);

    // A token matching a known episode pattern has to be the episode number
    if (ParseNumber.SearchForEpisodePatterns(candidates))
    {
        return;
    }

    // An episode number was already found earlier via keywords
    if (!Empty(ElementCategory.ElementEpisodeNumber))
    {
        return;
    }

    // Only purely numeric tokens matter from here on
    candidates.RemoveAll(index => !StringHelper.IsNumericString(Tokens[index].Content));

    bool found =
        ParseNumber.SearchForEquivalentNumbers(candidates)    // e.g. "01 (176)", "29 (04)"
        || ParseNumber.SearchForSeparatedNumbers(candidates)  // e.g. " - 08"
        || ParseNumber.SearchForIsolatedNumbers(candidates);  // e.g. "[12]", "(2006)"

    if (!found)
    {
        // Last resort: consider the last number
        ParseNumber.SearchForLastNumber(candidates);
    }
}
|
||||
|
||||
/// <summary>
/// Determines the token interval that forms the anime title and builds an
/// <see cref="ElementCategory.ElementAnimeTitle"/> element from it.
/// </summary>
private void SearchForAnimeTitle()
{
    // Preferred starting point: the first unknown token outside of any brackets
    int titleStart = Token.FindToken(Tokens, 0, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
        Token.TokenFlag.FlagUnknown);
    bool titleIsEnclosed = !Token.InListRange(titleStart, Tokens);

    if (titleIsEnclosed)
    {
        // Otherwise take the first unknown token of the second enclosed group,
        // assuming that the first group is the release group
        titleStart = 0;
        bool groupSkipped = false;

        while (true)
        {
            titleStart = Token.FindToken(Tokens, titleStart, Tokens.Count, Token.TokenFlag.FlagUnknown);
            if (!Token.InListRange(titleStart, Tokens))
            {
                break;
            }

            // Ignore groups that are composed of non-Latin characters
            if (StringHelper.IsMostlyLatinString(Tokens[titleStart].Content) && groupSkipped)
            {
                break;
            }

            // Move on to the first unknown token of the next group
            titleStart = Token.FindToken(Tokens, titleStart, Tokens.Count, Token.TokenFlag.FlagBracket);
            titleStart = Token.FindToken(Tokens, titleStart, Tokens.Count, Token.TokenFlag.FlagUnknown);
            groupSkipped = true;

            if (!Token.InListRange(titleStart, Tokens))
            {
                break;
            }
        }
    }

    if (!Token.InListRange(titleStart, Tokens))
    {
        return;
    }

    // The title runs until an identifier (or a bracket, if the title is enclosed)
    int titleEnd = Token.FindToken(
        Tokens,
        titleStart,
        Tokens.Count,
        Token.TokenFlag.FlagIdentifier,
        titleIsEnclosed ? Token.TokenFlag.FlagBracket : Token.TokenFlag.FlagNone);

    if (!titleIsEnclosed)
    {
        // An open bracket without its matching pair inside the interval pulls the
        // upper endpoint back to that bracket
        int lastBracket = titleEnd;
        bool bracketOpen = false;
        for (int i = titleStart; i < titleEnd; i++)
        {
            if (Tokens[i].Category == Token.TokenCategory.Bracket)
            {
                lastBracket = i;
                bracketOpen = !bracketOpen;
            }
        }

        if (bracketOpen)
        {
            titleEnd = lastBracket;
        }

        // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"),
        // pull the upper endpoint back to the beginning of that group. Parentheses are
        // left alone in order to keep certain groups (e.g. "(TV)") intact.
        int cursor = Token.FindPrevToken(Tokens, titleEnd, Token.TokenFlag.FlagNotDelimiter);
        while (ParseHelper.IsTokenCategory(cursor, Token.TokenCategory.Bracket) && Tokens[cursor].Content[0] != ')')
        {
            cursor = Token.FindPrevToken(Tokens, cursor, Token.TokenFlag.FlagBracket);
            if (Token.InListRange(cursor, Tokens))
            {
                titleEnd = cursor;
                cursor = Token.FindPrevToken(Tokens, titleEnd, Token.TokenFlag.FlagNotDelimiter);
            }
        }
    }

    ParseHelper.BuildElement(ElementCategory.ElementAnimeTitle, false,
        Tokens.GetRange(titleStart, titleEnd - titleStart));
}
|
||||
|
||||
/// <summary>
/// Looks for an enclosed run of unknown tokens that starts its group and is terminated by a
/// bracket, and builds an <see cref="ElementCategory.ElementReleaseGroup"/> element from the first such run.
/// </summary>
private void SearchForReleaseGroup()
{
    int groupStart = 0;
    int groupEnd = 0;

    while (groupStart < Tokens.Count)
    {
        // First enclosed unknown token at or after the previous end
        groupStart = Token.FindToken(Tokens, groupEnd, Tokens.Count, Token.TokenFlag.FlagEnclosed,
            Token.TokenFlag.FlagUnknown);
        if (!Token.InListRange(groupStart, Tokens))
        {
            return;
        }

        // The run continues until a bracket or an identifier shows up
        groupEnd = Token.FindToken(Tokens, groupStart, Tokens.Count, Token.TokenFlag.FlagBracket,
            Token.TokenFlag.FlagIdentifier);
        bool closedByBracket = Token.InListRange(groupEnd, Tokens)
                               && Tokens[groupEnd].Category == Token.TokenCategory.Bracket;
        if (!closedByBracket)
        {
            continue;
        }

        // Skip the run unless it is the first non-delimiter content of its group
        int previous = Token.FindPrevToken(Tokens, groupStart, Token.TokenFlag.FlagNotDelimiter);
        bool precededByOtherContent = Token.InListRange(previous, Tokens)
                                      && Tokens[previous].Category != Token.TokenCategory.Bracket;
        if (precededByOtherContent)
        {
            continue;
        }

        ParseHelper.BuildElement(ElementCategory.ElementReleaseGroup, true,
            Tokens.GetRange(groupStart, groupEnd - groupStart));
        return;
    }
}
|
||||
|
||||
/// <summary>
/// Builds an <see cref="ElementCategory.ElementEpisodeTitle"/> element from the first run of
/// non-enclosed unknown tokens that is more than just a dash.
/// </summary>
private void SearchForEpisodeTitle()
{
    int titleStart;
    int titleEnd = 0;

    do
    {
        // First non-enclosed unknown token at or after the previous end
        titleStart = Token.FindToken(Tokens, titleEnd, Tokens.Count, Token.TokenFlag.FlagNotEnclosed,
            Token.TokenFlag.FlagUnknown);
        if (!Token.InListRange(titleStart, Tokens))
        {
            return;
        }

        // The run continues until a bracket or an identifier shows up
        titleEnd = Token.FindToken(Tokens, titleStart, Tokens.Count, Token.TokenFlag.FlagBracket,
            Token.TokenFlag.FlagIdentifier);

        // A short run that begins with a dash is skipped and the search moves on
        bool isOnlyDash = titleEnd - titleStart <= 2
                          && ParserHelper.IsDashCharacter(Tokens[titleStart].Content[0]);
        if (!isOnlyDash)
        {
            ParseHelper.BuildElement(ElementCategory.ElementEpisodeTitle, false,
                Tokens.GetRange(titleStart, titleEnd - titleStart));
            return;
        }
    } while (Token.InListRange(titleStart, Tokens));
}
|
||||
|
||||
/// <summary>
/// Classifies isolated numeric tokens as the anime year or the video resolution
/// (480, 720 or 1080) when that category has not been filled yet.
/// </summary>
private void SearchForIsolatedNumbers()
{
    for (int index = 0; index < Tokens.Count; index++)
    {
        var current = Tokens[index];

        bool isCandidate = current.Category == Token.TokenCategory.Unknown
                           && StringHelper.IsNumericString(current.Content)
                           && ParseHelper.IsTokenIsolated(index);
        if (!isCandidate)
        {
            continue;
        }

        int number = StringHelper.StringToInt(current.Content);

        // Anime year
        bool inYearRange = number >= ParserNumber.AnimeYearMin && number <= ParserNumber.AnimeYearMax;
        if (inYearRange && Empty(ElementCategory.ElementAnimeYear))
        {
            Elements.Add(new Element(ElementCategory.ElementAnimeYear, current.Content));
            current.Category = Token.TokenCategory.Identifier;
            continue;
        }

        // Video resolution: when isolated, these numbers are more likely to be the video
        // resolution than the episode number. Some fansub groups use them without the "p" suffix.
        if (number is 480 or 720 or 1080 && Empty(ElementCategory.ElementVideoResolution))
        {
            Elements.Add(new Element(ElementCategory.ElementVideoResolution, current.Content));
            current.Category = Token.TokenCategory.Identifier;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Cross-checks the anime type elements against the episode title and removes whichever
/// of the two turns out to be a misdetection.
/// </summary>
/// <remarks>
/// Does nothing unless both an anime type and an episode title are present. For every
/// anime type element whose value occurs inside the episode title:
/// if the title is exactly as long as the type value, all episode title elements are
/// removed; otherwise, if the normalized value is a known anime type keyword, that
/// anime type element itself is removed.
/// </remarks>
private void ValidateElements()
{
    if (Empty(ElementCategory.ElementAnimeType) || Empty(ElementCategory.ElementEpisodeTitle))
    {
        return;
    }

    string episodeTitle = Get(ElementCategory.ElementEpisodeTitle);

    int i = 0;
    while (i < Elements.Count)
    {
        var el = Elements[i];

        if (el.Category != ElementCategory.ElementAnimeType || !episodeTitle.Contains(el.Value))
        {
            i++;
            continue;
        }

        if (episodeTitle.Length == el.Value.Length)
        {
            // Invalid episode title. Removing entries located before the cursor shifts the
            // current element to the left, so compensate before stepping forward.
            int removedBeforeCursor = 0;
            for (int j = 0; j < i; j++)
            {
                if (Elements[j].Category == ElementCategory.ElementEpisodeTitle)
                {
                    removedBeforeCursor++;
                }
            }

            Elements.RemoveAll(element => element.Category == ElementCategory.ElementEpisodeTitle);
            i = i - removedBeforeCursor + 1;
            continue;
        }

        string keyword = KeywordManager.Normalize(el.Value);
        if (KeywordManager.Contains(ElementCategory.ElementAnimeType, keyword))
        {
            // Invalid anime type: remove exactly this element (not merely the first one of
            // its category). The next element slides into position i, so do not advance.
            Elements.RemoveAt(i);
            continue;
        }

        i++;
    }
}
|
||||
|
||||
/// <summary>
/// Reports whether the parsed elements contain no element of the given category.
/// </summary>
/// <param name="category">the category to look for</param>
/// <returns>true when no element has that category; false when at least one does</returns>
private bool Empty(ElementCategory category)
{
    bool hasCategory = Elements.Any(candidate => candidate.Category == category);
    return !hasCategory;
}
|
||||
|
||||
/// <summary>
/// Returns the value of the first element of the given category.
/// </summary>
/// <remarks>
/// This is not a pure lookup: when no element of that category exists, a new element
/// with an empty value is appended to the element list and its value is returned.
/// </remarks>
/// <param name="category">the category to look up</param>
/// <returns>the value of the found (or newly appended) element</returns>
private string Get(ElementCategory category)
{
    var match = Elements.Find(candidate => candidate.Category == category);

    if (match == null)
    {
        // Nothing recorded yet: register an empty placeholder for the category.
        match = new Element(category, "");
        Elements.Add(match);
    }

    return match.Value;
}
|
||||
|
||||
/// <summary>
/// Removes the first element whose category equals <code>element.Category</code>.
/// </summary>
/// <param name="element">the element whose category selects what to remove</param>
/// <returns>the index the removed element occupied, or -1 when nothing matched</returns>
private int Erase(Element element)
{
    int position = Elements.FindIndex(candidate => element.Category == candidate.Category);

    if (position >= 0)
    {
        Elements.RemoveAt(position);
    }

    return position;
}
|
||||
}
|
281
Chiara/AnitomySharp/ParserHelper.cs
Normal file
281
Chiara/AnitomySharp/ParserHelper.cs
Normal file
@@ -0,0 +1,281 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
using System.Text;
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
|
||||
/// Utility class to assist in the parsing.
|
||||
/// </summary>
|
||||
public class ParserHelper(Parser parser)
|
||||
{
|
||||
/// <summary>Characters searched for as the width/height separator in <c>IsResolution</c>.</summary>
private static readonly System.Buffers.SearchValues<char> s_myChars = System.Buffers.SearchValues.Create("xX\u00D7");

/// <summary>Characters treated as dashes by <c>IsDashCharacter</c>.</summary>
private const string Dashes = "-\u2010\u2011\u2012\u2013\u2014\u2015";

/// <summary>The dash characters plus the space; trimmed from both ends of built elements.</summary>
private const string DashesWithSpace = " -\u2010\u2011\u2012\u2013\u2014\u2015";

/// <summary>Maps ordinal words and abbreviations (case-sensitive) to their digit.</summary>
private static readonly Dictionary<string, string> s_ordinals = new()
{
    ["1st"] = "1", ["First"] = "1",
    ["2nd"] = "2", ["Second"] = "2",
    ["3rd"] = "3", ["Third"] = "3",
    ["4th"] = "4", ["Fourth"] = "4",
    ["5th"] = "5", ["Fifth"] = "5",
    ["6th"] = "6", ["Sixth"] = "6",
    ["7th"] = "7", ["Seventh"] = "7",
    ["8th"] = "8", ["Eighth"] = "8",
    ["9th"] = "9", ["Ninth"] = "9"
};
|
||||
|
||||
/// <summary>
/// Checks that <code>result</code> is a valid index into the parser's token list and
/// that the token stored there has the given <code>category</code>.
/// </summary>
/// <param name="result">a token index, typically the result of a token search</param>
/// <param name="category">the category the token must have</param>
/// <returns>true if the index is in range and the categories match</returns>
public bool IsTokenCategory(int result, Token.TokenCategory category)
{
    if (!Token.InListRange(result, parser.Tokens))
    {
        return false;
    }

    return parser.Tokens[result].Category == category;
}
|
||||
|
||||
/// <summary>
/// Returns whether the <code>str</code> has the shape of a CRC32 checksum:
/// exactly eight characters accepted by <c>StringHelper.IsHexadecimalString</c>.
/// </summary>
public static bool IsCrc32(string str)
{
    const int crc32TextLength = 8;

    if (str.Length != crc32TextLength)
    {
        return false;
    }

    return StringHelper.IsHexadecimalString(str);
}
|
||||
|
||||
/// <summary>
/// Returns whether the character <code>c</code> is one of the dash characters
/// listed in <c>Dashes</c>.
/// </summary>
public static bool IsDashCharacter(char c)
{
    return Dashes.IndexOf(c) >= 0;
}
|
||||
|
||||
/// <summary>
/// Returns the digit for an ordinal (e.g. "2nd" or "Second" gives "2").
/// </summary>
/// <param name="str">the candidate ordinal; the lookup is case-sensitive</param>
/// <returns>the digit as a string, or an empty string when <code>str</code> is null, empty or not a known ordinal</returns>
private static string GetNumberFromOrdinal(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    return s_ordinals.TryGetValue(str, out var digit) ? digit : "";
}
|
||||
|
||||
/// <summary>
/// Returns the index of the first digit (as judged by <c>char.IsDigit</c>) in the
/// <code>str</code>; -1 when the string is null, empty or contains no digit.
/// </summary>
public static int IndexOfFirstDigit(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return -1;
    }

    int position = 0;
    foreach (char current in str)
    {
        if (char.IsDigit(current))
        {
            return position;
        }

        position++;
    }

    return -1;
}
|
||||
|
||||
/// <summary>
/// Returns whether the <code>str</code> looks like a video resolution.
/// </summary>
/// <remarks>
/// Two shapes are accepted, chosen by length:
/// seven or more characters must be digits around a single separator ('x', 'X' or U+00D7)
/// with at least three characters on each side; four to six characters must be digits
/// followed by 'p' or 'P'. Anything shorter is rejected.
/// </remarks>
public static bool IsResolution(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return false;
    }

    const int minWidthSize = 3;
    const int minHeightSize = 3;
    int length = str.Length;

    // "###x###" form
    if (length >= minWidthSize + 1 + minHeightSize)
    {
        int separatorIdx = str.AsSpan().IndexOfAny(s_myChars);

        // A missing separator (-1) is also rejected by the lower bound.
        bool separatorWellPlaced = separatorIdx >= minWidthSize
                                   && separatorIdx <= length - (minHeightSize + 1);
        if (!separatorWellPlaced)
        {
            return false;
        }

        for (int i = 0; i < length; i++)
        {
            if (i != separatorIdx && !char.IsDigit(str[i]))
            {
                return false;
            }
        }

        return true;
    }

    if (length < minHeightSize + 1)
    {
        return false;
    }

    // "###p" form
    if (char.ToLower(str[^1]) != 'p')
    {
        return false;
    }

    for (int i = 0; i < length - 1; i++)
    {
        if (!char.IsDigit(str[i]))
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Returns whether the <code>category</code> is one of the categories listed here as searchable.
/// </summary>
/// <param name="category">the category to classify</param>
/// <returns>true for the listed categories; false for every other category</returns>
public static bool IsElementCategorySearchable(ElementCategory category)
{
    switch (category)
    {
        case ElementCategory.ElementAnimeSeasonPrefix:
        case ElementCategory.ElementAnimeType:
        case ElementCategory.ElementAudioTerm:
        case ElementCategory.ElementDeviceCompatibility:
        case ElementCategory.ElementEpisodePrefix:
        case ElementCategory.ElementFileChecksum:
        case ElementCategory.ElementLanguage:
        case ElementCategory.ElementOther:
        case ElementCategory.ElementReleaseGroup:
        case ElementCategory.ElementReleaseInformation:
        case ElementCategory.ElementReleaseVersion:
        case ElementCategory.ElementSource:
        case ElementCategory.ElementSubtitles:
        case ElementCategory.ElementVideoResolution:
        case ElementCategory.ElementVideoTerm:
        case ElementCategory.ElementVolumePrefix:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Returns whether the <code>category</code> is singular.
/// </summary>
/// <remarks>
/// The categories listed below are the non-singular ones and yield false; every other
/// category is singular and yields true. (Previously the fallback arm also returned
/// false, which made the method return false for every input.)
/// </remarks>
/// <param name="category">the category to classify</param>
/// <returns>false for the listed categories; true for every other category</returns>
public static bool IsElementCategorySingular(ElementCategory category)
{
    return category switch
    {
        ElementCategory.ElementAnimeSeason or ElementCategory.ElementAnimeType or ElementCategory.ElementAudioTerm
            or ElementCategory.ElementDeviceCompatibility or ElementCategory.ElementEpisodeNumber
            or ElementCategory.ElementLanguage or ElementCategory.ElementOther
            or ElementCategory.ElementReleaseInformation or ElementCategory.ElementSource
            or ElementCategory.ElementVideoTerm => false,
        _ => true
    };
}
|
||||
|
||||
/// <summary>
/// Returns whether the token at <code>pos</code> is isolated, i.e. both the previous and
/// the next token found with <c>FlagNotDelimiter</c> are brackets.
/// </summary>
/// <param name="pos">index of the token in the parser's token list</param>
/// <returns>true if a bracket token is found on each side</returns>
public bool IsTokenIsolated(int pos)
{
    // The right-hand side is only searched once the left-hand side is known to be a bracket.
    return IsTokenCategory(
               Token.FindPrevToken(parser.Tokens, pos, Token.TokenFlag.FlagNotDelimiter),
               Token.TokenCategory.Bracket)
           && IsTokenCategory(
               Token.FindNextToken(parser.Tokens, pos, Token.TokenFlag.FlagNotDelimiter),
               Token.TokenCategory.Bracket);
}
|
||||
|
||||
/// <summary>
/// Tries to pair a season keyword token with its season number and records the season.
/// </summary>
/// <remarks>
/// First the previous non-delimiter token is tried as an ordinal ("2nd Season"); failing
/// that, the next non-delimiter token is tried as a plain number ("Season 2"). On success
/// an anime season element is added and both tokens involved become identifiers.
/// </remarks>
/// <param name="token">the season keyword token</param>
/// <param name="currentTokenPos">index of <code>token</code> in the parser's token list</param>
/// <returns>true if a season was recorded; false otherwise</returns>
public bool CheckAndSetAnimeSeasonKeyword(Token token, int currentTokenPos)
{
    // e.g. "2nd Season"
    int precedingIdx = Token.FindPrevToken(parser.Tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (Token.InListRange(precedingIdx, parser.Tokens))
    {
        var preceding = parser.Tokens[precedingIdx];
        string ordinalValue = GetNumberFromOrdinal(preceding.Content);
        if (!string.IsNullOrEmpty(ordinalValue))
        {
            RecordSeason(preceding, token, ordinalValue);
            return true;
        }
    }

    // e.g. "Season 2"
    int followingIdx = Token.FindNextToken(parser.Tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (Token.InListRange(followingIdx, parser.Tokens))
    {
        var following = parser.Tokens[followingIdx];
        if (StringHelper.IsNumericString(following.Content))
        {
            RecordSeason(token, following, following.Content);
            return true;
        }
    }

    return false;

    // Adds the season element and marks both participating tokens as identified.
    void RecordSeason(Token leading, Token trailing, string season)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, season));
        leading.Category = Token.TokenCategory.Identifier;
        trailing.Category = Token.TokenCategory.Identifier;
    }
}
|
||||
|
||||
/// <summary>
/// Finds the volume/episode number that follows a prefix keyword (i.e. Vol.4) and records it.
/// </summary>
/// <remarks>
/// The next non-delimiter token must be of Unknown category and start with a digit. Its
/// content is first offered to the pattern matchers; if none matches, it is stored as-is
/// without validation. Categories other than episode number and volume number record
/// nothing but still mark the keyword token as an identifier.
/// </remarks>
/// <param name="category">the category we're searching for</param>
/// <param name="currentTokenPos">the current token position</param>
/// <param name="token">the keyword token</param>
/// <returns>true if a following number token was found; false otherwise</returns>
public bool CheckExtentKeyword(ElementCategory category, int currentTokenPos, Token token)
{
    int numberIdx = Token.FindNextToken(parser.Tokens, currentTokenPos, Token.TokenFlag.FlagNotDelimiter);
    if (!IsTokenCategory(numberIdx, Token.TokenCategory.Unknown))
    {
        return false;
    }

    var numberToken = parser.Tokens[numberIdx];
    if (IndexOfFirstDigit(numberToken.Content) != 0)
    {
        return false;
    }

    if (category == ElementCategory.ElementEpisodeNumber)
    {
        if (!parser.ParseNumber.MatchEpisodePatterns(numberToken.Content, numberToken))
        {
            parser.ParseNumber.SetEpisodeNumber(numberToken.Content, numberToken, false);
        }
    }
    else if (category == ElementCategory.ElementVolumeNumber)
    {
        if (!parser.ParseNumber.MatchVolumePatterns(numberToken.Content, numberToken))
        {
            parser.ParseNumber.SetVolumeNumber(numberToken.Content, numberToken, false);
        }
    }

    token.Category = Token.TokenCategory.Identifier;
    return true;
}
|
||||
|
||||
|
||||
/// <summary>
/// Concatenates the given tokens into one string and, if the result is not empty,
/// adds it to the parser's elements under <code>category</code>.
/// </summary>
/// <remarks>
/// Unknown tokens are appended and re-categorized as identifiers; bracket tokens are
/// appended unchanged; tokens of any other non-delimiter category are ignored. For a
/// delimiter only its first character is considered: it is kept verbatim when
/// <code>keepDelimiters</code> is true, otherwise "," and "&amp;" are kept and everything
/// else becomes a space. When delimiters are not kept, spaces and dashes are trimmed from
/// both ends of the result.
/// </remarks>
/// <param name="category">the category of the element to add</param>
/// <param name="keepDelimiters">true to copy delimiters verbatim</param>
/// <param name="tokens">the tokens to combine</param>
public void BuildElement(ElementCategory category, bool keepDelimiters, List<Token> tokens)
{
    var builder = new StringBuilder();

    for (int index = 0; index < tokens.Count; index++)
    {
        var current = tokens[index];
        var kind = current.Category;

        if (kind == Token.TokenCategory.Unknown)
        {
            builder.Append(current.Content);
            current.Category = Token.TokenCategory.Identifier;
        }
        else if (kind == Token.TokenCategory.Bracket)
        {
            builder.Append(current.Content);
        }
        else if (kind == Token.TokenCategory.Delimiter)
        {
            string firstChar = string.IsNullOrEmpty(current.Content) ? "" : current.Content[0].ToString();

            if (keepDelimiters)
            {
                builder.Append(firstChar);
            }
            else if (Token.InListRange(index, tokens))
            {
                if (firstChar is "," or "&")
                {
                    builder.Append(firstChar);
                }
                else
                {
                    builder.Append(' ');
                }
            }
        }
    }

    string text = builder.ToString();
    if (!keepDelimiters)
    {
        text = text.Trim(DashesWithSpace.ToCharArray());
    }

    if (!string.IsNullOrEmpty(text))
    {
        parser.Elements.Add(new Element(category, text));
    }
}
|
||||
}
|
681
Chiara/AnitomySharp/ParserNumber.cs
Normal file
681
Chiara/AnitomySharp/ParserNumber.cs
Normal file
@@ -0,0 +1,681 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
|
||||
/// A utility class to assist in number parsing.
|
||||
/// </summary>
|
||||
public partial class ParserNumber(Parser parser)
|
||||
{
|
||||
/// <summary>Smallest number accepted as an anime year (inclusive).</summary>
public const int AnimeYearMin = 1900;
|
||||
/// <summary>Largest number accepted as an anime year (inclusive).</summary>
public const int AnimeYearMax = 2100;
|
||||
/// <summary>Upper bound (inclusive) used by <c>IsValidEpisodeNumber</c>.</summary>
private const int EpisodeNumberMax = AnimeYearMax - 1;
|
||||
/// <summary>Upper bound (inclusive) used by <c>IsValidVolumeNumber</c>.</summary>
private const int VolumeNumberMax = 50;
|
||||
/// <summary>Regex prefix: anchors at the start of the input and opens a non-capturing group.</summary>
private const string RegexMatchOnlyStart = @"\A(?:";
|
||||
/// <summary>Regex suffix: closes the group opened by <c>RegexMatchOnlyStart</c> and anchors at the end of the input.</summary>
private const string RegexMatchOnlyEnd = @")\z";
|
||||
|
||||
/// <summary>
/// Returns whether the <code>number</code>, converted with <c>StringHelper.StringToInt</c>,
/// does not exceed <c>VolumeNumberMax</c>.
/// </summary>
private static bool IsValidVolumeNumber(string number)
{
    int volume = StringHelper.StringToInt(number);
    return volume <= VolumeNumberMax;
}
|
||||
|
||||
/// <summary>
/// Returns whether or not the <code>number</code> is a valid episode number.
/// </summary>
/// <remarks>
/// Only the leading run of digits is considered (so "12v2" is judged by "12"). The number
/// is valid when that run is not empty and its value does not exceed
/// <c>EpisodeNumberMax</c>. A digit run that cannot be parsed (for example non-ASCII
/// digits, which <c>char.IsDigit</c> accepts but the number parser does not) is treated
/// as invalid instead of raising an exception.
/// </remarks>
/// <param name="number">the text to check; must not be null</param>
/// <returns>true if the leading digits form a number within the allowed range</returns>
private static bool IsValidEpisodeNumber(string number)
{
    // Measure the leading digit run without building an intermediate string.
    int digitCount = 0;
    while (digitCount < number.Length && char.IsDigit(number[digitCount]))
    {
        digitCount++;
    }

    if (digitCount == 0)
    {
        return false;
    }

    return double.TryParse(
               number.AsSpan(0, digitCount),
               System.Globalization.NumberStyles.None,
               System.Globalization.CultureInfo.InvariantCulture,
               out double value)
           && value <= EpisodeNumberMax;
}
|
||||
|
||||
/// <summary>
/// Records <code>number</code> as an alternative episode number and marks the
/// <code>token</code> it came from as an identifier.
/// </summary>
/// <param name="number">the alternative episode number</param>
/// <param name="token">the token which contains the number</param>
private void SetAlternativeEpisodeNumber(string number, Token token)
{
    var alternative = new Element(ElementCategory.ElementEpisodeNumberAlt, number);
    parser.Elements.Add(alternative);
    token.Category = Token.TokenCategory.Identifier;
}
|
||||
|
||||
/// <summary>
/// Records a volume number and marks its token as an identifier.
/// </summary>
/// <param name="number">the volume number</param>
/// <param name="token">the token which contains the volume number</param>
/// <param name="validate">true to reject numbers that fail <c>IsValidVolumeNumber</c>; false to accept anything</param>
/// <returns>true if the volume number was recorded; false if validation rejected it</returns>
public bool SetVolumeNumber(string number, Token token, bool validate)
{
    bool accepted = !validate || IsValidVolumeNumber(number);

    if (accepted)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementVolumeNumber, number));
        token.Category = Token.TokenCategory.Identifier;
    }

    return accepted;
}
|
||||
|
||||
/// <summary>
/// Records an episode number and marks its token as an identifier.
/// </summary>
/// <remarks>
/// When episode keywords have been found and an episode number element already exists,
/// the new number is compared with the first such element: the larger of the two becomes
/// the alternative episode number, and an equal number is not added a second time.
/// The token is marked as an identifier even in that duplicate case.
/// </remarks>
/// <param name="number">the episode number</param>
/// <param name="token">the token which contains the episode number</param>
/// <param name="validate">true if we should check if it's a valid episode number; false to disable validation</param>
/// <returns>true if an element was added; false if validation failed or the number was a duplicate</returns>
public bool SetEpisodeNumber(string number, Token token, bool validate)
{
    if (validate && !IsValidEpisodeNumber(number))
    {
        return false;
    }

    token.Category = Token.TokenCategory.Identifier;
    var targetCategory = ElementCategory.ElementEpisodeNumber;

    if (parser.IsEpisodeKeywordsFound)
    {
        var existing = parser.Elements
            .FirstOrDefault(candidate => candidate.Category == ElementCategory.ElementEpisodeNumber);

        if (existing != null)
        {
            int difference = StringHelper.StringToInt(number) - StringHelper.StringToInt(existing.Value);

            if (difference == 0)
            {
                // Same number already recorded.
                return false;
            }

            if (difference > 0)
            {
                targetCategory = ElementCategory.ElementEpisodeNumberAlt;
            }
            else
            {
                existing.Category = ElementCategory.ElementEpisodeNumberAlt;
            }
        }
    }

    parser.Elements.Add(new Element(targetCategory, number));
    return true;
}
|
||||
|
||||
/// <summary>
/// Handles a token in which a prefix keyword and a number are fused (prefix immediately
/// followed by digits) by splitting at the first digit and recording the number.
/// </summary>
/// <param name="category">the prefix category (episode prefix or volume prefix) the leading text must belong to</param>
/// <param name="token">the token</param>
/// <returns>true if the text before the first digit is a keyword of <code>category</code> and the category is one of the two handled prefixes; false otherwise</returns>
private bool NumberComesAfterPrefix(ElementCategory category, Token token)
{
    int digitIdx = ParserHelper.IndexOfFirstDigit(token.Content);
    string keyword = StringHelper.SubstringWithCheck(token.Content, 0, digitIdx).ToUpperInvariant();

    if (!KeywordManager.Contains(category, keyword))
    {
        return false;
    }

    string digits = StringHelper.SubstringWithCheck(token.Content, digitIdx, token.Content.Length - digitIdx);

    if (category == ElementCategory.ElementEpisodePrefix)
    {
        // Try the richer patterns first; fall back to storing the text unvalidated.
        if (!MatchEpisodePatterns(digits, token))
        {
            SetEpisodeNumber(digits, token, false);
        }

        return true;
    }

    if (category == ElementCategory.ElementVolumePrefix)
    {
        if (!MatchVolumePatterns(digits, token))
        {
            SetVolumeNumber(digits, token, false);
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Checks whether the number is followed by "&amp;" or "of" and then another number
/// (e.g. "01 &amp; 02", "8 of 10") and, if so, records it as an episode number.
/// </summary>
/// <remarks>
/// With "&amp;" the second number is recorded as an episode number as well; with "of" it is
/// not. In both cases the separator and the second number become identifiers.
/// </remarks>
/// <param name="token">the token holding the first number</param>
/// <param name="currentTokenIdx">the index of the token</param>
/// <returns>true if the separator-plus-number shape was found; false otherwise</returns>
private bool NumberComesBeforeAnotherNumber(Token token, int currentTokenIdx)
{
    int separatorIdx = Token.FindNextToken(parser.Tokens, currentTokenIdx, Token.TokenFlag.FlagNotDelimiter);
    if (!Token.InListRange(separatorIdx, parser.Tokens))
    {
        return false;
    }

    string separatorText = parser.Tokens[separatorIdx].Content;
    bool joinsTwoEpisodes = separatorText == "&";
    if (!joinsTwoEpisodes && separatorText != "of")
    {
        return false;
    }

    int otherIdx = Token.FindNextToken(parser.Tokens, separatorIdx, Token.TokenFlag.FlagNotDelimiter);
    if (!Token.InListRange(otherIdx, parser.Tokens)
        || !StringHelper.IsNumericString(parser.Tokens[otherIdx].Content))
    {
        return false;
    }

    SetEpisodeNumber(token.Content, token, false);
    if (joinsTwoEpisodes)
    {
        SetEpisodeNumber(parser.Tokens[otherIdx].Content, parser.Tokens[otherIdx], false);
    }

    parser.Tokens[separatorIdx].Category = Token.TokenCategory.Identifier;
    parser.Tokens[otherIdx].Category = Token.TokenCategory.Identifier;
    return true;
}
|
||||
|
||||
// EPISODE MATCHERS
|
||||
|
||||
/// <summary>
/// Attempts to find an episode/season inside a <code>word</code>
/// </summary>
/// <remarks>
/// Purely numeric words are rejected up front because every pattern needs at least one
/// non-numeric character. Spaces and hyphens are trimmed from both ends; a word that is
/// null, empty, or empty after trimming matches nothing. Which matchers are tried depends
/// on whether the first and last characters are digits.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the word was matched to an episode/season number</returns>
public bool MatchEpisodePatterns(string word, Token token)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    // All patterns contain at least one non-numeric character
    if (StringHelper.IsNumericString(word))
    {
        return false;
    }

    word = word.Trim(' ', '-');

    // A word consisting only of spaces/hyphens has nothing left to index into.
    if (word.Length == 0)
    {
        return false;
    }

    bool numericFront = char.IsDigit(word[0]);
    bool numericBack = char.IsDigit(word[^1]);

    if (numericFront && numericBack)
    {
        // e.g. "01v2"
        if (MatchSingleEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "01-02", "03-05v2"
        if (MatchMultiEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "07.5"
        if (MatchFractionalEpisodePattern(word, token))
        {
            return true;
        }
    }

    if (numericBack)
    {
        // e.g. "2x01", "S01E03", "S01-02xE001-150"
        if (MatchSeasonAndEpisodePattern(word, token))
        {
            return true;
        }

        // e.g. "#01", "#02-03v2"
        if (MatchNumberSignPattern(word, token))
        {
            return true;
        }
    }

    if (!numericFront)
    {
        // e.g. "ED1", "OP4a", "OVA2"
        return MatchTypeAndEpisodePattern(word, token);
    }

    // e.g. "4a", "111C"
    if (!numericBack && MatchPartialEpisodePattern(word, token))
    {
        return true;
    }

    // U+8A71 is used as counter for stories, episodes of TV series, etc.
    return MatchJapaneseCounterPattern(word, token);
}
|
||||
|
||||
/// <summary>
/// Match a single episode with a release version. e.g. "01v2".
/// </summary>
/// <remarks>
/// On a match, group 1 is recorded as the episode number (unvalidated) and group 2 as
/// the release version.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSingleEpisodePattern(string word, Token token)
{
    var result = SingleEpisodeRegex().Match(word);

    if (result.Success)
    {
        string episode = result.Groups[1].Value;
        string version = result.Groups[2].Value;

        SetEpisodeNumber(episode, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
    }

    return result.Success;
}
|
||||
|
||||
/// <summary>
/// Match an episode range. e.g. "01-02", "03-05v2".
/// </summary>
/// <remarks>
/// Groups 1 and 3 are the lower and upper bound, groups 2 and 4 their optional release
/// versions. The range is rejected unless the lower bound is strictly smaller than the
/// upper bound and passes episode number validation.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchMultiEpisodePattern(string word, Token token)
{
    var result = MultiEpisodeRegex().Match(word);
    if (!result.Success)
    {
        return false;
    }

    var groups = result.Groups;
    string first = groups[1].Value;
    string last = groups[3].Value;

    bool ascending = StringHelper.StringToInt(first) < StringHelper.StringToInt(last);
    if (!ascending || !SetEpisodeNumber(first, token, true))
    {
        return false;
    }

    SetEpisodeNumber(last, token, true);
    AddVersionIfPresent(groups[2].Value);
    AddVersionIfPresent(groups[4].Value);
    return true;

    // Records a release version element unless the captured text is empty.
    void AddVersionIfPresent(string version)
    {
        if (!string.IsNullOrEmpty(version))
        {
            parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
        }
    }
}
|
||||
|
||||
/// <summary>
/// Match season and episode patterns. e.g. "2x01", "S01E03", "S01-02xE001-150".
/// </summary>
/// <remarks>
/// Group 1 (and group 2, when captured) are recorded as anime seasons; group 3 (and
/// group 4, when captured) are recorded as episode numbers without validation.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSeasonAndEpisodePattern(string word, Token token)
{
    var result = SeasonEpisodeRegex().Match(word);
    if (!result.Success)
    {
        return false;
    }

    string firstSeason = result.Groups[1].Value;
    string secondSeason = result.Groups[2].Value;
    string firstEpisode = result.Groups[3].Value;
    string secondEpisode = result.Groups[4].Value;

    parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, firstSeason));
    if (secondSeason.Length > 0)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementAnimeSeason, secondSeason));
    }

    SetEpisodeNumber(firstEpisode, token, false);
    if (secondEpisode.Length > 0)
    {
        SetEpisodeNumber(secondEpisode, token, false);
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Match type and episode. e.g. "ED1", "OP4a", "OVA2".
/// </summary>
/// <remarks>
/// The word is split at its first digit. If the normalized leading text is a known
/// keyword, it is recorded as the anime type and the remainder is recorded as the episode
/// number. On success, when the token is found in the parser's token list, it is split in
/// two: its content becomes the number and a new token holding the prefix is inserted in
/// front of it. A word without any digit cannot match and is rejected immediately.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchTypeAndEpisodePattern(string word, Token token)
{
    int numberBegin = ParserHelper.IndexOfFirstDigit(word);

    // Without a digit there is no number part; slicing at -1 below would throw.
    if (numberBegin < 0)
    {
        return false;
    }

    string prefix = StringHelper.SubstringWithCheck(word, 0, numberBegin);

    var category = ElementCategory.ElementAnimeType;
    var options = new KeywordOptions();

    if (!KeywordManager.FindAndSet(KeywordManager.Normalize(prefix), ref category, ref options))
    {
        return false;
    }

    parser.Elements.Add(new Element(ElementCategory.ElementAnimeType, prefix));

    string number = word.Substring(numberBegin);
    if (!MatchEpisodePatterns(number, token) && !SetEpisodeNumber(number, token, true))
    {
        return false;
    }

    int foundIdx = parser.Tokens.IndexOf(token);
    if (foundIdx == -1)
    {
        return true;
    }

    // Split the token last, once the number has been accepted.
    token.Content = number;
    parser.Tokens.Insert(
        foundIdx,
        new Token(
            options.Identifiable ? Token.TokenCategory.Identifier : Token.TokenCategory.Unknown,
            prefix,
            token.Enclosed));

    return true;
}
|
||||
|
||||
/// <summary>
/// Match half episodes: one or more digits followed by ".5". e.g. "07.5"
/// </summary>
/// <param name="word">the word; null is treated as empty</param>
/// <param name="token">the token</param>
/// <returns>true if the word matched and passed episode number validation</returns>
private bool MatchFractionalEpisodePattern(string word, Token token)
{
    const string fractionalPattern = RegexMatchOnlyStart + @"\d+\.5" + RegexMatchOnlyEnd;

    string candidate = string.IsNullOrEmpty(word) ? "" : word;

    if (!Regex.IsMatch(candidate, fractionalPattern))
    {
        return false;
    }

    return SetEpisodeNumber(candidate, token, true);
}
|
||||
|
||||
/// <summary>
/// Match partial episodes: digits followed by exactly one letter in A-C or a-c. e.g. "4a", "111C".
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the word has that shape and passed episode number validation</returns>
private bool MatchPartialEpisodePattern(string word, Token token)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    // Skip the leading digits; the suffix starts at the first non-digit.
    int suffixStart = 0;
    while (suffixStart < word.Length && char.IsDigit(word[suffixStart]))
    {
        suffixStart++;
    }

    // Exactly one character must remain after the digits.
    if (suffixStart != word.Length - 1)
    {
        return false;
    }

    char suffix = word[suffixStart];
    bool validSuffix = suffix is (>= 'A' and <= 'C') or (>= 'a' and <= 'c');

    return validSuffix && SetEpisodeNumber(word, token, true);
}
|
||||
|
||||
/// <summary>
/// Match episodes with number signs. e.g. "#01", "#02-03v2"
/// </summary>
/// <remarks>
/// A word that does not start with '#' is replaced by the empty string before matching.
/// Group 1 is recorded with validation; group 2 (a second number) is recorded without
/// validation when captured; group 3 is recorded as the release version when captured.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchNumberSignPattern(string word, Token token)
{
    bool startsWithSign = !string.IsNullOrEmpty(word) && word[0] == '#';
    string candidate = startsWithSign ? word : "";

    var result = NumberEpisodeRegex().Match(candidate);
    if (!result.Success)
    {
        return false;
    }

    if (!SetEpisodeNumber(result.Groups[1].Value, token, true))
    {
        return false;
    }

    string secondNumber = result.Groups[2].Value;
    if (secondNumber.Length > 0)
    {
        SetEpisodeNumber(secondNumber, token, false);
    }

    string version = result.Groups[3].Value;
    if (version.Length > 0)
    {
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Match Japanese episode counters: the word must end with U+8A71, which is used as a
/// counter for stories, episodes of TV series, etc.
/// </summary>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched; group 1 is then recorded as the episode number without validation</returns>
private bool MatchJapaneseCounterPattern(string word, Token token)
{
    const char episodeCounter = '\u8A71';

    bool endsWithCounter = !string.IsNullOrEmpty(word) && word[^1] == episodeCounter;
    if (!endsWithCounter)
    {
        return false;
    }

    var result = JapaneseEpisodeRegex().Match(word);
    if (result.Success)
    {
        SetEpisodeNumber(result.Groups[1].Value, token, false);
    }

    return result.Success;
}
|
||||
|
||||
// VOLUME MATCHES
|
||||
|
||||
/// <summary>
/// Attempts to find a volume number inside a <code>word</code>
/// </summary>
/// <remarks>
/// Purely numeric words are rejected because every pattern needs at least one
/// non-numeric character. Spaces and hyphens are trimmed from both ends; a word that is
/// null, empty, or empty after trimming matches nothing. Only words that both start and
/// end with a digit are offered to the volume matchers.
/// </remarks>
/// <param name="word">the word</param>
/// <param name="token">the token</param>
/// <returns>true if the word was matched to a volume number</returns>
public bool MatchVolumePatterns(string word, Token token)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    // All patterns contain at least one non-numeric character
    if (StringHelper.IsNumericString(word))
    {
        return false;
    }

    word = word.Trim(' ', '-');

    // A word consisting only of spaces/hyphens has nothing left to index into.
    if (word.Length == 0)
    {
        return false;
    }

    bool numericFront = char.IsDigit(word[0]);
    bool numericBack = char.IsDigit(word[^1]);

    if (!numericFront || !numericBack)
    {
        return false;
    }

    // e.g. "01v2", then e.g. "01-02", "03-05v2"
    return MatchSingleVolumePattern(word, token) || MatchMultiVolumePattern(word, token);
}
|
||||
|
||||
/// <summary>
/// Match a single volume with a release version. e.g. "01v2"
/// </summary>
/// <remarks>
/// On a match, group 1 is recorded as the volume number (unvalidated) and group 2 as
/// the release version.
/// </remarks>
/// <param name="word">the word; null is treated as empty</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchSingleVolumePattern(string word, Token token)
{
    string candidate = string.IsNullOrEmpty(word) ? "" : word;

    var result = SingleVolumeRegex().Match(candidate);
    if (result.Success)
    {
        string volume = result.Groups[1].Value;
        string version = result.Groups[2].Value;

        SetVolumeNumber(volume, token, false);
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, version));
    }

    return result.Success;
}
|
||||
|
||||
/// <summary>
/// Match multi-volume. e.g. "01-02", "03-05v2".
/// </summary>
/// <remarks>
/// Groups 1 and 2 are the lower and upper bound; group 3 is the optional release version.
/// The range is rejected unless the lower bound is strictly smaller than the upper bound
/// and passes volume number validation. The release version is recorded only when it was
/// actually captured (the check used to be inverted, which stored an empty version and
/// dropped real ones).
/// </remarks>
/// <param name="word">the word; null is treated as empty</param>
/// <param name="token">the token</param>
/// <returns>true if the token matched</returns>
private bool MatchMultiVolumePattern(string word, Token token)
{
    if (string.IsNullOrEmpty(word))
    {
        word = "";
    }

    var match = MultiVolumeRegex().Match(word);
    if (!match.Success)
    {
        return false;
    }

    string lowerBound = match.Groups[1].Value;
    string upperBound = match.Groups[2].Value;

    if (StringHelper.StringToInt(lowerBound) >= StringHelper.StringToInt(upperBound))
    {
        return false;
    }

    if (!SetVolumeNumber(lowerBound, token, true))
    {
        return false;
    }

    SetVolumeNumber(upperBound, token, false);

    string releaseVersion = match.Groups[3].Value;
    if (!string.IsNullOrEmpty(releaseVersion))
    {
        parser.Elements.Add(new Element(ElementCategory.ElementReleaseVersion, releaseVersion));
    }

    return true;
}
|
||||
|
||||
// SEARCH
|
||||
|
||||
/// <summary>
/// Walks the given token indices and tries to set the episode number from the first token that is
/// both enclosed and isolated.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true as soon as one candidate is accepted as the episode number; false if none is</returns>
public bool SearchForIsolatedNumbers(IEnumerable<int> tokens)
{
    foreach (int index in tokens)
    {
        // Only enclosed tokens that the parse helper reports as isolated are candidates.
        if (!parser.Tokens[index].Enclosed) continue;
        if (!parser.ParseHelper.IsTokenIsolated(index)) continue;

        if (SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Looks for a number that is preceded by a dash separator among the given token indices.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if a separated number was found and set as the episode number; false otherwise</returns>
public bool SearchForSeparatedNumbers(List<int> tokens)
{
    foreach (int index in tokens)
    {
        int precedingIndex = Token.FindPrevToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);

        // The closest non-delimiter token before the number must be an unknown token starting with a dash.
        bool precededByDash =
            parser.ParseHelper.IsTokenCategory(precedingIndex, Token.TokenCategory.Unknown)
            && ParserHelper.IsDashCharacter(parser.Tokens[precedingIndex].Content[0]);

        if (precededByDash && SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true))
        {
            // The dash has served its purpose as a separator; mark it as identified.
            parser.Tokens[precedingIndex].Category = Token.TokenCategory.Identifier;
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Runs the episode pattern checks over the given token indices, stopping at the first hit.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if one of the checks reported an episode number; false otherwise</returns>
public bool SearchForEpisodePatterns(List<int> tokens)
{
    foreach (int index in tokens)
    {
        string content = parser.Tokens[index].Content;
        bool startsWithDigit = content.Length > 0 && char.IsDigit(content[0]);

        if (startsWithDigit)
        {
            // e.g. "8 & 10", "01 of 24"
            if (NumberComesBeforeAnotherNumber(parser.Tokens[index], index)) return true;
        }
        else
        {
            // e.g. "EP.1", "Vol.1"
            if (NumberComesAfterPrefix(ElementCategory.ElementEpisodePrefix, parser.Tokens[index])) return true;

            // A volume prefix match consumes the token, but the search for an episode goes on.
            if (NumberComesAfterPrefix(ElementCategory.ElementVolumePrefix, parser.Tokens[index])) continue;
        }

        // Fall back to the remaining episode patterns.
        if (MatchEpisodePatterns(parser.Tokens[index].Content, parser.Tokens[index])) return true;
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Looks for a number that is immediately followed by a bracketed, isolated number, e.g. 08(114).
/// The smaller of the two becomes the episode number, the larger the alternative episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if such a pair was found; false otherwise</returns>
public bool SearchForEquivalentNumbers(List<int> tokens)
{
    foreach (int index in tokens)
    {
        // The leading number must NOT be isolated and must be a valid episode number.
        bool isCandidate = !parser.ParseHelper.IsTokenIsolated(index)
                           && IsValidEpisodeNumber(parser.Tokens[index].Content);
        if (!isCandidate) continue;

        // The next non-delimiter token has to be a bracket...
        int bracketIndex = Token.FindNextToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);
        if (!parser.ParseHelper.IsTokenCategory(bracketIndex, Token.TokenCategory.Bracket)) continue;

        // ...followed by an enclosed, non-delimiter token of unknown category.
        int enclosedIndex = Token.FindNextToken(
            parser.Tokens,
            bracketIndex,
            Token.TokenFlag.FlagEnclosed,
            Token.TokenFlag.FlagNotDelimiter);
        if (!parser.ParseHelper.IsTokenCategory(enclosedIndex, Token.TokenCategory.Unknown)) continue;

        // The enclosed token must be an isolated, purely numeric, valid episode number.
        bool isIsolatedNumber = parser.ParseHelper.IsTokenIsolated(enclosedIndex)
                                && StringHelper.IsNumericString(parser.Tokens[enclosedIndex].Content)
                                && IsValidEpisodeNumber(parser.Tokens[enclosedIndex].Content);
        if (!isIsolatedNumber) continue;

        // Order the pair by numeric value: smaller first.
        var pair = new List<Token>
        {
            parser.Tokens[index], parser.Tokens[enclosedIndex]
        };
        pair.Sort((left, right) => StringHelper.StringToInt(left.Content) - StringHelper.StringToInt(right.Content));

        SetEpisodeNumber(pair[0].Content, pair[0], false);
        SetAlternativeEpisodeNumber(pair[1].Content, pair[1]);
        return true;
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Scans the given token indices from last to first and tries to use the last suitable number as the
/// episode number.
/// </summary>
/// <param name="tokens">indices into the parser's token list</param>
/// <returns>true if a token was accepted as the episode number; false otherwise</returns>
public bool SearchForLastNumber(List<int> tokens)
{
    for (int position = tokens.Count - 1; position >= 0; position--)
    {
        int index = tokens[position];

        // The episode number is assumed to come after the title, so the very first token is
        // never a candidate; enclosed tokens are skipped as well.
        if (index == 0 || parser.Tokens[index].Enclosed) continue;

        // Skip the token if everything before it is enclosed or a delimiter, i.e. it is the
        // first non-enclosed, non-delimiter token.
        bool nothingSubstantialBefore = parser.Tokens
            .Take(index)
            .All(earlier => earlier.Enclosed || earlier.Category == Token.TokenCategory.Delimiter);
        if (nothingSubstantialBefore) continue;

        // A number directly after "Movie" or "Part" is not treated as an episode number.
        int precedingIndex = Token.FindPrevToken(parser.Tokens, index, Token.TokenFlag.FlagNotDelimiter);
        if (parser.ParseHelper.IsTokenCategory(precedingIndex, Token.TokenCategory.Unknown))
        {
            string precedingContent = parser.Tokens[precedingIndex].Content;
            bool followsMovieOrPart =
                precedingContent.Equals("Movie", StringComparison.InvariantCultureIgnoreCase)
                || precedingContent.Equals("Part", StringComparison.InvariantCultureIgnoreCase);
            if (followsMovieOrPart) continue;
        }

        // Nothing ruled this number out; try to use it.
        if (SetEpisodeNumber(parser.Tokens[index].Content, parser.Tokens[index], true)) return true;
    }

    return false;
}
|
||||
|
||||
// Volume patterns

/// <summary>Whole-word match for a one- or two-digit volume with a release version, e.g. "01v2".</summary>
[GeneratedRegex(@"\A(?:(\d{1,2})[vV](\d))\z")]
private static partial Regex SingleVolumeRegex();

/// <summary>Whole-word match for a volume range with an optional release version, e.g. "01-02", "03-05v2".</summary>
[GeneratedRegex(@"\A(?:(\d{1,2})[-~&+](\d{1,2})(?:[vV](\d))?)\z")]
private static partial Regex MultiVolumeRegex();

// Episode patterns

/// <summary>Whole-word match for an episode of up to three digits with a release version, e.g. "01v2".</summary>
[GeneratedRegex(@"\A(?:(\d{1,3})[vV](\d))\z")]
private static partial Regex SingleEpisodeRegex();

/// <summary>
/// Whole-word match for an episode range where either bound may carry its own release version,
/// e.g. "01-02", "03-05v2".
/// </summary>
[GeneratedRegex(@"\A(?:(\d{1,3})(?:[vV](\d))?[-~&+](\d{1,3})(?:[vV](\d))?)\z")]
private static partial Regex MultiEpisodeRegex();

/// <summary>
/// Whole-word match for season/episode notation with optional season and episode ranges,
/// e.g. "S01E03", "01x02".
/// </summary>
// NOTE(review): inside "[ ._-x]" the sequence "_-x" is a character range ('_' through 'x'), not the
// three literals '_', '-' and 'x' — confirm whether that is intended before touching the pattern.
[GeneratedRegex(@"\A(?:S?(\d{1,2})(?:-S?(\d{1,2}))?(?:x|[ ._-x]?E)(\d{1,3})(?:-E?(\d{1,3}))?)\z")]
private static partial Regex SeasonEpisodeRegex();

/// <summary>
/// Whole-word match for a "#"-prefixed episode, optionally a range and/or a release version,
/// e.g. "#01", "#02-03v2".
/// </summary>
[GeneratedRegex(@"\A(?:#(\d{1,3})(?:[-~&+](\d{1,3}))?(?:[vV](\d))?)\z")]
private static partial Regex NumberEpisodeRegex();

/// <summary>Whole-word match for an episode number followed by the Japanese counter, e.g. "01話".</summary>
[GeneratedRegex(@"\A(?:(\d{1,3})話)\z")]
private static partial Regex JapaneseEpisodeRegex();
|
||||
}
|
91
Chiara/AnitomySharp/StringHelper.cs
Normal file
91
Chiara/AnitomySharp/StringHelper.cs
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// A string helper class that is analogous to <code>string.cpp</code> of the original Anitomy, and <code>StringHelper.java</code> of AnitomyJ.
/// </summary>
public static class StringHelper
{
    /// <summary>
    /// Returns whether or not the character is an ASCII letter or digit.
    /// </summary>
    /// <param name="c">the character to test</param>
    /// <returns>true for '0'-'9', 'A'-'Z' and 'a'-'z'; false for everything else</returns>
    public static bool IsAlphanumericChar(char c)
    {
        return c is >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z';
    }

    /// <summary>
    /// Returns whether or not the character is a hex character.
    /// </summary>
    /// <param name="c">the character to test</param>
    /// <returns>true for '0'-'9', 'A'-'F' and 'a'-'f'; false for everything else</returns>
    private static bool IsHexadecimalChar(char c)
    {
        return c is >= '0' and <= '9' or >= 'A' and <= 'F' or >= 'a' and <= 'f';
    }

    /// <summary>
    /// Returns whether or not the character is a latin character.
    /// </summary>
    /// <param name="c">the character to test</param>
    /// <returns>true if the character is at or below U+024F; false otherwise</returns>
    private static bool IsLatinChar(char c)
    {
        // We're just checking until the end of the Latin Extended-B block,
        // rather than all the blocks that belong to the Latin script.
        return c <= '\u024F';
    }

    /// <summary>
    /// Returns whether or not the <code>str</code> is a hex string.
    /// </summary>
    /// <param name="str">the string to test</param>
    /// <returns>true if <paramref name="str"/> is non-empty and consists only of hex characters</returns>
    public static bool IsHexadecimalString(string str)
    {
        return !string.IsNullOrEmpty(str) && str.All(IsHexadecimalChar);
    }

    /// <summary>
    /// Returns whether or not the <code>str</code> is mostly a latin string.
    /// </summary>
    /// <param name="str">the string to test</param>
    /// <returns>
    /// true if at least half of the characters of <paramref name="str"/> are latin characters;
    /// false otherwise, including for a null or empty string
    /// </returns>
    public static bool IsMostlyLatinString(string str)
    {
        // Nothing to measure; this also keeps the ratio below from dividing by zero.
        if (string.IsNullOrEmpty(str)) return false;

        // The ratio must be taken over the full length of the string. The previous code had the
        // ternary inverted and divided by 1.0 for every non-empty string, so a single latin
        // character was enough to return true.
        return str.Count(IsLatinChar) / (double)str.Length >= 0.5;
    }

    /// <summary>
    /// Returns whether or not the <code>str</code> is a numeric string.
    /// </summary>
    /// <param name="str">the string to test; must not be null</param>
    /// <returns>
    /// true if every character satisfies <see cref="char.IsDigit(char)"/>; note that this is also
    /// true for an empty string
    /// </returns>
    public static bool IsNumericString(string str)
    {
        return str.All(char.IsDigit);
    }

    /// <summary>
    /// Returns the int value of the <code>str</code>; 0 otherwise.
    /// </summary>
    /// <param name="str">the text to convert; may be null</param>
    /// <returns>the parsed value, or 0 if <paramref name="str"/> is null, malformed or out of range</returns>
    public static int StringToInt(string str)
    {
        // TryParse reports failure through its return value, so unparsable input no longer costs
        // an exception or writes a stack trace to the console of the hosting application.
        return int.TryParse(str, out int value) ? value : 0;
    }

    /// <summary>
    /// Returns a substring of <paramref name="str"/>, shortening <paramref name="count"/> when the
    /// requested range would run past the end of the string.
    /// </summary>
    /// <param name="str">the source string</param>
    /// <param name="start">the zero-based start index</param>
    /// <param name="count">the requested number of characters</param>
    /// <returns>the characters from <paramref name="start"/>, at most <paramref name="count"/> of them</returns>
    public static string SubstringWithCheck(string str, int start, int count)
    {
        if (start + count > str.Length) count = str.Length - start;
        return str.Substring(start, count);
    }
}
|
226
Chiara/AnitomySharp/Token.cs
Normal file
226
Chiara/AnitomySharp/Token.cs
Normal file
@@ -0,0 +1,226 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// An anime filename is tokenized into individual <see cref="Token"/>s. This class represents an individual token.
/// </summary>
public class Token
{
    /// <summary>
    /// The category of the token.
    /// </summary>
    public enum TokenCategory
    {
        Unknown,
        Bracket,
        Delimiter,
        Identifier,
        Invalid
    }

    /// <summary>
    /// TokenFlag, used for searching specific token categories. This allows granular searching of TokenCategories.
    /// </summary>
    public enum TokenFlag
    {
        // None
        FlagNone,

        // Categories
        FlagBracket,
        FlagNotBracket,
        FlagDelimiter,
        FlagNotDelimiter,
        FlagIdentifier,
        FlagNotIdentifier,
        FlagUnknown,
        FlagNotUnknown,
        FlagValid,
        FlagNotValid,

        // Enclosed (Meaning that it is enclosed in some bracket (e.g. [ ] ))
        FlagEnclosed,
        FlagNotEnclosed
    }

    /// <summary>
    /// Set of token category flags
    /// </summary>
    private static readonly List<TokenFlag> s_flagMaskCategories =
    [
        TokenFlag.FlagBracket, TokenFlag.FlagNotBracket,
        TokenFlag.FlagDelimiter, TokenFlag.FlagNotDelimiter,
        TokenFlag.FlagIdentifier, TokenFlag.FlagNotIdentifier,
        TokenFlag.FlagUnknown, TokenFlag.FlagNotUnknown,
        TokenFlag.FlagValid, TokenFlag.FlagNotValid
    ];

    /// <summary>
    /// Set of token enclosed flags
    /// </summary>
    private static readonly List<TokenFlag> s_flagMaskEnclosed = [TokenFlag.FlagEnclosed, TokenFlag.FlagNotEnclosed];

    /// <summary>The category currently assigned to this token.</summary>
    public TokenCategory Category { get; set; }

    /// <summary>The text of this token.</summary>
    public string Content { get; set; }

    /// <summary>Whether or not the token is enclosed in brackets.</summary>
    public bool Enclosed { get; }

    /// <summary>
    /// Constructs a new token
    /// </summary>
    /// <param name="category">the token category</param>
    /// <param name="content">the token content</param>
    /// <param name="enclosed">whether or not the token is enclosed in braces</param>
    public Token(TokenCategory category, string content, bool enclosed)
    {
        Category = category;
        Content = content;
        Enclosed = enclosed;
    }

    /// <summary>
    /// Validates a token against the <code>flags</code>. The <code>flags</code> is used as a search parameter.
    /// </summary>
    /// <param name="token">the token</param>
    /// <param name="flags">the flags the token must conform against</param>
    /// <returns>true if the token conforms to the set of <code>flags</code>; false otherwise</returns>
    private static bool CheckTokenFlags(Token token, ICollection<TokenFlag> flags)
    {
        // Make sure token is the correct closure
        if (flags.Any(f => s_flagMaskEnclosed.Contains(f)))
        {
            bool success = CheckFlag(TokenFlag.FlagEnclosed) == token.Enclosed;
            if (!success) return false; // Not enclosed correctly (e.g. enclosed when we're looking for non-enclosed).
        }

        // Without any category flag the enclosure check above is all there is to verify.
        if (!flags.Any(f => s_flagMaskCategories.Contains(f))) return true;

        // Make sure token is the correct category: one satisfied category flag pair is enough.
        bool secondarySuccess = false;

        CheckCategory(TokenFlag.FlagBracket, TokenFlag.FlagNotBracket, TokenCategory.Bracket);
        CheckCategory(TokenFlag.FlagDelimiter, TokenFlag.FlagNotDelimiter, TokenCategory.Delimiter);
        CheckCategory(TokenFlag.FlagIdentifier, TokenFlag.FlagNotIdentifier, TokenCategory.Identifier);
        CheckCategory(TokenFlag.FlagUnknown, TokenFlag.FlagNotUnknown, TokenCategory.Unknown);
        // "Valid" is expressed as "not Invalid", hence the swapped flag order for this pair.
        CheckCategory(TokenFlag.FlagNotValid, TokenFlag.FlagValid, TokenCategory.Invalid);
        return secondarySuccess;

        // fe: flag requiring the token to BE category c; fn: flag requiring it NOT to be c.
        void CheckCategory(TokenFlag fe, TokenFlag fn, TokenCategory c)
        {
            if (secondarySuccess) return;
            secondarySuccess = CheckFlag(fe) ? token.Category == c : CheckFlag(fn) && token.Category != c;
        }

        // Simple alias to check if flag is a part of the set
        bool CheckFlag(TokenFlag flag)
        {
            return flags.Contains(flag);
        }
    }

    /// <summary>
    /// Given a list of <code>tokens</code>, searches the range [<code>begin</code>, <code>end</code>) for the
    /// first token that matches the list of <code>flags</code>.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <param name="begin">the search starting position (inclusive).</param>
    /// <param name="end">the search ending position (exclusive).</param>
    /// <param name="flags">the search flags</param>
    /// <returns>the index of the first matching token, or <code>end</code> if none matched</returns>
    public static int FindToken(List<Token> tokens, int begin, int end, params TokenFlag[] flags)
    {
        // Stop at `end`: previously the search ran to the end of the list, so a match beyond the
        // requested range was reported as if it were inside it.
        return FindTokenBase(tokens, begin, end, i => i < end && i < tokens.Count, i => i + 1, flags);
    }

    /// <summary>
    /// Given a list of <code>tokens</code>, searches for the next token in <code>tokens</code> that matches the list of <code>flags</code>.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <param name="first">the search starting position; the search begins right after it.</param>
    /// <param name="flags">the search flags</param>
    /// <returns>the index of the next matching token, or <code>tokens.Count</code> if none matched</returns>
    public static int FindNextToken(List<Token> tokens, int first, params TokenFlag[] flags)
    {
        return FindTokenBase(tokens, first + 1, tokens.Count, i => i < tokens.Count, i => i + 1, flags);
    }

    /// <summary>
    /// Given a list of <code>tokens</code>, searches for the previous token in <code>tokens</code> that matches the list of <code>flags</code>.
    /// </summary>
    /// <param name="tokens">the list of tokens</param>
    /// <param name="begin">the search starting position; the search begins right before it.</param>
    /// <param name="flags">the search flags</param>
    /// <returns>the index of the previous matching token, or -1 if none matched</returns>
    public static int FindPrevToken(List<Token> tokens, int begin, params TokenFlag[] flags)
    {
        return FindTokenBase(tokens, begin - 1, -1, i => i >= 0, i => i - 1, flags);
    }

    /// <summary>
    /// Given a list of tokens finds the first token that passes <see cref="CheckTokenFlags"/>.
    /// </summary>
    /// <param name="tokens">the list of the tokens to search</param>
    /// <param name="begin">the start index of the search.</param>
    /// <param name="end">the value returned when no token matches.</param>
    /// <param name="shouldContinue">a function that returns whether or not we should continue searching</param>
    /// <param name="next">a function that returns the next search index</param>
    /// <param name="flags">the flags that each token should be validated against</param>
    /// <returns>the index of the found token, or <code>end</code> if none was found</returns>
    private static int FindTokenBase(
        List<Token> tokens,
        int begin,
        int end,
        Func<int, bool> shouldContinue,
        Func<int, int> next,
        params TokenFlag[] flags)
    {
        // The flags array is only read, so it is passed on as-is instead of being copied per call.
        for (int i = begin; shouldContinue(i); i = next(i))
        {
            if (CheckTokenFlags(tokens[i], flags))
            {
                return i;
            }
        }

        return end;
    }

    /// <summary>
    /// Returns whether <paramref name="pos"/> is a valid index into <paramref name="list"/>.
    /// </summary>
    /// <param name="pos">the index to test</param>
    /// <param name="list">the list the index refers to</param>
    /// <returns>true if 0 &lt;= <paramref name="pos"/> &lt; <code>list.Count</code>; false otherwise</returns>
    public static bool InListRange(int pos, List<Token> list)
    {
        return -1 < pos && pos < list.Count;
    }

    /// <summary>
    /// Two tokens are equal when their enclosed state, category and content are equal.
    /// </summary>
    public override bool Equals(object? o)
    {
        if (ReferenceEquals(this, o))
            return true;
        if (o is not Token token)
            return false;
        return Enclosed == token.Enclosed && Category == token.Category && Equals(Content, token.Content);
    }

    /// <summary>
    /// Hash code built from the same three members that <see cref="Equals(object?)"/> compares.
    /// </summary>
    public override int GetHashCode()
    {
        int hashCode = -1776802967;
        hashCode = hashCode * -1521134295 + Category.GetHashCode();
        hashCode = hashCode * -1521134295 + EqualityComparer<string>.Default.GetHashCode(Content);
        hashCode = hashCode * -1521134295 + Enclosed.GetHashCode();
        return hashCode;
    }

    /// <summary>
    /// Returns a debug representation listing category, content and enclosed state.
    /// </summary>
    public override string ToString()
    {
        return $"Token{{category={Category}, content='{Content}', enclosed={Enclosed}}}";
    }
}
|
17
Chiara/AnitomySharp/TokenRange.cs
Normal file
17
Chiara/AnitomySharp/TokenRange.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// A mutable (offset, size) pair describing a span of characters.
/// </summary>
public struct TokenRange
{
    /// <summary>The start position of the range.</summary>
    public int Offset;

    /// <summary>The number of characters in the range.</summary>
    public int Size;

    /// <summary>
    /// Creates a range with the given start position and length.
    /// </summary>
    /// <param name="offset">the start position</param>
    /// <param name="size">the number of characters</param>
    public TokenRange(int offset, int size)
    {
        Offset = offset;
        Size = size;
    }
}
|
322
Chiara/AnitomySharp/Tokenizer.cs
Normal file
322
Chiara/AnitomySharp/Tokenizer.cs
Normal file
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2017, Eren Okka
|
||||
* Copyright (c) 2016-2017, Paul Miller
|
||||
* Copyright (c) 2017-2018, Tyler Bratton
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
|
||||
using System.Text;
|
||||
|
||||
namespace AnitomySharp;
|
||||
|
||||
/// <summary>
/// A class that will tokenize an anime filename.
/// </summary>
public class Tokenizer
{
    private readonly string _filename;
    private readonly List<Element> _elements;
    private readonly Options _options;
    private readonly List<Token> _tokens;

    /// <summary>
    /// Pairs of (opening, closing) bracket characters recognised by <see cref="TokenizeByBrackets"/>.
    /// </summary>
    private static readonly List<Tuple<string, string>> s_brackets =
    [
        new Tuple<string, string>("(", ")"), // U+0028-U+0029
        new Tuple<string, string>("[", "]"), // U+005B-U+005D Square bracket
        new Tuple<string, string>("{", "}"), // U+007B-U+007D Curly bracket
        new Tuple<string, string>("\u300C", "\u300D"), // Corner bracket
        new Tuple<string, string>("\u300E", "\u300F"), // White corner bracket (closing is U+300F, not U+300E)
        new Tuple<string, string>("\u3010", "\u3011"), // Black lenticular bracket
        new Tuple<string, string>("\uFF08", "\uFF09") // Fullwidth parenthesis
    ];

    /// <summary>
    /// Tokenize a filename into <see cref="Element"/>s
    /// </summary>
    /// <param name="filename">the filename</param>
    /// <param name="elements">the list of elements where pre-identified tokens will be added</param>
    /// <param name="options">the parser options</param>
    /// <param name="tokens">the list of tokens where tokens will be added</param>
    public Tokenizer(string filename, List<Element> elements, Options options, List<Token> tokens)
    {
        _filename = filename;
        _elements = elements;
        _options = options;
        _tokens = tokens;
    }

    /// <summary>
    /// Tokenizes the filename into the token list passed to the constructor.
    /// </summary>
    /// <returns>true if at least one token was produced; false otherwise</returns>
    public bool Tokenize()
    {
        TokenizeByBrackets();
        return _tokens.Count > 0;
    }

    /// <summary>
    /// Adds a token to the internal list of tokens
    /// </summary>
    /// <param name="category">the token category</param>
    /// <param name="enclosed">whether or not the token is enclosed in braces</param>
    /// <param name="range">the token range; a size running past the end of the filename is clamped</param>
    private void AddToken(Token.TokenCategory category, bool enclosed, TokenRange range)
    {
        _tokens.Add(new Token(category, StringHelper.SubstringWithCheck(_filename, range.Offset, range.Size), enclosed));
    }

    /// <summary>
    /// Collects the distinct allowed delimiter characters that occur inside <paramref name="range"/>,
    /// in order of first appearance.
    /// </summary>
    /// <param name="range">the part of the filename to inspect</param>
    /// <returns>a string containing each delimiter found exactly once</returns>
    private string GetDelimiters(TokenRange range)
    {
        var delimiters = new StringBuilder();

        // The range may overshoot the filename, so clamp the end.
        int end = Math.Min(_filename.Length, range.Offset + range.Size);
        for (int i = range.Offset; i < end; i++)
        {
            char c = _filename[i];
            if (StringHelper.IsAlphanumericChar(c)) continue;
            if (!_options.AllowedDelimiters.Contains(c.ToString())) continue;

            // Each delimiter is reported only once.
            if (delimiters.ToString().Contains(c.ToString())) continue;
            delimiters.Append(c);
        }

        return delimiters.ToString();
    }

    /// <summary>
    /// Tokenize by bracket.
    /// </summary>
    private void TokenizeByBrackets()
    {
        // Closing bracket that belongs to the most recently found opening bracket.
        string matchingBracket = string.Empty;

        bool isBracketOpen = false;
        for (int i = 0; i < _filename.Length; )
        {
            // Outside brackets look for any opening bracket; inside, look for the matching closing one.
            int foundIdx = !isBracketOpen
                ? FindFirstBracket(i, _filename.Length)
                : _filename.IndexOf(matchingBracket, i, StringComparison.Ordinal);

            // When nothing is found the size is the full filename length, which can overshoot the
            // end; AddToken, GetDelimiters and TokenizeByDelimiters clamp to the filename length.
            var range = new TokenRange(i, foundIdx == -1 ? _filename.Length : foundIdx - i);
            if (range.Size > 0)
            {
                // Check if our range contains any known anime identifiers
                TokenizeByPreIdentified(isBracketOpen, range);
            }

            if (foundIdx != -1)
            {
                // mark as bracket
                AddToken(Token.TokenCategory.Bracket, true, new TokenRange(range.Offset + range.Size, 1));
                isBracketOpen = !isBracketOpen;
                i = foundIdx + 1;
            }
            else
            {
                break;
            }
        }

        return;

        // Returns the index of the first opening bracket in [start, end), or -1, and remembers
        // its closing counterpart in matchingBracket.
        int FindFirstBracket(int start, int end)
        {
            for (int i = start; i < end; i++)
            {
                foreach (var bracket in s_brackets)
                {
                    if (_filename[i] != bracket.Item1[0]) continue;
                    matchingBracket = bracket.Item2;
                    return i;
                }
            }

            return -1;
        }
    }

    /// <summary>
    /// Tokenize by looking for known anime identifiers
    /// </summary>
    /// <param name="enclosed">whether or not the current <code>range</code> is enclosed in braces</param>
    /// <param name="range">the token range</param>
    private void TokenizeByPreIdentified(bool enclosed, TokenRange range)
    {
        var preIdentifiedTokens = new List<TokenRange>();

        // Find known anime identifiers
        KeywordManager.PeekAndAdd(_filename, range, _elements, preIdentifiedTokens);

        int offset = range.Offset;
        var subRange = new TokenRange(range.Offset, 0);
        while (offset < range.Offset + range.Size)
        {
            foreach (var preIdentifiedToken in preIdentifiedTokens)
            {
                if (offset != preIdentifiedToken.Offset) continue;

                // Flush the text collected before the identifier.
                if (subRange.Size > 0)
                {
                    TokenizeByDelimiters(enclosed, subRange);
                }

                AddToken(Token.TokenCategory.Identifier, enclosed, preIdentifiedToken);
                subRange.Offset = preIdentifiedToken.Offset + preIdentifiedToken.Size;
                offset = subRange.Offset - 1; // It's going to be incremented below
            }

            subRange.Size = ++offset - subRange.Offset;
        }

        // Either there was no preidentified token range, or we're now about to process the tail of our current range
        if (subRange.Size > 0)
        {
            TokenizeByDelimiters(enclosed, subRange);
        }
    }

    /// <summary>
    /// Tokenize by delimiters allowed in <see cref="Options"/>.AllowedDelimiters.
    /// </summary>
    /// <param name="enclosed">whether or not the current <code>range</code> is enclosed in braces</param>
    /// <param name="range">the token range</param>
    private void TokenizeByDelimiters(bool enclosed, TokenRange range)
    {
        string delimiters = GetDelimiters(range);

        // No delimiter inside the range: the whole range is one unknown token.
        if (string.IsNullOrEmpty(delimiters))
        {
            AddToken(Token.TokenCategory.Unknown, enclosed, range);
            return;
        }

        for (int i = range.Offset, end = range.Offset + range.Size; i < end;)
        {
            // Index of the next delimiter at or after i, or `end` if there is none.
            int found = Enumerable.Range(i, Math.Min(end, _filename.Length) - i)
                .Where(c => delimiters.Contains(_filename[c].ToString()))
                .DefaultIfEmpty(end)
                .FirstOrDefault();

            var subRange = new TokenRange(i, found - i);
            if (subRange.Size > 0)
            {
                AddToken(Token.TokenCategory.Unknown, enclosed, subRange);
            }

            if (found != end)
            {
                AddToken(Token.TokenCategory.Delimiter, enclosed, new TokenRange(subRange.Offset + subRange.Size, 1));
                i = found + 1;
            }
            else
            {
                break;
            }
        }

        ValidateDelimiterTokens();
    }

    /// <summary>
    /// Validates tokens (make sure certain words delimited by certain tokens aren't split)
    /// </summary>
    private void ValidateDelimiterTokens()
    {
        for (int i = 0; i < _tokens.Count; i++)
        {
            var token = _tokens[i];
            if (token.Category != Token.TokenCategory.Delimiter) continue;
            char delimiter = token.Content[0];

            // prevToken is -1 and nextToken is _tokens.Count when there is no valid neighbour.
            int prevToken = Token.FindPrevToken(_tokens, i, Token.TokenFlag.FlagValid);
            int nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid);

            // Check for single-character tokens to prevent splitting group names,
            // keywords, episode numbers, etc.
            if (delimiter != ' ' && delimiter != '_')
            {
                // Single character token
                if (IsSingleCharacterToken(prevToken))
                {
                    AppendTokenTo(token, _tokens[prevToken]);

                    while (IsUnknownToken(nextToken))
                    {
                        AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);

                        // Everything merged so far is Invalid now, so searching from i again
                        // yields the first valid token after the merged run.
                        nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid);
                        if (!IsDelimiterToken(nextToken) || _tokens[nextToken].Content[0] != delimiter) continue;
                        AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);
                        nextToken = Token.FindNextToken(_tokens, nextToken, Token.TokenFlag.FlagValid);
                    }

                    continue;
                }

                // Merging needs a preceding token to merge into. Without the range check a
                // delimiter that is the first valid token (prevToken == -1) indexed _tokens[-1].
                if (IsSingleCharacterToken(nextToken) && Token.InListRange(prevToken, _tokens))
                {
                    AppendTokenTo(token, _tokens[prevToken]);
                    AppendTokenTo(_tokens[nextToken], _tokens[prevToken]);
                    continue;
                }
            }

            // Check for adjacent delimiters
            if (IsUnknownToken(prevToken) && IsDelimiterToken(nextToken))
            {
                char nextDelimiter = _tokens[nextToken].Content[0];
                if (delimiter != nextDelimiter && delimiter != ',')
                {
                    if (nextDelimiter is ' ' or '_')
                    {
                        AppendTokenTo(token, _tokens[prevToken]);
                    }
                }
            }
            else if (IsDelimiterToken(prevToken) && IsDelimiterToken(nextToken))
            {
                char prevDelimiter = _tokens[prevToken].Content[0];
                char nextDelimiter = _tokens[nextToken].Content[0];
                if (prevDelimiter == nextDelimiter && prevDelimiter != delimiter)
                {
                    token.Category = Token.TokenCategory.Unknown; // e.g. "&" in "_&_"
                }
            }

            // Check for other special cases
            if (delimiter != '&' && delimiter != '+') continue;
            if (!IsUnknownToken(prevToken) || !IsUnknownToken(nextToken)) continue;
            if (!StringHelper.IsNumericString(_tokens[prevToken].Content)
                || !StringHelper.IsNumericString(_tokens[nextToken].Content)) continue;
            AppendTokenTo(token, _tokens[prevToken]);
            AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); // e.g. 01+02
        }

        // Remove invalid tokens
        _tokens.RemoveAll(token => token.Category == Token.TokenCategory.Invalid);
        return;

        // Moves the content of src onto the end of dest and marks src for removal.
        void AppendTokenTo(Token src, Token dest)
        {
            dest.Content += src.Content;
            src.Category = Token.TokenCategory.Invalid;
        }

        bool IsUnknownToken(int it)
        {
            return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Unknown;
        }

        bool IsSingleCharacterToken(int it)
        {
            return IsUnknownToken(it) && _tokens[it].Content.Length == 1 && _tokens[it].Content[0] != '-';
        }

        bool IsDelimiterToken(int it)
        {
            return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Delimiter;
        }
    }
}
|
Reference in New Issue
Block a user