Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Close stale issues and PRs / stale (push) Successful in 13s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s
322 lines
13 KiB
C++
322 lines
13 KiB
C++
// This file is part of Desktop App Toolkit,
// a set of libraries for developing nice desktop applications.
//
// For license and copyright information please follow this link:
// https://github.com/desktop-app/legal/blob/master/LEGAL
//
|
|
#include "spellcheck/spellcheck_utils.h"
|
|
#include "spellcheck/platform/platform_spellcheck.h"
|
|
|
|
#include <QtCore/QStringList>
|
|
#include <QTextBoundaryFinder>
|
|
|
|
namespace Spellchecker {
|
|
namespace {
|
|
|
|
struct SubtagScript {
|
|
const char *subtag;
|
|
QChar::Script script;
|
|
};
|
|
|
|
// https://chromium.googlesource.com/chromium/src/+/refs/heads/master/third_party/blink/renderer/platform/text/locale_to_script_mapping.cc
|
|
|
|
std::vector<QChar::Script> SupportedScripts;
|
|
rpl::event_stream<> SupportedScriptsEventStream;
|
|
|
|
constexpr auto kFactor = 1000;
|
|
|
|
// Accent characters that IsWordSkippable() tolerates inside a word even
// when their script differs from the script of the word itself.
constexpr auto kAcuteAccentChars = {
	QChar(0x0301), // COMBINING ACUTE ACCENT (769)
	QChar(0x0341), // COMBINING ACUTE TONE MARK (833)
	// QChar(0x00B4), // ACUTE ACCENT (180), left disabled.
	QChar(0x02CA), // MODIFIER LETTER ACUTE ACCENT (714)
	QChar(0x030B), // COMBINING DOUBLE ACUTE ACCENT (779)
	QChar(0x02DD), // DOUBLE ACUTE ACCENT (733)
	QChar(0x02F6), // MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT (758)
	QChar(0x0317), // COMBINING ACUTE ACCENT BELOW (791)
	QChar(0x02CF), // MODIFIER LETTER LOW ACUTE ACCENT (719)
};
|
|
|
|
// Scripts that IsSpellcheckableScripts() rejects, so that
// UpdateSupportedScripts() never stores them in SupportedScripts.
constexpr auto kUnspellcheckableScripts = {
	QChar::Script_Katakana,
	QChar::Script_Han,
};
|
|
|
|
// Language subtag -> script table scanned linearly by LocaleToScriptCode().
// Entries are listed in the same order as the upstream mapping they were
// taken from; every subtag appears once.
constexpr SubtagScript kLocaleScriptList[] = {
	{ "aa", QChar::Script_Latin }, { "ab", QChar::Script_Cyrillic },
	{ "ady", QChar::Script_Cyrillic }, { "aeb", QChar::Script_Arabic },
	{ "af", QChar::Script_Latin }, { "ak", QChar::Script_Latin },
	{ "am", QChar::Script_Ethiopic }, { "ar", QChar::Script_Arabic },
	{ "arq", QChar::Script_Arabic }, { "ary", QChar::Script_Arabic },
	{ "arz", QChar::Script_Arabic }, { "as", QChar::Script_Bengali },
	{ "ast", QChar::Script_Latin }, { "av", QChar::Script_Cyrillic },
	{ "ay", QChar::Script_Latin }, { "az", QChar::Script_Latin },
	{ "azb", QChar::Script_Arabic }, { "ba", QChar::Script_Cyrillic },
	{ "bal", QChar::Script_Arabic }, { "be", QChar::Script_Cyrillic },
	{ "bej", QChar::Script_Arabic }, { "bg", QChar::Script_Cyrillic },
	{ "bi", QChar::Script_Latin }, { "bn", QChar::Script_Bengali },
	{ "bo", QChar::Script_Tibetan }, { "bqi", QChar::Script_Arabic },
	{ "brh", QChar::Script_Arabic }, { "bs", QChar::Script_Latin },
	{ "ca", QChar::Script_Latin }, { "ce", QChar::Script_Cyrillic },
	{ "ceb", QChar::Script_Latin }, { "ch", QChar::Script_Latin },
	{ "chk", QChar::Script_Latin }, { "cja", QChar::Script_Arabic },
	{ "cjm", QChar::Script_Arabic }, { "ckb", QChar::Script_Arabic },
	{ "cs", QChar::Script_Latin }, { "cy", QChar::Script_Latin },
	{ "da", QChar::Script_Latin }, { "dcc", QChar::Script_Arabic },
	{ "de", QChar::Script_Latin }, { "doi", QChar::Script_Arabic },
	{ "dv", QChar::Script_Thaana }, { "dyo", QChar::Script_Arabic },
	{ "dz", QChar::Script_Tibetan }, { "ee", QChar::Script_Latin },
	{ "efi", QChar::Script_Latin }, { "el", QChar::Script_Greek },
	{ "en", QChar::Script_Latin }, { "es", QChar::Script_Latin },
	{ "et", QChar::Script_Latin }, { "eu", QChar::Script_Latin },
	{ "fa", QChar::Script_Arabic }, { "fi", QChar::Script_Latin },
	{ "fil", QChar::Script_Latin }, { "fj", QChar::Script_Latin },
	{ "fo", QChar::Script_Latin }, { "fr", QChar::Script_Latin },
	{ "fur", QChar::Script_Latin }, { "fy", QChar::Script_Latin },
	{ "ga", QChar::Script_Latin }, { "gaa", QChar::Script_Latin },
	{ "gba", QChar::Script_Arabic }, { "gbz", QChar::Script_Arabic },
	{ "gd", QChar::Script_Latin }, { "gil", QChar::Script_Latin },
	{ "gl", QChar::Script_Latin }, { "gjk", QChar::Script_Arabic },
	{ "gju", QChar::Script_Arabic }, { "glk", QChar::Script_Arabic },
	{ "gn", QChar::Script_Latin }, { "gsw", QChar::Script_Latin },
	{ "gu", QChar::Script_Gujarati }, { "ha", QChar::Script_Latin },
	{ "haw", QChar::Script_Latin }, { "haz", QChar::Script_Arabic },
	{ "he", QChar::Script_Hebrew }, { "hi", QChar::Script_Devanagari },
	{ "hil", QChar::Script_Latin }, { "hnd", QChar::Script_Arabic },
	{ "hno", QChar::Script_Arabic }, { "ho", QChar::Script_Latin },
	{ "hr", QChar::Script_Latin }, { "ht", QChar::Script_Latin },
	{ "hu", QChar::Script_Latin }, { "hy", QChar::Script_Armenian },
	{ "id", QChar::Script_Latin }, { "ig", QChar::Script_Latin },
	{ "ii", QChar::Script_Yi }, { "ilo", QChar::Script_Latin },
	{ "inh", QChar::Script_Cyrillic }, { "is", QChar::Script_Latin },
	{ "it", QChar::Script_Latin }, { "iu", QChar::Script_CanadianAboriginal },
	{ "ja", QChar::Script_Katakana }, // or Script_Hiragana.
	{ "jv", QChar::Script_Latin }, { "ka", QChar::Script_Georgian },
	{ "kaj", QChar::Script_Latin }, { "kam", QChar::Script_Latin },
	{ "kbd", QChar::Script_Cyrillic }, { "kha", QChar::Script_Latin },
	{ "khw", QChar::Script_Arabic }, { "kk", QChar::Script_Cyrillic },
	{ "kl", QChar::Script_Latin }, { "km", QChar::Script_Khmer },
	{ "kn", QChar::Script_Kannada }, { "ko", QChar::Script_Hangul },
	{ "kok", QChar::Script_Devanagari }, { "kos", QChar::Script_Latin },
	{ "kpe", QChar::Script_Latin }, { "krc", QChar::Script_Cyrillic },
	{ "ks", QChar::Script_Arabic }, { "ku", QChar::Script_Arabic },
	{ "kum", QChar::Script_Cyrillic }, { "kvx", QChar::Script_Arabic },
	{ "kxp", QChar::Script_Arabic }, { "ky", QChar::Script_Cyrillic },
	{ "la", QChar::Script_Latin }, { "lah", QChar::Script_Arabic },
	{ "lb", QChar::Script_Latin }, { "lez", QChar::Script_Cyrillic },
	{ "lki", QChar::Script_Arabic }, { "ln", QChar::Script_Latin },
	{ "lo", QChar::Script_Lao }, { "lrc", QChar::Script_Arabic },
	{ "lt", QChar::Script_Latin }, { "luz", QChar::Script_Arabic },
	{ "lv", QChar::Script_Latin }, { "mai", QChar::Script_Devanagari },
	{ "mdf", QChar::Script_Cyrillic }, { "mfa", QChar::Script_Arabic },
	{ "mg", QChar::Script_Latin }, { "mh", QChar::Script_Latin },
	{ "mi", QChar::Script_Latin }, { "mk", QChar::Script_Cyrillic },
	{ "ml", QChar::Script_Malayalam }, { "mn", QChar::Script_Cyrillic },
	{ "mr", QChar::Script_Devanagari }, { "ms", QChar::Script_Latin },
	{ "mt", QChar::Script_Latin }, { "mvy", QChar::Script_Arabic },
	{ "my", QChar::Script_Myanmar }, { "myv", QChar::Script_Cyrillic },
	{ "mzn", QChar::Script_Arabic }, { "na", QChar::Script_Latin },
	{ "nb", QChar::Script_Latin }, { "ne", QChar::Script_Devanagari },
	{ "niu", QChar::Script_Latin }, { "nl", QChar::Script_Latin },
	{ "nn", QChar::Script_Latin }, { "nr", QChar::Script_Latin },
	{ "nso", QChar::Script_Latin }, { "ny", QChar::Script_Latin },
	{ "oc", QChar::Script_Latin }, { "om", QChar::Script_Latin },
	{ "or", QChar::Script_Oriya }, { "os", QChar::Script_Cyrillic },
	{ "pa", QChar::Script_Gurmukhi }, { "pag", QChar::Script_Latin },
	{ "pap", QChar::Script_Latin }, { "pau", QChar::Script_Latin },
	{ "pl", QChar::Script_Latin }, { "pon", QChar::Script_Latin },
	{ "prd", QChar::Script_Arabic }, { "prs", QChar::Script_Arabic },
	{ "ps", QChar::Script_Arabic }, { "pt", QChar::Script_Latin },
	{ "qu", QChar::Script_Latin }, { "rm", QChar::Script_Latin },
	{ "rmt", QChar::Script_Arabic }, { "rn", QChar::Script_Latin },
	{ "ro", QChar::Script_Latin }, { "ru", QChar::Script_Cyrillic },
	{ "rw", QChar::Script_Latin }, { "sa", QChar::Script_Devanagari },
	{ "sah", QChar::Script_Cyrillic }, { "sat", QChar::Script_Latin },
	{ "sd", QChar::Script_Arabic }, { "sdh", QChar::Script_Arabic },
	{ "se", QChar::Script_Latin }, { "sg", QChar::Script_Latin },
	{ "shi", QChar::Script_Arabic }, { "si", QChar::Script_Sinhala },
	{ "sid", QChar::Script_Latin }, { "sk", QChar::Script_Latin },
	{ "skr", QChar::Script_Arabic }, { "sl", QChar::Script_Latin },
	{ "sm", QChar::Script_Latin }, { "so", QChar::Script_Latin },
	{ "sq", QChar::Script_Latin }, { "sr", QChar::Script_Cyrillic },
	{ "ss", QChar::Script_Latin }, { "st", QChar::Script_Latin },
	{ "su", QChar::Script_Latin }, { "sus", QChar::Script_Arabic },
	{ "sv", QChar::Script_Latin }, { "sw", QChar::Script_Latin },
	{ "swb", QChar::Script_Arabic }, { "syr", QChar::Script_Arabic },
	{ "ta", QChar::Script_Tamil }, { "te", QChar::Script_Telugu },
	{ "tet", QChar::Script_Latin }, { "tg", QChar::Script_Cyrillic },
	{ "th", QChar::Script_Thai }, { "ti", QChar::Script_Ethiopic },
	{ "tig", QChar::Script_Ethiopic }, { "tk", QChar::Script_Latin },
	{ "tkl", QChar::Script_Latin }, { "tl", QChar::Script_Latin },
	{ "tn", QChar::Script_Latin }, { "to", QChar::Script_Latin },
	{ "tpi", QChar::Script_Latin }, { "tr", QChar::Script_Latin },
	{ "trv", QChar::Script_Latin }, { "ts", QChar::Script_Latin },
	{ "tt", QChar::Script_Cyrillic }, { "ttt", QChar::Script_Arabic },
	{ "tvl", QChar::Script_Latin }, { "tw", QChar::Script_Latin },
	{ "ty", QChar::Script_Latin }, { "tyv", QChar::Script_Cyrillic },
	{ "udm", QChar::Script_Cyrillic }, { "ug", QChar::Script_Arabic },
	{ "uk", QChar::Script_Cyrillic }, { "und", QChar::Script_Latin },
	{ "ur", QChar::Script_Arabic }, { "uz", QChar::Script_Cyrillic },
	{ "ve", QChar::Script_Latin }, { "vi", QChar::Script_Latin },
	{ "wal", QChar::Script_Ethiopic }, { "war", QChar::Script_Latin },
	{ "wo", QChar::Script_Latin }, { "xh", QChar::Script_Latin },
	{ "yap", QChar::Script_Latin }, { "yo", QChar::Script_Latin },
	{ "za", QChar::Script_Latin }, { "zdj", QChar::Script_Arabic },
	{ "zh", QChar::Script_Han }, { "zu", QChar::Script_Latin },

	// Encompassed languages within the Chinese macrolanguage.
	// http://www-01.sil.org/iso639-3/documentation.asp?id=zho
	// http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html
	// The upstream entries below are kept for reference only; they are
	// not part of the table.
	// {"cdo", USCRIPT_SIMPLIFIED_HAN},
	// {"cjy", USCRIPT_SIMPLIFIED_HAN},
	// {"cmn", USCRIPT_SIMPLIFIED_HAN},
	// {"cpx", USCRIPT_SIMPLIFIED_HAN},
	// {"czh", USCRIPT_SIMPLIFIED_HAN},
	// {"czo", USCRIPT_SIMPLIFIED_HAN},
	// {"gan", USCRIPT_SIMPLIFIED_HAN},
	// {"hsn", USCRIPT_SIMPLIFIED_HAN},
	// {"mnp", USCRIPT_SIMPLIFIED_HAN},
	// {"wuu", USCRIPT_SIMPLIFIED_HAN},
	// {"hak", USCRIPT_TRADITIONAL_HAN},
	// {"lzh", USCRIPT_TRADITIONAL_HAN},
	// {"nan", USCRIPT_TRADITIONAL_HAN},
	// {"yue", USCRIPT_TRADITIONAL_HAN},
	// {"zh-cdo", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-cjy", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-cmn", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-cpx", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-czh", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-czo", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-gan", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-hsn", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-mnp", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-wuu", USCRIPT_SIMPLIFIED_HAN},
	// {"zh-hak", USCRIPT_TRADITIONAL_HAN},
	// {"zh-lzh", USCRIPT_TRADITIONAL_HAN},
	// {"zh-nan", USCRIPT_TRADITIONAL_HAN},
	// {"zh-yue", USCRIPT_TRADITIONAL_HAN},
	// // Chinese with regions. Logically, regions should be handled
	// // separately, but this works for the current purposes.
	// {"zh-hk", USCRIPT_TRADITIONAL_HAN},
	// {"zh-mo", USCRIPT_TRADITIONAL_HAN},
	// {"zh-tw", USCRIPT_TRADITIONAL_HAN},
};
|
|
|
|
inline auto IsAcuteAccentChar(const QChar &c) {
|
|
return ranges::contains(kAcuteAccentChars, c);
|
|
}
|
|
|
|
// Tells whether script `s` is absent from kUnspellcheckableScripts.
inline auto IsSpellcheckableScripts(const QChar::Script &s) {
	const auto excluded = ranges::contains(kUnspellcheckableScripts, s);
	return !excluded;
}
|
|
|
|
} // namespace
|
|
|
|
// Maps a locale name such as "en", "en_US" or "pt-BR" to the script
// registered for its language subtag in kLocaleScriptList.
//
// The language subtag is everything before the first '_' or '-'. When
// the name contains neither separator the whole string is the subtag.
// Cutting at the first separator (instead of the larger of the two
// indices) keeps names that mix both kinds, e.g. "en_US-custom",
// resolving to their language rather than to "en_US".
//
// Returns QChar::Script_Common when the subtag is not in the table.
QChar::Script LocaleToScriptCode(const QString &locale) {
	const auto underscore = locale.indexOf('_');
	const auto hyphen = locale.indexOf('-');

	// indexOf() yields -1 for a missing separator, so the minimum is only
	// meaningful when both kinds are present.
	const auto separator = (underscore < 0)
		? hyphen
		: (hyphen < 0)
		? underscore
		: std::min(underscore, hyphen);

	const auto subtag = (separator < 0)
		? locale
		: locale.left(separator);

	for (const auto &entry : kLocaleScriptList) {
		if (subtag == entry.subtag) {
			return entry.script;
		}
	}
	return QChar::Script_Common;
}
|
|
|
|
// Returns the script of the first letter found in `word`, or
// QChar::Script_Common when the word contains no letter at all.
QChar::Script WordScript(QStringView word) {
	for (const auto ch : word) {
		if (ch.isLetter()) {
			return ch.script();
		}
	}
	return QChar::Script_Common;
}
|
|
|
|
// Decides whether `word` should be left out of spellchecking.
//
// A word is skippable when any of the following holds:
//  - it is longer than kMaxWordSize;
//  - `checkSupportedScripts` is set and the word's script (see
//    WordScript()) is not in SupportedScripts;
//  - it contains a character whose script differs from the word's script
//    and which is neither an acute accent, an apostrophe nor an
//    underscore.
bool IsWordSkippable(QStringView word, bool checkSupportedScripts) {
	if (word.size() > kMaxWordSize) {
		return true;
	}

	const auto script = WordScript(word);
	if (checkSupportedScripts) {
		const auto supported = ranges::contains(SupportedScripts, script);
		if (!supported) {
			return true;
		}
	}

	const auto isForeign = [&](QChar ch) {
		if (ch.script() == script) {
			return false;
		}
		if (IsAcuteAccentChar(ch)) {
			return false;
		}
		const auto code = ch.unicode();
		// The apostrophe: Qt is patched to make it a non-separator.
		// The underscore: this is not a word separator.
		return (code != '\'') && (code != '_');
	};
	return ranges::any_of(word, isForeign);
}
|
|
|
|
void UpdateSupportedScripts(std::vector<QString> languages) {
|
|
// It should be called at least once from Platform::Spellchecker::Init().
|
|
SupportedScripts = ranges::views::all(
|
|
languages
|
|
) | ranges::views::transform(
|
|
LocaleToScriptCode
|
|
) | ranges::views::unique | ranges::views::filter(
|
|
IsSpellcheckableScripts
|
|
) | ranges::to_vector;
|
|
SupportedScriptsEventStream.fire({});
|
|
}
|
|
|
|
// Exposes the events of SupportedScriptsEventStream, which
// UpdateSupportedScripts() fires after every rebuild of the script list.
rpl::producer<> SupportedScriptsChanged() {
	return SupportedScriptsEventStream.events();
}
|
|
|
|
// Walks `text` word by word using QTextBoundaryFinder and collects a
// (start, length) pair for every word rejected by `filterCallback`,
// i.e. every word for which the callback returns false.
//
// Returns an empty list for empty text.
MisspelledWords RangesFromText(
		const QString &text,
		Fn<bool(const QString &word)> filterCallback) {
	auto result = MisspelledWords();
	if (text.isEmpty()) {
		return result;
	}

	auto finder = QTextBoundaryFinder(QTextBoundaryFinder::Word, text);

	// Moves the finder one boundary forward; true when none is left.
	const auto advanceHitsEnd = [&] {
		return (finder.toNextBoundary() == -1);
	};

	while (finder.position() < text.length()) {
		const auto atWordStart = finder.boundaryReasons().testFlag(
			QTextBoundaryFinder::StartOfItem);
		if (atWordStart) {
			const auto from = finder.position();
			const auto till = finder.toNextBoundary();
			if (till == -1) {
				break;
			}
			const auto size = till - from;
			if (size < 1) {
				// Re-examine the boundary the finder now stands on.
				continue;
			}
			if (!filterCallback(text.mid(from, size))) {
				result.push_back(std::make_pair(from, size));
			}
		}

		// Step past the current boundary: either one that is not a word
		// start, or the boundary where the word just handled ended.
		if (advanceHitsEnd()) {
			break;
		}
	}
	return result;
}
|
|
|
|
bool CheckSkipAndSpell(const QString &word) {
|
|
return !IsWordSkippable(word)
|
|
&& Platform::Spellchecker::CheckSpelling(word);
|
|
}
|
|
|
|
// Builds a QLocale from a numeric language id.
//
// Ids below kFactor are treated as a plain QLocale::Language value.
// Larger ids pack two values: the quotient by kFactor is the language
// and the remainder is the QLocale::Country.
QLocale LocaleFromLangId(int langId) {
	if (langId >= kFactor) {
		const auto language = static_cast<QLocale::Language>(
			langId / kFactor);
		const auto country = static_cast<QLocale::Country>(
			langId % kFactor);
		return QLocale(language, country);
	}
	return QLocale(static_cast<QLocale::Language>(langId));
}
|
|
|
|
} // namespace Spellchecker
|