init
Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Close stale issues and PRs / stale (push) Successful in 13s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s

This commit is contained in:
allhaileris
2026-02-16 15:50:16 +03:00
commit afb81b8278
13816 changed files with 3689732 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
result.aspell
result.hunspell
List_of_common_misspellings.txt.*
x.*

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
# Extra files shipped in the distribution tarball for the suggestion test.
# prepare2 is listed because the "single" target of Makefile.orig runs it;
# without it a distributed tree cannot run the Hunspell-only test.
EXTRA_DIST= \
	List_of_common_misspellings.txt \
	Makefile.orig \
	prepare \
	prepare2 \
	README \
	test

View File

@@ -0,0 +1,11 @@
# Driver for the suggestion test. None of the targets below produces a file
# of its own name, and a script called "test" lives in this directory, so
# they are all declared phony.
.PHONY: all single clean

# Default: run ./prepare to build the work files, then ./test to report.
all:
	./prepare
	./test

# Same as "all", but the work files are built by ./prepare2 instead.
single:
	./prepare2
	./test

# Remove the numbered work files (*.1 .. *.5) and the result.* reports.
# -f keeps "make clean" from failing when the tree is already clean.
clean:
	rm -f *.[1-5] result.*

View File

@@ -0,0 +1,16 @@
Source of the text data: Wikipedia
http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines
For testing Hunspell you need the extended en_US dictionary with phonetic table:
http://hunspell.sourceforge.net/en_US.zip
Run the test (Hunspell and Aspell):
make -f Makefile.orig
test only with Hunspell:
make -f Makefile.orig single
test with different input file and dictionaries:
INPUT=dutchlist.txt HUNSPELL=nl_NL ASPELL=nl make -f Makefile.orig

View File

@@ -0,0 +1,40 @@
#!/bin/bash
# Check common misspellings
# input file format:
# word->word1, ...
# Source: http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines
#
# Environment overrides:
#   HUNSPELL  dictionary name passed to hunspell -d (default en_US)
#   ASPELL    language passed to aspell -l          (default en_US)
#   INPUT     word list to process (default List_of_common_misspellings.txt)
# Files written: $input.1 .. $input.5, plus the scratch files x.1 and x.2.

hunspell=../../src/tools/hunspell
hlang=${HUNSPELL:-en_US}
alang=${ASPELL:-en_US}
input=${INPUT:-List_of_common_misspellings.txt}

# Field separator used by every intermediate file. It is spelled as an
# escape so that it cannot be silently turned into a space by an editor
# or a copy/paste.
tab=$'\t'

# remove bad words recognised by Hunspell as good
# ("->" becomes a tab so that the misspelling is the first tab-separated field)
sed "s/[-]>/${tab}/" "$input" | "$hunspell" -d "$hlang" -1 -L |
# remove items with dash for Aspell
grep "^[^-]*${tab}" |
# remove spaces from end of lines
sed 's/ *$//' >"$input.1"

# remove bad words recognised by Aspell as good
cut -f 1 -d "$tab" "$input.1" | aspell -l "$alang" --list |
awk 'FILENAME=="-"{a[$1]=1;next}a[$1]{print$0}' - "$input.1" |
# change commas with tabs
sed "s/, */${tab}/g" >"$input.2"

# remove lines with unrecognised suggestions (except suggestion with spaces)
# x.1: one expected suggestion per line; x.2: those rejected by either checker
cut -d "$tab" -f 2- "$input.2" | tr "\t" "\n" | grep -v ' ' >x.1
"$hunspell" -l -d "$hlang" <x.1 >x.2
aspell -l "$alang" --list <x.1 >>x.2
awk 'BEGIN{FS="\t"}
FILENAME=="-"{a[$1]=1;next}a[$2]!=1 && a[$3]!=1{print $0}' - "$input.2" <x.2 >"$input.3"

# Ask each checker for its suggestions. Empty lines and the first output
# line (presumably the version banner of -a mode) are dropped, everything
# up to the last ": " is stripped and the ", " separators become tabs.
cut -f 1 -d "$tab" "$input.3" | aspell -l "$alang" -a | grep -v '^$' | sed -n '2,$p' |
sed "s/^.*: //;s/, /${tab}/g" >"$input.4"
"$hunspell" -d "$hlang" -a -1 <"$input.3" | grep -v '^$' | sed -n '2,$p' |
sed "s/^.*: //;s/, /${tab}/g" >"$input.5"

View File

@@ -0,0 +1,30 @@
#!/bin/bash
# Check common misspellings
# input file format:
# word->word1, ...
# Source: http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines
#
# Hunspell-only variant: no aspell invocation is made here.
# Environment overrides:
#   HUNSPELL  dictionary name passed to hunspell -d (default en_US)
#   INPUT     word list to process (default List_of_common_misspellings.txt)
# Files written: $input.1, $input.2, $input.3, $input.5, x.1 and x.2.

hunspell=../../src/tools/hunspell
hlang=${HUNSPELL:-en_US}
input=${INPUT:-List_of_common_misspellings.txt}

# Field separator used by every intermediate file. It is spelled as an
# escape so that it cannot be silently turned into a space by an editor
# or a copy/paste.
tab=$'\t'

# remove bad words recognised by Hunspell as good
# ("->" becomes a tab so that the misspelling is the first tab-separated field)
sed "s/[-]>/${tab}/" "$input" | "$hunspell" -d "$hlang" -1 -L |
# remove spaces from end of lines
sed 's/ *$//' >"$input.1"

# change commas with tabs
sed "s/, */${tab}/g" "$input.1" >"$input.2"

# remove lines with unrecognised suggestions (except suggestion with spaces)
# x.1: one expected suggestion per line; x.2: those rejected by Hunspell
cut -d "$tab" -f 2- "$input.2" | tr "\t" "\n" | grep -v ' ' >x.1
"$hunspell" -l -d "$hlang" <x.1 >x.2
awk 'BEGIN{FS="\t"}
FILENAME=="-"{a[$1]=1;next}a[$2]!=1 && a[$3]!=1{print $0}' - "$input.2" <x.2 >"$input.3"

# Drop any $input.4 left over from an earlier run so that it is not mistaken
# for output of this run.
rm -f "$input.4"

# Ask Hunspell for its suggestions. Empty lines and the first output line
# (presumably the version banner of -a mode) are dropped, everything up to
# the last ": " is stripped and the ", " separators become tabs.
"$hunspell" -d "$hlang" -a -1 <"$input.3" | grep -v '^$' | sed -n '2,$p' |
sed "s/^.*: //;s/, /${tab}/g" >"$input.5"

View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Common-misspellings check.
# Each line of the word list has the form:
#   word->word1, ...
# Word list source:
# http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines

# Base name of the word list; the INPUT environment variable overrides it.
input="${INPUT:-List_of_common_misspellings.txt}"
# check SUGGESTIONS EXPECTED
#
# Compare the suggestions of a spell checker with the expected corrections
# and print a report on standard output.
#   $1  tab-separated file; row N lists the suggestions given for the word
#       in row N of $2
#   $2  tab-separated file; field 1 is the misspelling, fields 2 and 3 are
#       the expected corrections (field 3 may be absent)
# The report ends with six summary lines. Counters start at 0 so that they
# never print as blanks, and the average ranking is reported as "n/a" when
# no expected suggestion was found (the division would otherwise abort awk
# and drop the last summary line).
function check() {
cat "$1" | awk '
BEGIN { FS = "\t"; maxord = 0; max = 0; sug = 0; mis = 0; mis2 = 0; ord = 0 }

# Standard input: remember the position of every suggestion, keyed by row.
FILENAME == "-" {
    for (i = 1; i <= NF; i++) a[NR, $(i)] = i
    max = NR
    next
}

# Expected file: NR keeps counting across inputs, so NR - max is the row
# number inside this file. x1 and x2 are the positions of the expected
# corrections among the suggestions, or empty when they were not suggested.
{
    x1 = a[NR - max, $2]
    x2 = a[NR - max, $3]
    sug++
    if ($3) sug++
    if (!x1 && !x2) { mis2++; misrow = misrow "\n" $0 }
    if (!x1 || ($3 && !x2)) mis++
    ord += x1 + x2
}

END {
    print "Missed rows", misrow
    print "======================================="
    # NOTE(review): maxord is never updated above, so this line always
    # reports 0; decide what it should measure before changing it.
    print maxord, "max. suggestion for a word"
    print max, "input rows"
    print mis2, "missing rows"
    print sug, "expected suggestions"
    print mis, "missing suggestions"
    avg = (sug > mis) ? ord / (sug - mis) : "n/a"
    print avg, "average ranking"
}' - "$2"
}
# $input.4 is the Aspell suggestion file; it is present only when the
# preparation step produced it, so the Aspell report is optional.
# Expansions are quoted so that an INPUT containing spaces or glob
# characters is handled as a single file name.
if test -f "$input.4"; then
    check "$input.4" "$input.3" >result.aspell
fi
check "$input.5" "$input.3" >result.hunspell

# Show only the six summary lines that end each report.
if test -f result.aspell; then
    tail -6 result.aspell
fi
tail -6 result.hunspell