init
Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s
Close stale issues and PRs / stale (push) Has been cancelled

This commit is contained in:
allhaileris
2026-02-16 15:50:16 +03:00
commit afb81b8278
13816 changed files with 3689732 additions and 0 deletions

View File

@@ -0,0 +1,149 @@
# CI for the gcld3 Python package: run the tests, build an sdist and binary
# wheels, and publish them to PyPI when a version tag is pushed.
name: gcld3
on: [push, pull_request]
jobs:
  # Build the extension from source and run the Python tests on every
  # OS / interpreter combination in the matrix.
  test:
    name: ${{ matrix.os }}-${{matrix.python-version}}-test
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest]
        python-version: [3.6, 3.7, 3.8, pypy3]
    steps:
      - uses: actions/checkout@v2
      - name: Linux Dependencies
        if: runner.os == 'Linux'
        run: sudo apt-get install libprotobuf-dev protobuf-compiler python3-dev
      - name: MacOS Dependencies
        if: runner.os == 'macOS'
        run: brew install protobuf
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Build package
        run: |
          pip install setuptools
          python setup.py install
      - name: Test with pytest
        run: |
          pip install pytest pytest-cov
          pytest gcld3/tests/gcld3_test.py
  # Build the source distribution. The upload step sets no `name`, so it relies
  # on the action's default artifact name.
  sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        name: Install Python
        with:
          python-version: "3.8"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel
      - name: Build sdist
        run: python setup.py sdist
      - uses: actions/upload-artifact@v2
        with:
          path: dist/*.tar.gz
  # Build binary wheels with cibuildwheel; the extra matrix entry adds an
  # aarch64 Linux build, for which QEMU is set up first.
  wheel:
    name: ${{ matrix.os }},${{ matrix.arch }}-wheel
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]
        arch: [auto]
        include:
          - os: ubuntu-latest
            arch: aarch64
    steps:
      - uses: actions/checkout@v2
      - name: Set up QEMU
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v1
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: 3.8
      - name: Install cibuildwheel
        # The requirement must be quoted: unquoted, the shell treats `>` as an
        # output redirection, so the version constraint is dropped and pip's
        # output is written to a file named "=1.5.5".
        run: |
          python -m pip install "cibuildwheel>=1.5.5" auditwheel delocate
      - name: Build
        env:
          CIBW_BUILD: "cp36-* cp38-* pp36-*"
          CIBW_SKIP: "*-win32 *-manylinux_i686 pp27-* cp27-* cp35-* *-musllinux_aarch64"
          CIBW_ARCHS: ${{matrix.arch}}
          CIBW_BEFORE_BUILD_LINUX: yum -y install protobuf-devel protobuf-compiler python3-devel
          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --lib-sdir . -w {dest_dir} {wheel}"
          CIBW_BEFORE_BUILD_MACOS: brew install protobuf
          CIBW_REPAIR_WHEEL_COMMAND_MACOS: "delocate-listdeps {wheel} && delocate-wheel -w {dest_dir} -v {wheel}"
        run: |
          python -m cibuildwheel --output-dir wheelhouse
      - uses: actions/upload-artifact@v2
        with:
          path: ./wheelhouse/*.whl
  # Publish everything produced by the `wheel` and `sdist` jobs.
  pypi:
    needs: [wheel, sdist]
    runs-on: ubuntu-latest
    # upload to PyPI on every tag starting with 'v'
    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
    # alternatively, to publish when a GitHub Release is created, use the following rule:
    # if: github.event_name == 'release' && github.event.action == 'published'
    steps:
      - uses: actions/download-artifact@v2
        with:
          name: artifact
          path: dist
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install twine
      - name: Upload to test pypi
        env:
          TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
        run: |
          twine upload --repository-url https://test.pypi.org/legacy/ dist/*
      - name: Upload to pypi
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          twine upload dist/*

1
Telegram/ThirdParty/cld3/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
build

69
Telegram/ThirdParty/cld3/CMakeLists.txt vendored Normal file
View File

@@ -0,0 +1,69 @@
# Builds the cld3 library (static unless BUILD_SHARED_LIBS is set) and a few
# small command-line test executables.

# Old versions of CMake do not search for / find protobuf-lite.
# cmake_minimum_required() has to come before project() so that the policy
# defaults are already in place when the project is set up.
cmake_minimum_required(VERSION 3.9)
project(cld3)

find_package(Protobuf REQUIRED)
message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}")
message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}")
message(WARNING "Protobuf 2.5 and CLD3 seems happy together. This script does NOT check if your verison of protobuf is compatible.")
message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}")
message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so

# By default, protobuf_generate_cpp generates the pb.* files directly in the
# CMake build dir. The CLD3 sources, however, include the generated headers via
# the hard-coded path cld_3/protos/*.pb.h, so the *.pb.h files must be written
# to cld_3/protos. The custom my_protobuf_generate_cpp() takes that output
# sub-directory as its first argument:
include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake)
my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS
  src/feature_extractor.proto
  src/sentence.proto
  src/task_spec.proto
)
message(STATUS "PROTO_HDRS= ${PROTO_HDRS}")

# Build settings for every target defined in this directory. These replace raw
# "-fPIC" / "-std=c++11" flags passed through add_definitions(), which is meant
# for preprocessor definitions only and is not portable across compilers.
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # Position-independent code
set(CMAKE_CXX_STANDARD 11)              # Needed for std::to_string(), ...
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)           # -std=c++11 rather than -std=gnu++11
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)

add_library(${PROJECT_NAME}
  ${PROTO_SRCS} ${PROTO_HDRS}
  src/base.cc
  src/embedding_feature_extractor.cc
  src/embedding_network.cc
  src/feature_extractor.cc
  src/feature_extractor.h
  src/feature_types.cc
  src/fml_parser.cc
  src/language_identifier_features.cc
  src/lang_id_nn_params.cc
  src/nnet_language_identifier.cc
  src/registry.cc
  src/relevant_script_feature.cc
  src/sentence_features.cc
  src/task_context.cc
  src/task_context_params.cc
  src/unicodetext.cc
  src/utils.cc
  src/workspace.cc
  src/script_span/generated_entities.cc
  # Listed for consistency with the source list used by setup.py.
  src/script_span/generated_ulscript.cc
  src/script_span/getonescriptspan.cc
  src/script_span/getonescriptspan.h
  # getonescriptspan_test.cc is deliberately NOT part of the library: it is a
  # test program and is built as its own executable below.
  src/script_span/utf8statetable.cc
  src/script_span/offsetmap.cc
  src/script_span/text_processing.cc
  src/script_span/text_processing.h
  src/script_span/fixunicodevalue.cc
)
# Needed to include the generated pb headers; PUBLIC so that everything linking
# against cld3 (including the executables below) sees the same include path.
target_include_directories(${PROJECT_NAME} PUBLIC
  ${CMAKE_CURRENT_BINARY_DIR}
  ${Protobuf_INCLUDE_DIRS}
)

# Unit test / demo executables:
add_executable(language_identifier_main src/language_identifier_main.cc)
target_link_libraries(language_identifier_main PRIVATE cld3 ${Protobuf_LITE_LIBRARIES})

add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc)
target_link_libraries(getonescriptspan_test PRIVATE cld3 ${Protobuf_LITE_LIBRARIES})

add_executable(language_identifier_features_test src/language_identifier_features_test.cc)
target_link_libraries(language_identifier_features_test PRIVATE cld3 ${Protobuf_LITE_LIBRARIES})

View File

@@ -0,0 +1,26 @@
Want to contribute? Great! First, read this page (including the small print at
the end).
### Before you contribute
Before we can use your code, you must sign the
[Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual)
(CLA), which you can do online. The CLA is necessary mainly because you own the
copyright to your changes, even after your contribution becomes part of our
codebase, so we need your permission to use and distribute your code. We also
need to be sure of various other things—for instance that you'll tell us if you
know that your code infringes on other people's patents. You don't have to sign
the CLA until after you've submitted your code for review and a member has
approved it, but you must do it before we can put your code into our codebase.
Before you start working on a larger contribution, you should get in touch with
us first through the issue tracker with your idea so that we can help out and
possibly guide you. Coordinating up front makes it much easier to avoid
frustration later on.
### Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose.
### The small print
Contributions made by corporations are covered by a different agreement than
the one above, the
[Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate).

203
Telegram/ThirdParty/cld3/LICENSE vendored Normal file
View File

@@ -0,0 +1,203 @@
Copyright 2016 Google Inc. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2016, Google Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

13
Telegram/ThirdParty/cld3/MANIFEST.in vendored Normal file
View File

@@ -0,0 +1,13 @@
include LICENSE
include README.md
include requirements.txt
global-include *h
global-include *cc
global-include *proto
prune .github/
prune .eggs/
global-exclude *.pyc
global-exclude *.cache
global-exclude *.so
exclude src/cld_3/protos/*h
exclude src/cld_3/protos/*cc

191
Telegram/ThirdParty/cld3/README.md vendored Normal file
View File

@@ -0,0 +1,191 @@
# Compact Language Detector v3 (CLD3)
* [Model](#model)
* [Supported Languages](#supported-languages)
* [Installation](#installation)
* [Bugs and Feature Requests](#bugs-and-feature-requests)
* [Credits](#credits)
### Model
CLD3 is a neural network model for language identification. This package
contains the inference code and a trained model. The inference code
extracts character ngrams from the input text and computes the fraction
of times each of them appears. For example, as shown in the figure below,
if the input text is "banana", then one of the extracted trigrams is "ana"
and the corresponding fraction is 2/4. The ngrams are hashed down to an id
within a small range, and each id is represented by a dense embedding vector
estimated during training.
The model averages the embeddings corresponding to each ngram type according
to the fractions, and the averaged embeddings are concatenated to produce
the embedding layer. The remaining components of the network are a hidden
(Rectified linear) layer and a softmax layer.
To get a language prediction for the input text, we simply perform a forward
pass through the network.
![Figure](model.png "CLD3")
### Supported Languages
The model outputs BCP-47-style language codes, shown in the table below. For
some languages, output is differentiated by script. Language and script names
from
[Unicode CLDR](https://github.com/unicode-cldr/cldr-localenames-modern/blob/master/main/en).
Output Code | Language Name | Script Name
----------- | --------------- | ------------------------------------------
af | Afrikaans | Latin
am | Amharic | Ethiopic
ar | Arabic | Arabic
bg | Bulgarian | Cyrillic
bg-Latn | Bulgarian | Latin
bn | Bangla | Bangla
bs | Bosnian | Latin
ca | Catalan | Latin
ceb | Cebuano | Latin
co | Corsican | Latin
cs | Czech | Latin
cy | Welsh | Latin
da | Danish | Latin
de | German | Latin
el | Greek | Greek
el-Latn | Greek | Latin
en | English | Latin
eo | Esperanto | Latin
es | Spanish | Latin
et | Estonian | Latin
eu | Basque | Latin
fa | Persian | Arabic
fi | Finnish | Latin
fil | Filipino | Latin
fr | French | Latin
fy | Western Frisian | Latin
ga | Irish | Latin
gd | Scottish Gaelic | Latin
gl | Galician | Latin
gu | Gujarati | Gujarati
ha | Hausa | Latin
haw | Hawaiian | Latin
hi | Hindi | Devanagari
hi-Latn | Hindi | Latin
hmn | Hmong | Latin
hr | Croatian | Latin
ht | Haitian Creole | Latin
hu | Hungarian | Latin
hy | Armenian | Armenian
id | Indonesian | Latin
ig | Igbo | Latin
is | Icelandic | Latin
it | Italian | Latin
iw | Hebrew | Hebrew
ja | Japanese | Japanese
ja-Latn | Japanese | Latin
jv | Javanese | Latin
ka | Georgian | Georgian
kk | Kazakh | Cyrillic
km | Khmer | Khmer
kn | Kannada | Kannada
ko | Korean | Korean
ku | Kurdish | Latin
ky | Kyrgyz | Cyrillic
la | Latin | Latin
lb | Luxembourgish | Latin
lo | Lao | Lao
lt | Lithuanian | Latin
lv | Latvian | Latin
mg | Malagasy | Latin
mi | Maori | Latin
mk | Macedonian | Cyrillic
ml | Malayalam | Malayalam
mn | Mongolian | Cyrillic
mr | Marathi | Devanagari
ms | Malay | Latin
mt | Maltese | Latin
my | Burmese | Myanmar
ne | Nepali | Devanagari
nl | Dutch | Latin
no | Norwegian | Latin
ny | Nyanja | Latin
pa | Punjabi | Gurmukhi
pl | Polish | Latin
ps | Pashto | Arabic
pt | Portuguese | Latin
ro | Romanian | Latin
ru | Russian | Cyrillic
ru-Latn | Russian | Latin
sd | Sindhi | Arabic
si | Sinhala | Sinhala
sk | Slovak | Latin
sl | Slovenian | Latin
sm | Samoan | Latin
sn | Shona | Latin
so | Somali | Latin
sq | Albanian | Latin
sr | Serbian | Cyrillic
st | Southern Sotho | Latin
su | Sundanese | Latin
sv | Swedish | Latin
sw | Swahili | Latin
ta | Tamil | Tamil
te | Telugu | Telugu
tg | Tajik | Cyrillic
th | Thai | Thai
tr | Turkish | Latin
uk | Ukrainian | Cyrillic
ur | Urdu | Arabic
uz | Uzbek | Latin
vi | Vietnamese | Latin
xh | Xhosa | Latin
yi | Yiddish | Hebrew
yo | Yoruba | Latin
zh | Chinese | Han (including Simplified and Traditional)
zh-Latn | Chinese | Latin
zu | Zulu | Latin
### Installation
CLD3 is designed to run in the Chrome browser, so it relies on code in
[Chromium](http://www.chromium.org/).
The steps for building and running the demo of the language detection model are:
- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the
Chromium repository.
- copy the code to `//third_party/cld_3`
- Uncomment `language_identifier_main` executable in `src/BUILD.gn`.
- build and run the model using the commands:
```shell
gn gen out/Default
ninja -C out/Default third_party/cld_3/src/src:language_identifier_main
out/Default/language_identifier_main
```
### Bugs and Feature Requests
Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests.
### Announcements and Discussion
For announcements regarding major updates, as well as general discussion, please subscribe to the mailing list:
[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users)
### Credits
Original authors of the code in this package include (in alphabetical order):
* Alex Salcianu
* Andy Golding
* Anton Bakalov
* Chris Alberti
* Daniel Andor
* David Weiss
* Emily Pitler
* Greg Coppola
* Jason Riesa
* Kuzman Ganchev
* Michael Ringgaard
* Nan Hua
* Ryan McDonald
* Slav Petrov
* Stefan Istrate
* Terry Koo

View File

@@ -0,0 +1 @@
from .pybind_ext import *

View File

@@ -0,0 +1,43 @@
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>
#include "../src/nnet_language_identifier.h"
namespace pybind11 {
using chrome_lang_id::NNetLanguageIdentifier;
// This is conventional.
namespace py = pybind11;
// Defines the `pybind_ext` Python extension module. It exposes two classes:
// `NNetLanguageIdentifier` (constructor, the two Find* methods and the
// class-level constants) and `Result` (the fields of
// NNetLanguageIdentifier::Result as read/write attributes).
PYBIND11_MODULE(pybind_ext, py_module) {
  using Identifier = NNetLanguageIdentifier;
  using IdentifierResult = NNetLanguageIdentifier::Result;

  py::class_<Identifier> identifier_class(py_module, "NNetLanguageIdentifier");

  // Constructor and methods; the py::arg names allow keyword arguments from
  // Python.
  identifier_class.def(py::init<const int, const int>(),
                       py::arg("min_num_bytes"), py::arg("max_num_bytes"));
  identifier_class.def("FindLanguage", &Identifier::FindLanguage,
                       py::arg("text"));
  identifier_class.def("FindTopNMostFreqLangs",
                       &Identifier::FindTopNMostFreqLangs, py::arg("text"),
                       py::arg("num_langs"));

  // Static members, exposed as read-only class attributes.
  identifier_class.def_readonly_static("kUnknown", &Identifier::kUnknown);
  identifier_class.def_readonly_static("kMinNumBytesToConsider",
                                       &Identifier::kMinNumBytesToConsider);
  identifier_class.def_readonly_static("kMaxNumBytesToConsider",
                                       &Identifier::kMaxNumBytesToConsider);
  identifier_class.def_readonly_static(
      "kMaxNumInputBytesToConsider", &Identifier::kMaxNumInputBytesToConsider);
  identifier_class.def_readonly_static("kReliabilityThreshold",
                                       &Identifier::kReliabilityThreshold);
  identifier_class.def_readonly_static("kReliabilityHrBsThreshold",
                                       &Identifier::kReliabilityHrBsThreshold);

  // Result record with its four fields as read/write attributes.
  py::class_<IdentifierResult> result_class(py_module, "Result");
  result_class.def_readwrite("language", &IdentifierResult::language);
  result_class.def_readwrite("probability", &IdentifierResult::probability);
  result_class.def_readwrite("is_reliable", &IdentifierResult::is_reliable);
  result_class.def_readwrite("proportion", &IdentifierResult::proportion);
}
} // namespace pybind11

View File

@@ -0,0 +1,43 @@
"""Tests for gcld3."""
import gcld3
import unittest
class NnetLanguageIdentifierTest(unittest.TestCase):
    """Checks the gcld3.NNetLanguageIdentifier Python bindings."""

    def testLangIdentification(self):
        """An English sentence is reported as reliable "en"."""
        identifier = gcld3.NNetLanguageIdentifier(
            min_num_bytes=0, max_num_bytes=1000)
        outcome = identifier.FindLanguage(
            text="This text is written in English.")

        self.assertEqual(outcome.language, "en")
        self.assertTrue(outcome.is_reliable)
        self.assertGreater(outcome.proportion, 0.99)
        self.assertGreater(outcome.probability, 0.90)

    def testEmptyString(self):
        """Empty input is reported as unreliable "und" with zero scores."""
        identifier = gcld3.NNetLanguageIdentifier(
            min_num_bytes=10, max_num_bytes=1000)
        outcome = identifier.FindLanguage(text="")

        self.assertEqual(outcome.language, "und")
        self.assertFalse(outcome.is_reliable)
        self.assertEqual(outcome.proportion, 0.0)
        self.assertEqual(outcome.probability, 0.00)

    def testLangsIdentification(self):
        """Mixed English/Bulgarian text yields "bg" first and "en" second."""
        identifier = gcld3.NNetLanguageIdentifier(
            min_num_bytes=0, max_num_bytes=1000)
        mixed_text = "This piece of text is in English. Този текст е на Български."
        ranked = identifier.FindTopNMostFreqLangs(text=mixed_text, num_langs=2)

        # Both entries are checked against the same thresholds, in rank order.
        for position, expected_language in enumerate(("bg", "en")):
            entry = ranked[position]
            self.assertEqual(entry.language, expected_language)
            self.assertTrue(entry.is_reliable)
            self.assertLess(entry.proportion, 0.75)
            self.assertGreater(entry.probability, 0.90)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,58 @@
# Special PROTOBUF_GENERATE_CPP which allows setting the output folder.
# From https://stackoverflow.com/users/1600278/akira-okumura
#
# Usage:
#   my_protobuf_generate_cpp(<PATH> <SRCS> <HDRS> <proto file>...)
#
#   PATH  Output directory for the generated files, relative to
#         CMAKE_CURRENT_BINARY_DIR.
#   SRCS  Name of the variable that receives the list of generated *.pb.cc
#         files (set in the caller's scope).
#   HDRS  Name of the variable that receives the list of generated *.pb.h
#         files (set in the caller's scope).
#   ARGN  The .proto files to compile; at least one is required.
#
# Reads PROTOBUF_GENERATE_CPP_APPEND_PATH, PROTOBUF_IMPORT_DIRS and
# PROTOBUF_PROTOC_EXECUTABLE from the calling scope.
function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS)
  if(NOT ARGN)
    message(SEND_ERROR "Error: MY_PROTOBUF_GENERATE_CPP() called without any proto files")
    return()
  endif()

  # Start from an empty list so that a variable of the same name in the
  # calling scope cannot leak into the protoc command line.
  set(_protobuf_include_path "")
  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
    # Create an include path for each file specified
    foreach(FIL ${ARGN})
      get_filename_component(ABS_FIL "${FIL}" ABSOLUTE)
      get_filename_component(ABS_PATH "${ABS_FIL}" PATH)
      list(FIND _protobuf_include_path "${ABS_PATH}" _contains_already)
      if(_contains_already EQUAL -1)
        list(APPEND _protobuf_include_path -I "${ABS_PATH}")
      endif()
    endforeach()
  else()
    set(_protobuf_include_path -I "${CMAKE_CURRENT_SOURCE_DIR}")
  endif()

  # Additional import directories requested by the caller.
  if(DEFINED PROTOBUF_IMPORT_DIRS)
    foreach(DIR ${PROTOBUF_IMPORT_DIRS})
      get_filename_component(ABS_PATH "${DIR}" ABSOLUTE)
      list(FIND _protobuf_include_path "${ABS_PATH}" _contains_already)
      if(_contains_already EQUAL -1)
        list(APPEND _protobuf_include_path -I "${ABS_PATH}")
      endif()
    endforeach()
  endif()

  # protoc does not create its --cpp_out directory, so make sure it exists.
  # The directory is the same for every file, hence it is created once.
  set(_protobuf_out_dir "${CMAKE_CURRENT_BINARY_DIR}/${PATH}")
  file(MAKE_DIRECTORY "${_protobuf_out_dir}")

  set(${SRCS})
  set(${HDRS})
  foreach(FIL ${ARGN})
    get_filename_component(ABS_FIL "${FIL}" ABSOLUTE)
    get_filename_component(FIL_WE "${FIL}" NAME_WE)
    set(_protobuf_gen_src "${_protobuf_out_dir}/${FIL_WE}.pb.cc")
    set(_protobuf_gen_hdr "${_protobuf_out_dir}/${FIL_WE}.pb.h")

    list(APPEND ${SRCS} "${_protobuf_gen_src}")
    list(APPEND ${HDRS} "${_protobuf_gen_hdr}")

    # Build rule: regenerate both files whenever the .proto file changes.
    add_custom_command(
      OUTPUT "${_protobuf_gen_src}"
             "${_protobuf_gen_hdr}"
      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
      ARGS --cpp_out "${_protobuf_out_dir}" ${_protobuf_include_path} "${ABS_FIL}"
      DEPENDS "${ABS_FIL}"
      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
      VERBATIM)
  endforeach()

  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  # Hand both lists back to the caller.
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()

BIN
Telegram/ThirdParty/cld3/model.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

View File

@@ -0,0 +1,3 @@
protobuf >=3.0.0
pybind11 >=2.5.0
wheel >= 0.34.2

120
Telegram/ThirdParty/cld3/setup.py vendored Normal file
View File

@@ -0,0 +1,120 @@
"""Setup utility for gcld3."""
import os
import platform
import shutil
import subprocess
import setuptools
from setuptools.command import build_ext
# Package version and name, passed to setuptools.setup() below.
__version__ = '3.0.13'
_NAME = 'gcld3'
# Build-time requirements, passed to setuptools.setup() as `setup_requires`.
REQUIREMENTS = ['pybind11 >= 2.5.0', 'wheel >= 0.34.2']
# Protocol buffer definitions that CompileProtos feeds to `protoc`.
PROTO_FILES = [
'src/feature_extractor.proto',
'src/sentence.proto',
'src/task_spec.proto',
]
# C++ sources of the `gcld3.pybind_ext` extension (sorted before use).
SRCS = [
'src/base.cc',
'src/embedding_feature_extractor.cc',
'src/embedding_network.cc',
'src/feature_extractor.cc',
'src/feature_types.cc',
'src/fml_parser.cc',
'src/lang_id_nn_params.cc',
'src/language_identifier_features.cc',
# NOTE(review): the CMake build compiles this file as a stand-alone
# executable, so it presumably defines main() -- confirm that it is meant to be
# compiled into the Python extension as well.
'src/language_identifier_main.cc',
'src/nnet_language_identifier.cc',
'src/registry.cc',
'src/relevant_script_feature.cc',
'src/sentence_features.cc',
'src/task_context.cc',
'src/task_context_params.cc',
'src/unicodetext.cc',
'src/utils.cc',
'src/workspace.cc',
'src/script_span/fixunicodevalue.cc',
'src/script_span/generated_entities.cc',
'src/script_span/generated_ulscript.cc',
'src/script_span/getonescriptspan.cc',
'src/script_span/offsetmap.cc',
'src/script_span/text_processing.cc',
'src/script_span/utf8statetable.cc',
# These CC files have to be generated by the proto buffer compiler 'protoc'
'src/cld_3/protos/feature_extractor.pb.cc',
'src/cld_3/protos/sentence.pb.cc',
'src/cld_3/protos/task_spec.pb.cc',
# pybind11 bindings
'gcld3/pybind_ext.cc',
]
class CompileProtos(build_ext.build_ext):
    """`build_ext` command that first generates the protobuf C++ sources."""

    def run(self):
        """Run `protoc` on PROTO_FILES, then perform the regular build_ext.

        Raises:
          RuntimeError: if no `protoc` executable is found on the PATH.
          subprocess.CalledProcessError: if `protoc` exits with an error.
        """
        protoc_path = shutil.which('protoc')
        if protoc_path is None:
            raise RuntimeError('Please install the proto buffer compiler.')

        # The C++ code expects the generated files under this exact
        # directory, so create it when it does not exist yet.
        proto_out_dir = 'src/cld_3/protos/'
        os.makedirs(proto_out_dir, exist_ok=True)

        protoc_command = [
            'protoc',
            f'--cpp_out={proto_out_dir}',
            '--proto_path=src',
            *PROTO_FILES,
        ]
        subprocess.run(protoc_command, check=True, cwd='./')

        super().run()
class PyBindIncludes:
    """Lazily resolved include path for the pybind11 headers.

    An instance stands in for the path string in `include_dirs`. pybind11 is
    imported only when the instance is converted to `str`, so this object can
    be created before pybind11 has been installed into the environment.
    """

    def __str__(self):
        """Return the directory reported by `pybind11.get_include()`."""
        # Imported here on purpose: the package may not be importable yet when
        # this module is first loaded.
        import pybind11  # pylint: disable=g-import-not-at-top
        include_dir = pybind11.get_include()
        return include_dir
MACOS = platform.system() == 'Darwin'

# On macOS the extension is compiled as C++11 against libc++; every other
# platform gets no extra compiler or linker flags.
if MACOS:
  _EXTRA_COMPILE_ARGS = ['-std=c++11', '-stdlib=libc++']
  _EXTRA_LINK_ARGS = ['-stdlib=libc++']
else:
  _EXTRA_COMPILE_ARGS = []
  _EXTRA_LINK_ARGS = []

# The single native module: all C++ sources plus the pybind11 bindings, linked
# against the protobuf runtime library.
_PYBIND_EXTENSION = setuptools.Extension(
    'gcld3.pybind_ext',
    sorted(SRCS),
    # Resolved lazily: pybind11 may not be importable yet at this point.
    include_dirs=[PyBindIncludes()],
    libraries=['protobuf'],
    extra_compile_args=_EXTRA_COMPILE_ARGS,
    extra_link_args=_EXTRA_LINK_ARGS,
    language='c++',
)

ext_modules = [_PYBIND_EXTENSION]
# Long description shown on the package index page.
DESCRIPTION = (
    'CLD3 is a neural network model for language identification.\n'
    'This package contains the inference code and a trained model. See\n'
    'https://github.com/google/cld3 for more details.\n'
)

setuptools.setup(
    # Identity.
    name=_NAME,
    version=__version__,
    url='https://github.com/google/cld3',
    author='Rami Al-Rfou',
    author_email='rmyeid@google.com',
    description='CLD3 is a neural network model for language identification.',
    long_description=DESCRIPTION,
    # Contents.
    packages=setuptools.find_packages(),
    ext_modules=ext_modules,
    zip_safe=False,
    # Build configuration: generate the protobuf sources before compiling.
    setup_requires=REQUIREMENTS,
    cmdclass={'build_ext': CompileProtos},
)

133
Telegram/ThirdParty/cld3/src/BUILD.gn vendored Normal file
View File

@@ -0,0 +1,133 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#==============================================================================
import("//third_party/protobuf/proto_library.gni")
# C++ code generated from the CLD3 protocol buffer definitions.
proto_library("protos") {
  proto_out_dir = "cld_3/protos"
  sources = [
    "feature_extractor.proto",
    "sentence.proto",
    "task_spec.proto",
  ]
}
# The CLD3 language identification library.
static_library("cld_3") {
  # Sources that live next to this build file.
  sources = [
    "base.cc",
    "base.h",
    "casts.h",
    "embedding_feature_extractor.cc",
    "embedding_feature_extractor.h",
    "embedding_network.cc",
    "embedding_network.h",
    "embedding_network_params.h",
    "feature_extractor.cc",
    "feature_extractor.h",
    "feature_types.cc",
    "feature_types.h",
    "float16.h",
    "fml_parser.cc",
    "fml_parser.h",
    "language_identifier_features.cc",
    "language_identifier_features.h",
    "lang_id_nn_params.cc",
    "lang_id_nn_params.h",
    "nnet_language_identifier.cc",
    "nnet_language_identifier.h",
    "registry.cc",
    "registry.h",
    "relevant_script_feature.cc",
    "relevant_script_feature.h",
    "script_detector.h",
    "sentence_features.cc",
    "sentence_features.h",
    "simple_adder.h",
    "task_context.cc",
    "task_context.h",
    "task_context_params.cc",
    "task_context_params.h",
    "unicodetext.cc",
    "unicodetext.h",
    "utils.cc",
    "utils.h",
    "workspace.cc",
    "workspace.h",
  ]

  # Sources from the script_span/ subdirectory.
  sources += [
    "script_span/fixunicodevalue.cc",
    "script_span/fixunicodevalue.h",
    "script_span/generated_entities.cc",
    "script_span/generated_ulscript.cc",
    "script_span/generated_ulscript.h",
    "script_span/getonescriptspan.cc",
    "script_span/getonescriptspan.h",
    "script_span/integral_types.h",
    "script_span/offsetmap.cc",
    "script_span/offsetmap.h",
    "script_span/port.h",
    "script_span/stringpiece.h",
    "script_span/text_processing.cc",
    "script_span/text_processing.h",
    "script_span/utf8acceptinterchange.h",
    "script_span/utf8prop_lettermarkscriptnum.h",
    "script_span/utf8repl_lettermarklower.h",
    "script_span/utf8scannot_lettermarkspecial.h",
    "script_span/utf8statetable.cc",
    "script_span/utf8statetable.h",
  ]

  # Exposed to dependents: the protobuf runtime and the generated protos.
  public_deps = [
    "//third_party/protobuf:protobuf_lite",
    ":protos",
  ]
}
# The executables below are functional. Uncomment to use.
#executable("language_identifier_main") {
# sources = [
# "language_identifier_main.cc",
# ]
# deps = [
# ":cld_3",
# ]
#}
#executable("getonescriptspan_test") {
# sources = [
# "script_span/getonescriptspan_test.cc",
# ]
# deps = [
# ":cld_3",
# ]
#}
#executable("language_identifier_features_test") {
# sources = [
# "language_identifier_features_test.cc",
# ]
# deps = [
# ":cld_3",
# ]
#}
#executable("nnet_lang_id_test") {
# sources = [
# "nnet_lang_id_test.cc",
# "nnet_lang_id_test_data.cc",
# "nnet_lang_id_test_data.h",
# ]
# deps = [
# ":cld_3",
# ]
#}

4
Telegram/ThirdParty/cld3/src/DEPS vendored Normal file
View File

@@ -0,0 +1,4 @@
# checkdeps rules: a leading "+" allows includes from the named directory.
include_rules = [
  "+cld_3",
  "+script_span",
]

36
Telegram/ThirdParty/cld3/src/base.cc vendored Normal file
View File

@@ -0,0 +1,36 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "base.h"
#include <string>
#if defined(COMPILER_MSVC) || defined(_WIN32)
#include <locale>
#include <sstream>
#endif // defined(COMPILER_MSVC) || defined(_WIN32)
namespace chrome_lang_id {
// Converts a 64-bit signed integer to its decimal string representation.
//
// Two implementations exist, selected by platform. Both are expected to yield
// plain digits (with a leading '-' for negative values), independent of the
// process-wide locale.
//
// TODO(abakalov): Pick the most efficient approach.
#if defined(COMPILER_MSVC) || defined(_WIN32)
std::string Int64ToString(int64 input) {
  std::stringstream stream;
  // A default-constructed stream adopts the global C++ locale, whose numpunct
  // facet may insert digit-grouping separators (e.g. "1,234"). Pin the classic
  // "C" locale so the result matches the std::to_string() branch below.
  stream.imbue(std::locale::classic());
  stream << input;
  return stream.str();
}
#else
std::string Int64ToString(int64 input) { return std::to_string(input); }
#endif // defined(COMPILER_MSVC) || defined(_WIN32)
} // namespace chrome_lang_id

106
Telegram/ThirdParty/cld3/src/base.h vendored Normal file
View File

@@ -0,0 +1,106 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef BASE_H_
#define BASE_H_
#include <cassert>
#include <map>
#include <string>
#include <vector>
namespace chrome_lang_id {
using std::vector;
using std::string;
using std::map;
using std::pair;
typedef unsigned int uint32;
// CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) makes TypeName non-copyable. Place
// it in the class body (conventionally in the private section) and terminate
// it with a semicolon.
//
// The C++11 form is selected when the build defines LANG_CXX11 to a non-zero
// value, or when the compiler itself reports C++11 or later. MSVC keeps
// __cplusplus at 199711L unless /Zc:__cplusplus is passed, so _MSVC_LANG is
// consulted as well.
#if (defined(LANG_CXX11) && LANG_CXX11) || __cplusplus >= 201103L || \
    (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &) = delete;          \
  TypeName &operator=(const TypeName &) = delete
#else  // C++98 case follows
// Note that these C++98 implementations cannot completely disallow copying,
// as members and friends can still accidentally make elided copies without
// triggering a linker error.
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &);                   \
  TypeName &operator=(const TypeName &)
#endif  // C++11 detection
// CLD3_IMMEDIATE_CRASH() terminates the program on the spot. A definition
// supplied before this point takes precedence over the defaults below.
#if !defined(CLD3_IMMEDIATE_CRASH)
#if defined(__clang__) || defined(__GNUC__)
#define CLD3_IMMEDIATE_CRASH() __builtin_trap()
#else
// No trap builtin available: force a fault by writing through a null pointer.
#define CLD3_IMMEDIATE_CRASH() ((void)(*(volatile char *)0 = 0))
#endif
#endif  // !defined(CLD3_IMMEDIATE_CRASH)

// CLD3_CHECK(f) evaluates f exactly once and crashes if it is false.
#define CLD3_CHECK(f) ((f) ? (void)0 : CLD3_IMMEDIATE_CRASH())

// CLD3_DCHECK(f) behaves like CLD3_CHECK(f) unless NDEBUG is defined and
// DCHECK_ALWAYS_ON is not; in that case it expands to a no-op and f is not
// evaluated at all.
#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
#define CLD3_DCHECK(f) CLD3_CHECK(f)
#else
#define CLD3_DCHECK(f) ((void)0)
#endif
// Short integer type aliases used throughout the library. The names suggest
// fixed widths, but the typedefs map onto the built-in types below, so the
// usual 8/16/32/64-bit sizes are assumed rather than enforced.
// NOTE(review): the first group is hidden when SWIG is defined; the reason is
// not visible here.
#ifndef SWIG
typedef int int32;
typedef unsigned char uint8; // NOLINT
typedef unsigned short uint16; // NOLINT
// A type to represent a Unicode code-point value. As of Unicode 4.0,
// such values require up to 21 bits.
// (For type-checking on pointers, make this explicitly signed,
// and it should always be the signed version of whatever int32 is.)
typedef signed int char32;
#endif // SWIG
// 64-bit signed integer: the compiler-specific __int64 when COMPILER_MSVC is
// defined, long long otherwise.
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64; // NOLINT
#endif // COMPILER_MSVC
// CLD3_ATTRIBUTE_ALWAYS_INLINE: function attribute that requests forced
// inlining. It expands to the GCC-style attribute (gcc >= 3.1 and compilers
// that define __GNUC__ accordingly), to __forceinline when _MSC_VER is
// defined, and to nothing everywhere else.
#if defined(__GNUC__) && \
(__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
// For functions we want to force inline.
// Introduced in gcc 3.1.
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#elif defined(_MSC_VER)
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __forceinline
#else
// Other compilers will have to figure it out for themselves.
#define CLD3_ATTRIBUTE_ALWAYS_INLINE
#endif
#ifdef INTERNAL_BUILD
typedef basic_string<char> bstring;
#else
typedef std::basic_string<char> bstring;
#endif // INTERNAL_BUILD
// Converts int64 to string.
std::string Int64ToString(int64 input);
} // namespace chrome_lang_id
#endif // BASE_H_

98
Telegram/ThirdParty/cld3/src/casts.h vendored Normal file
View File

@@ -0,0 +1,98 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This code is compiled directly on many platforms, including client
// platforms like Windows, Mac, and embedded systems. Before making
// any changes here, make sure that you're not breaking any platforms.
//
#ifndef CASTS_H_
#define CASTS_H_
#include <string.h> // for memcpy
namespace chrome_lang_id {
// lang_id_bit_cast<Dest,Source> is a template function that implements the
// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
// very low-level functions like the protobuf library and fast math
// support.
//
// float f = 3.14159265358979;
// int i = lang_id_bit_cast<int32>(f);
// // i = 0x40490fdb
//
// The classical address-casting method is:
//
// // WRONG
// float f = 3.14159265358979; // WRONG
// int i = * reinterpret_cast<int*>(&f); // WRONG
//
// The address-casting method actually produces undefined behavior
// according to ISO C++ specification section 3.10 -15 -. Roughly, this
// section says: if an object in memory has one type, and a program
// accesses it with a different type, then the result is undefined
// behavior for most values of "different type".
//
// This is true for any cast syntax, either *(int*)&f or
// *reinterpret_cast<int*>(&f). And it is particularly true for
// conversions between integral lvalues and floating-point lvalues.
//
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
// that expressions with different types refer to different memory. gcc
// 4.0.1 has an optimizer that takes advantage of this. So a
// non-conforming program quietly produces wildly incorrect output.
//
// The problem is not the use of reinterpret_cast. The problem is type
// punning: holding an object in memory of one type and reading its bits
// back using a different type.
//
// The C++ standard is more subtle and complex than this, but that
// is the basic idea.
//
// Anyways ...
//
// lang_id_bit_cast<> calls memcpy() which is blessed by the standard,
// especially by the example in section 3.9 . Also, of course,
// lang_id_bit_cast<> wraps up the nasty logic in one place.
//
// Fortunately memcpy() is very fast. In optimized mode, with a
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
// code with the minimal amount of data movement. On a 32-bit system,
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
// compiles to two loads and two stores.
//
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
//
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
// is likely to surprise you.
//
// Props to Bill Gibbons for the compile time assertion technique and
// Art Komninos and Igor Tandetnik for the msvc experiments.
//
// -- mec 2005-10-17
// Returns a Dest whose object representation is a byte-for-byte copy of
// `source`. Dest and Source must have the same size (enforced at compile
// time). The bytes are transferred with memcpy() rather than by reading
// `source` through a pointer of a different type.
template <class Dest, class Source>
inline Dest lang_id_bit_cast(const Source &source) {
  static_assert(sizeof(Source) == sizeof(Dest), "Sizes do not match");
  Dest result;
  memcpy(&result, &source, sizeof(Dest));
  return result;
}
} // namespace chrome_lang_id
#endif // CASTS_H_

View File

@@ -0,0 +1,51 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "embedding_feature_extractor.h"
#include <stddef.h>
#include <vector>
#include "feature_extractor.h"
#include "feature_types.h"
#include "task_context.h"
#include "utils.h"
namespace chrome_lang_id {
// Constructs an extractor with no embeddings configured; Setup() fills in the
// actual configuration. add_strings_ is initialized explicitly so that it
// never holds an indeterminate value before Setup() runs; false matches the
// default that Setup() passes when it looks up "add_varlen_strings".
GenericEmbeddingFeatureExtractor::GenericEmbeddingFeatureExtractor()
    : add_strings_(false) {}

GenericEmbeddingFeatureExtractor::~GenericEmbeddingFeatureExtractor() {}
// Reads the embedding configuration from `context`.
//
// With P = ArgPrefix(), the following parameters are looked up:
//   P_features            ';'-separated FML strings, one per embedding space
//   P_embedding_names     ';'-separated embedding space names
//   P_embedding_dims      ';'-separated integers, one per embedding space
//   P_add_varlen_strings  boolean, defaults to false
// Missing string parameters default to "".
//
// Every member is replaced rather than extended, so calling Setup() again
// reconfigures the extractor instead of accumulating stale dimensions.
void GenericEmbeddingFeatureExtractor::Setup(TaskContext *context) {
  // Don't use version to determine how to get feature FML.
  string features_param = ArgPrefix();
  features_param += "_features";
  const string features = context->Get(features_param, "");
  const string embedding_names =
      context->Get(GetParamName("embedding_names"), "");
  const string embedding_dims =
      context->Get(GetParamName("embedding_dims"), "");

  embedding_fml_ = utils::Split(features, ';');
  add_strings_ = context->Get(GetParamName("add_varlen_strings"), false);
  embedding_names_ = utils::Split(embedding_names, ';');

  // Start from an empty list: embedding_dims_ is filled with push_back(), so
  // without this a second Setup() call would append to the previous values
  // and NumEmbeddings() would report too many embedding spaces.
  embedding_dims_.clear();
  for (const string &dim : utils::Split(embedding_dims, ';')) {
    embedding_dims_.push_back(utils::ParseUsing<int>(dim, utils::ParseInt32));
  }
}
void GenericEmbeddingFeatureExtractor::Init(TaskContext *context) {}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,182 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef EMBEDDING_FEATURE_EXTRACTOR_H_
#define EMBEDDING_FEATURE_EXTRACTOR_H_
#include <memory>
#include <string>
#include <vector>
#include "feature_extractor.h"
#include "task_context.h"
#include "workspace.h"
namespace chrome_lang_id {
// An EmbeddingFeatureExtractor manages the extraction of features for
// embedding-based models. It wraps a sequence of underlying classes of feature
// extractors, along with associated predicate maps. Each class of feature
// extractors is associated with a name, e.g., "unigrams", "bigrams".
//
// The class is split between a generic abstract version,
// GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
// signature of the ExtractFeatures method) and a typed version.
//
// The predicate maps must be initialized before use: they can be loaded using
// Read() or updated via UpdateMapsForExample.
class GenericEmbeddingFeatureExtractor {
public:
GenericEmbeddingFeatureExtractor();
virtual ~GenericEmbeddingFeatureExtractor();
// Get the prefix string to put in front of all arguments, so they don't
// conflict with other embedding models.
virtual const string ArgPrefix() const = 0;
// Sets up predicate maps and embedding space names that are common for all
// embedding based feature extractors.
virtual void Setup(TaskContext *context);
virtual void Init(TaskContext *context);
// Requests workspace for the underlying feature extractors. This is
// implemented in the typed class.
virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
// Number of predicates for the embedding at a given index (vocabulary size.)
int EmbeddingSize(int index) const {
return generic_feature_extractor(index).GetDomainSize();
}
// Returns number of embedding spaces.
int NumEmbeddings() const { return embedding_dims_.size(); }
// Returns the number of features in the embedding space.
int FeatureSize(int idx) const {
return generic_feature_extractor(idx).feature_types();
}
// Returns the dimensionality of the embedding space.
int EmbeddingDims(int index) const { return embedding_dims_[index]; }
// Accessor for embedding dims (dimensions of the embedding spaces).
const std::vector<int> &embedding_dims() const { return embedding_dims_; }
const std::vector<string> &embedding_fml() const { return embedding_fml_; }
// Get parameter name by concatenating the prefix and the original name.
string GetParamName(const string &param_name) const {
string name = ArgPrefix();
name += "_";
name += param_name;
return name;
}
protected:
// Provides the generic class with access to the templated extractors. This is
// used to get the type information out of the feature extractor without
// knowing the specific calling arguments of the extractor itself.
virtual const GenericFeatureExtractor &generic_feature_extractor(
int idx) const = 0;
private:
// Embedding space names for parameter sharing.
std::vector<string> embedding_names_;
// FML strings for each feature extractor.
std::vector<string> embedding_fml_;
// Size of each of the embedding spaces (maximum predicate id).
std::vector<int> embedding_sizes_;
// Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
std::vector<int> embedding_dims_;
// Whether or not to add string descriptions to converted examples.
bool add_strings_;
};
// Templated, object-specific implementation of the
// EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
// ARGS...> class that has the appropriate FeatureTraits() to ensure that
// locator type features work.
//
// Note: for backwards compatibility purposes, this always reads the FML spec
// from "<prefix>_features".
template <class EXTRACTOR, class OBJ, class... ARGS>
class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
public:
// Sets up all predicate maps, feature extractors, and flags.
void Setup(TaskContext *context) override {
GenericEmbeddingFeatureExtractor::Setup(context);
feature_extractors_.resize(embedding_fml().size());
for (size_t i = 0; i < embedding_fml().size(); ++i) {
feature_extractors_[i].Parse(embedding_fml()[i]);
feature_extractors_[i].Setup(context);
}
}
// Initializes resources needed by the feature extractors.
void Init(TaskContext *context) override {
GenericEmbeddingFeatureExtractor::Init(context);
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.Init(context);
}
}
// Requests workspaces from the registry. Must be called after Init(), and
// before Preprocess().
void RequestWorkspaces(WorkspaceRegistry *registry) override {
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.RequestWorkspaces(registry);
}
}
// Must be called on the object one state for each sentence, before any
// feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.Preprocess(workspaces, obj);
}
}
// Extracts features using the extractors. Note that features must already
// be initialized to the correct number of feature extractors. No predicate
// mapping is applied.
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
ARGS... args,
std::vector<FeatureVector> *features) const {
for (size_t i = 0; i < feature_extractors_.size(); ++i) {
features->at(i).clear();
feature_extractors_.at(i).ExtractFeatures(workspaces, obj, args...,
&features->at(i));
}
}
protected:
// Provides generic access to the feature extractors.
const GenericFeatureExtractor &generic_feature_extractor(
int idx) const override {
return feature_extractors_.at(idx);
}
private:
// Templated feature extractor class.
std::vector<EXTRACTOR> feature_extractors_;
};
} // namespace chrome_lang_id
#endif // EMBEDDING_FEATURE_EXTRACTOR_H_

View File

@@ -0,0 +1,197 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "embedding_network.h"
#include "base.h"
#include "embedding_network_params.h"
#include "float16.h"
#include "simple_adder.h"
namespace chrome_lang_id {
namespace {
using VectorWrapper = EmbeddingNetwork::VectorWrapper;
// Debug-build check that `matrix` holds unquantized weights, i.e. that its
// quantization type is QuantizationType::NONE.
void CheckNoQuantization(const EmbeddingNetworkParams::Matrix matrix) {
  CLD3_DCHECK(static_cast<int>(matrix.quant_type) ==
              static_cast<int>(QuantizationType::NONE));
}
// Makes *mat a row-by-row view of `source_matrix`: afterwards (*mat)[r] wraps
// the `cols` consecutive floats that form row r of the source (row-major
// layout). No weights are copied. Intended for weight matrices that are *not*
// embedding matrices, which is why the source must be unquantized.
void FillMatrixParams(const EmbeddingNetworkParams::Matrix source_matrix,
                      EmbeddingNetwork::Matrix *mat) {
  mat->resize(source_matrix.rows);
  CheckNoQuantization(source_matrix);

  const float *row_begin =
      reinterpret_cast<const float *>(source_matrix.elements);
  for (EmbeddingNetwork::VectorWrapper &row : *mat) {
    row = EmbeddingNetwork::VectorWrapper(row_begin, source_matrix.cols);
    row_begin += source_matrix.cols;
  }
}
// Computes y = weights * Relu(x) + b, where the Relu is applied only if
// `apply_relu` is true.
//
// *y is first overwritten with the bias b. Each input x[i] is then handed to
// the adder together with row i of `weights`; with Relu enabled, inputs that
// are not strictly positive are skipped. The accumulation itself is delegated
// to ScaleAdderClass and completed by Finalize().
template <typename ScaleAdderClass>
void SparseReluProductPlusBias(bool apply_relu,
                               const EmbeddingNetwork::Matrix &weights,
                               const EmbeddingNetwork::VectorWrapper &b,
                               const EmbeddingNetwork::Vector &x,
                               EmbeddingNetwork::Vector *y) {
  y->assign(b.data(), b.data() + b.size());
  ScaleAdderClass adder(y->data(), y->size());

  const int num_inputs = x.size();
  for (int row = 0; row < num_inputs; ++row) {
    const float &activation = x[row];
    const bool zeroed_by_relu = apply_relu && !(activation > 0);
    if (zeroed_by_relu) {
      continue;
    }
    adder.LazyScaleAdd(weights[row].data(), activation);
  }
  adder.Finalize();
}
} // namespace
void EmbeddingNetwork::ConcatEmbeddings(
const std::vector<FeatureVector> &feature_vectors, Vector *concat) const {
concat->resize(model_->concat_layer_size());
// "es_index" stands for "embedding space index".
for (size_t es_index = 0; es_index < feature_vectors.size(); ++es_index) {
const int concat_offset = model_->concat_offset(es_index);
const int embedding_dim = model_->embedding_dim(es_index);
const EmbeddingMatrix &embedding_matrix = embedding_matrices_[es_index];
CLD3_DCHECK(embedding_matrix.dim() == embedding_dim);
const bool is_quantized =
embedding_matrix.quant_type() != QuantizationType::NONE;
const FeatureVector &feature_vector = feature_vectors[es_index];
const int num_features = feature_vector.size();
for (int fi = 0; fi < num_features; ++fi) {
const FeatureType *feature_type = feature_vector.type(fi);
int feature_offset = concat_offset + feature_type->base() * embedding_dim;
CLD3_DCHECK(feature_offset + embedding_dim <=
static_cast<int>(concat->size()));
// Weighted embeddings will be added starting from this address.
float *concat_ptr = concat->data() + feature_offset;
// Pointer to float / uint8 weights for relevant embedding.
const void *embedding_data;
// Multiplier for each embedding weight.
float multiplier;
const FeatureValue feature_value = feature_vector.value(fi);
if (feature_type->is_continuous()) {
// Continuous features (encoded as FloatFeatureValue).
FloatFeatureValue float_feature_value(feature_value);
const int id = float_feature_value.value.id;
embedding_matrix.get_embedding(id, &embedding_data, &multiplier);
multiplier *= float_feature_value.value.weight;
} else {
// Discrete features: every present feature has implicit value 1.0.
embedding_matrix.get_embedding(feature_value, &embedding_data,
&multiplier);
}
if (is_quantized) {
const uint8 *quant_weights =
reinterpret_cast<const uint8 *>(embedding_data);
for (int i = 0; i < embedding_dim; ++i, ++quant_weights, ++concat_ptr) {
// 128 is bias for UINT8 quantization, only one we currently support.
*concat_ptr += (static_cast<int>(*quant_weights) - 128) * multiplier;
}
} else {
const float *weights = reinterpret_cast<const float *>(embedding_data);
for (int i = 0; i < embedding_dim; ++i, ++weights, ++concat_ptr) {
*concat_ptr += *weights * multiplier;
}
}
}
}
}
template <typename ScaleAdderClass>
void EmbeddingNetwork::FinishComputeFinalScores(const Vector &concat,
Vector *scores) const {
Vector h0(hidden_bias_[0].size());
SparseReluProductPlusBias<ScaleAdderClass>(false, hidden_weights_[0],
hidden_bias_[0], concat, &h0);
CLD3_DCHECK((hidden_weights_.size() == 1) || (hidden_weights_.size() == 2));
if (hidden_weights_.size() == 1) { // 1 hidden layer
SparseReluProductPlusBias<ScaleAdderClass>(true, softmax_weights_,
softmax_bias_, h0, scores);
} else if (hidden_weights_.size() == 2) { // 2 hidden layers
Vector h1(hidden_bias_[1].size());
SparseReluProductPlusBias<ScaleAdderClass>(true, hidden_weights_[1],
hidden_bias_[1], h0, &h1);
SparseReluProductPlusBias<ScaleAdderClass>(true, softmax_weights_,
softmax_bias_, h1, scores);
}
}
void EmbeddingNetwork::ComputeFinalScores(
const std::vector<FeatureVector> &features, Vector *scores) const {
Vector concat;
ConcatEmbeddings(features, &concat);
scores->resize(softmax_bias_.size());
FinishComputeFinalScores<SimpleAdder>(concat, scores);
}
// Builds the network as a set of views over the parameters exposed by
// `model`: one embedding matrix per embedding space, weights and bias for
// each hidden layer, and weights and bias for the softmax layer. The pointer
// is kept in model_ and the weight data is wrapped, not copied.
EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
    : model_(model) {
  // Embedding matrices. In debug builds, also verify that the concat offsets
  // reported by the model are the running sum of the embedding block sizes.
  int expected_offset = 0;
  for (int es = 0; es < model_->embedding_dim_size(); ++es) {
    CLD3_DCHECK(expected_offset == model_->concat_offset(es));
    expected_offset +=
        model_->embedding_dim(es) * model_->embedding_num_features(es);
    (void)expected_offset;  // Avoid compiler warning for "unused" variable.
    embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(es));
  }

  // Hidden layers: one weight matrix and one bias vector per layer.
  CLD3_DCHECK(model_->hidden_size() == model_->hidden_bias_size());
  hidden_weights_.resize(model_->hidden_size());
  hidden_bias_.resize(model_->hidden_size());
  for (int layer = 0; layer < model_->hidden_size(); ++layer) {
    FillMatrixParams(model_->GetHiddenLayerMatrix(layer),
                     &hidden_weights_[layer]);

    EmbeddingNetworkParams::Matrix layer_bias =
        model_->GetHiddenLayerBias(layer);
    CLD3_DCHECK(layer_bias.cols == 1);
    CheckNoQuantization(layer_bias);
    const float *layer_bias_values =
        reinterpret_cast<const float *>(layer_bias.elements);
    hidden_bias_[layer] = VectorWrapper(layer_bias_values, layer_bias.rows);
  }

  // Softmax layer; the model is required to have one.
  CLD3_DCHECK(model_->HasSoftmax());
  FillMatrixParams(model_->GetSoftmaxMatrix(), &softmax_weights_);

  EmbeddingNetworkParams::Matrix output_bias = model_->GetSoftmaxBias();
  CLD3_DCHECK(output_bias.cols == 1);
  CheckNoQuantization(output_bias);
  const float *output_bias_values =
      reinterpret_cast<const float *>(output_bias.elements);
  softmax_bias_ = VectorWrapper(output_bias_values, output_bias.rows);
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,186 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef EMBEDDING_NETWORK_H_
#define EMBEDDING_NETWORK_H_
#include <vector>
#include "embedding_network_params.h"
#include "feature_extractor.h"
#include "float16.h"
namespace chrome_lang_id {
// Classifier using a hand-coded feed-forward neural network.
//
// No gradient computation, just inference.
//
// Based on the more general nlp_saft::EmbeddingNetwork.
//
// Classification works as follows:
//
// Discrete features -> Embeddings -> Concatenation -> Hidden+ -> Softmax
//
// In words: given some discrete features, this class extracts the embeddings
// for these features, concatenates them, passes them through one or two hidden
// layers (each layer uses Relu) and next through a softmax layer that computes
// an unnormalized score for each possible class. Note: there is always a
// softmax layer.
//
// NOTE(salcianu): current code can easily be changed to allow more than two
// hidden layers. Feel free to do so if you have a genuine need for that.
class EmbeddingNetwork {
public:
// Read-only view of an embedding matrix. Row k is the embedding of the k-th
// vocabulary element; the number of columns is the embedding dimension. The
// weights are either floats or UINT8-quantized values with one float16 scale
// per row.
class EmbeddingMatrix {
 public:
  explicit EmbeddingMatrix(const EmbeddingNetworkParams::Matrix source_matrix)
      : rows_(source_matrix.rows),
        cols_(source_matrix.cols),
        quant_type_(source_matrix.quant_type),
        data_(source_matrix.elements),
        row_size_in_bytes_(GetRowSizeInBytes(cols_, quant_type_)),
        quant_scales_(source_matrix.quant_scales) {}

  // Vocabulary size: there is one embedding per vocabulary element.
  int size() const { return rows_; }

  // Number of weights in the embedding of a single vocabulary element.
  int dim() const { return cols_; }

  // Quantization type of the stored weights.
  QuantizationType quant_type() const { return quant_type_; }

  // Looks up the embedding of the k-th vocabulary element. On return, *data
  // points to the first weight of that embedding and *scale holds its
  // quantization scale (1.0 if the matrix is not quantized). Crashes if k is
  // outside [0, size()).
  void get_embedding(int k, const void **data, float *scale) const {
    CLD3_CHECK(k >= 0);
    CLD3_CHECK(k < size());
    const char *first_row = reinterpret_cast<const char *>(data_);
    *data = first_row + k * row_size_in_bytes_;
    if (quant_type_ != QuantizationType::NONE) {
      *scale = Float16To32(quant_scales_[k]);
    } else {
      *scale = 1.0;
    }
  }

 private:
  // Size in bytes of one row of `cols` weights stored with `quant_type`.
  static int GetRowSizeInBytes(int cols, QuantizationType quant_type) {
    CLD3_DCHECK((quant_type == QuantizationType::NONE) ||
                (quant_type == QuantizationType::UINT8));
    // Anything other than NONE is treated as QuantizationType::UINT8.
    return cols * ((quant_type == QuantizationType::NONE) ? sizeof(float)
                                                          : sizeof(uint8));
  }

  // Vocabulary size.
  int rows_;

  // Number of elements in each embedding.
  int cols_;

  QuantizationType quant_type_;

  // Embedding weights in row-major order: an array of floats or of uint8
  // values, depending on the quantization type. Not owned.
  const void *data_;

  // Size of one row in bytes; used to step from one row of data_ to the next.
  int row_size_in_bytes_;

  // Quantization scales, one per vocabulary element; nullptr if the matrix is
  // not quantized.
  const float16 *quant_scales_;
};
// Immutable, non-owning view of a run of consecutive floats, e.g. model
// weights that live in static memory. Because nothing is owned, instances can
// be copied and assigned freely.
class VectorWrapper {
 public:
  // Creates an empty view: null data pointer, size zero.
  VectorWrapper() : data_(nullptr), size_(0) {}

  // Creates a view of the `size` consecutive floats starting at `data`. The
  // floats must stay alive for at least as long as this object, which is
  // trivially the case for statically allocated data.
  VectorWrapper(const float *data, int size) : data_(data), size_(size) {}

  const float *data() const { return data_; }
  int size() const { return size_; }

 private:
  const float *data_;  // Not owned.
  int size_;
};
typedef std::vector<VectorWrapper> Matrix;
typedef std::vector<float> Vector;
// Constructs an embedding network using the parameters from model.
//
// Note: model should stay alive for at least the lifetime of this
// EmbeddingNetwork object. TODO(salcianu): remove this constraint: we should
// copy all necessary data (except, of course, the static weights) at
// construction time and use that, instead of relying on model.
explicit EmbeddingNetwork(const EmbeddingNetworkParams *model);
virtual ~EmbeddingNetwork() {}
// Runs forward computation to fill scores with unnormalized output unit
// scores. This is useful for making predictions.
void ComputeFinalScores(const std::vector<FeatureVector> &features,
Vector *scores) const;
private:
// Computes the softmax scores (prior to normalization) from the concatenated
// representation.
template <typename ScaleAdderClass>
void FinishComputeFinalScores(const Vector &concat, Vector *scores) const;
// Constructs the concatenated input embedding vector in place in output
// vector concat.
void ConcatEmbeddings(const std::vector<FeatureVector> &features,
Vector *concat) const;
// Pointer to the model object passed to the constructor. Not owned.
const EmbeddingNetworkParams *model_;
// Network parameters.
// One weight matrix for each embedding.
std::vector<EmbeddingMatrix> embedding_matrices_;
// One weight matrix and one vector of bias weights for each hidden layer.
std::vector<Matrix> hidden_weights_;
std::vector<VectorWrapper> hidden_bias_;
// Weight matrix and bias vector for the softmax layer.
Matrix softmax_weights_;
VectorWrapper softmax_bias_;
};
} // namespace chrome_lang_id
#endif // EMBEDDING_NETWORK_H_

View File

@@ -0,0 +1,285 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef EMBEDDING_NETWORK_PARAMS_H_
#define EMBEDDING_NETWORK_PARAMS_H_
#include <string>
#include "base.h"
#include "float16.h"
namespace chrome_lang_id {
enum class QuantizationType { NONE = 0, UINT8 };
// API for accessing parameters from a statically-linked EmbeddingNetworkProto.
class EmbeddingNetworkParams {
public:
virtual ~EmbeddingNetworkParams() {}
// **** High-level API.
// Simple representation of a matrix. This small struct doesn't own any
// resource and intentionally supports copy / assign, to simplify our APIs.
struct Matrix {
// Number of rows.
int rows;
// Number of columns.
int cols;
// How the values behind elements are encoded (see QuantizationType).
QuantizationType quant_type;
// Pointer to matrix elements, in row-major order
// (https://en.wikipedia.org/wiki/Row-major_order). Not owned.
const void *elements;
// Quantization scales: one scale for each row.
// NOTE(review): presumably only meaningful when quant_type is not NONE --
// confirm against callers.
const float16 *quant_scales;
};
// Returns the i-th embedding matrix, assembled from the low-level
// embeddings_*() accessors. The index is range-checked via
// CheckMatrixRange().
//
// This is the transpose of the corresponding matrix from the original proto.
Matrix GetEmbeddingMatrix(int i) const {
  CheckMatrixRange(i, embeddings_size(), "embedding matrix");

  // Query the low-level API first, then fill in the result.
  const int num_rows = embeddings_num_rows(i);
  const int num_cols = embeddings_num_cols(i);
  const void *const weights = embeddings_weights(i);
  const QuantizationType type = embeddings_quant_type(i);
  const float16 *const scales = embeddings_quant_scales(i);

  Matrix embedding;
  embedding.rows = num_rows;
  embedding.cols = num_cols;
  embedding.quant_type = type;
  embedding.elements = weights;
  embedding.quant_scales = scales;
  return embedding;
}
// Returns weight matrix for i-th hidden layer. The index is range-checked via
// CheckMatrixRange().
//
// This is the transpose of the corresponding matrix from the original proto.
// Hidden layers are never quantized: the returned matrix has quant_type NONE
// and a null quant_scales pointer.
Matrix GetHiddenLayerMatrix(int i) const {
  CheckMatrixRange(i, hidden_size(), "hidden layer");
  Matrix matrix;
  matrix.rows = hidden_num_rows(i);
  matrix.cols = hidden_num_cols(i);
  // Quantization not supported here.
  matrix.quant_type = QuantizationType::NONE;
  matrix.elements = hidden_weights(i);
  // Set explicitly so the returned struct never carries an indeterminate
  // pointer.
  matrix.quant_scales = nullptr;
  return matrix;
}
// Returns bias for i-th hidden layer. Technically a Matrix, but we expect it
// to be a row/column vector (i.e., num rows or num cols is 1). However, we
// don't CHECK for that: we just provide access to underlying data. The index
// is range-checked via CheckMatrixRange().
//
// Biases are never quantized: the returned matrix has quant_type NONE and a
// null quant_scales pointer.
Matrix GetHiddenLayerBias(int i) const {
  CheckMatrixRange(i, hidden_bias_size(), "hidden layer bias");
  Matrix matrix;
  matrix.rows = hidden_bias_num_rows(i);
  matrix.cols = hidden_bias_num_cols(i);
  // Quantization not supported here.
  matrix.quant_type = QuantizationType::NONE;
  matrix.elements = hidden_bias_weights(i);
  // Set explicitly so the returned struct never carries an indeterminate
  // pointer.
  matrix.quant_scales = nullptr;
  return matrix;
}
// Returns true if a softmax layer exists, i.e. softmax_size() reports exactly
// one instance of the optional softmax field.
bool HasSoftmax() const { return softmax_size() == 1; }
// Returns weight matrix for the softmax layer. Note: should be called only
// if HasSoftmax() is true.
//
// This is the transpose of the corresponding matrix from the original proto.
// The softmax layer is never quantized: the returned matrix has quant_type
// NONE and a null quant_scales pointer.
Matrix GetSoftmaxMatrix() const {
  CLD3_DCHECK(HasSoftmax());
  Matrix matrix;
  matrix.rows = softmax_num_rows(0);
  matrix.cols = softmax_num_cols(0);
  // Quantization not supported here.
  matrix.quant_type = QuantizationType::NONE;
  matrix.elements = softmax_weights(0);
  // Set explicitly so the returned struct never carries an indeterminate
  // pointer.
  matrix.quant_scales = nullptr;
  return matrix;
}
// Returns bias for the softmax layer. Technically a Matrix, but we expect it
// to be a row/column vector (i.e., num rows or num cols is 1). However, we
// don't CHECK for that: we just provide access to underlying data. Should be
// called only if HasSoftmax() is true.
//
// The bias is never quantized: the returned matrix has quant_type NONE and a
// null quant_scales pointer.
Matrix GetSoftmaxBias() const {
  CLD3_DCHECK(HasSoftmax());
  Matrix matrix;
  matrix.rows = softmax_bias_num_rows(0);
  matrix.cols = softmax_bias_num_cols(0);
  // Quantization not supported here.
  matrix.quant_type = QuantizationType::NONE;
  matrix.elements = softmax_bias_weights(0);
  // Set explicitly so the returned struct never carries an indeterminate
  // pointer.
  matrix.quant_scales = nullptr;
  return matrix;
}
// **** Low-level API.
//
// * Most low-level API methods are documented by giving an equivalent
// function call on proto, the original proto (of type
// EmbeddingNetworkProto) which was used to generate the C++ code.
//
// * To simplify our generation code, optional proto fields of message type
// are treated as repeated fields with 0 or 1 instances. As such, we have
// *_size() methods for such optional fields: they return 0 or 1.
//
// * "transpose(M)" denotes the transpose of a matrix M.
// ** Access methods for repeated MatrixParams embeddings.
//
// Returns proto.embeddings_size().
virtual int embeddings_size() const = 0;
// Returns number of rows of transpose(proto.embeddings(i)).
virtual int embeddings_num_rows(int i) const = 0;
// Returns number of columns of transpose(proto.embeddings(i)).
virtual int embeddings_num_cols(int i) const = 0;
// Returns pointer to elements of transpose(proto.embeddings(i)), in row-major
// order.
virtual const void *embeddings_weights(int i) const = 0;
// Returns the quantization type of the i-th embedding matrix. This default
// implementation reports QuantizationType::NONE for every i; subclasses may
// override it.
virtual QuantizationType embeddings_quant_type(int i) const {
return QuantizationType::NONE;
}
// Returns the quantization scales of the i-th embedding matrix. This default
// implementation returns nullptr for every i; subclasses may override it.
virtual const float16 *embeddings_quant_scales(int i) const {
return nullptr;
}
// ** Access methods for repeated MatrixParams hidden.
//
// Returns embedding_network_proto.hidden_size().
virtual int hidden_size() const = 0;
// Returns embedding_network_proto.hidden(i).rows().
virtual int hidden_num_rows(int i) const = 0;
// Returns embedding_network_proto.hidden(i).cols().
virtual int hidden_num_cols(int i) const = 0;
// Returns pointer to beginning of array of floats with all values from
// embedding_network_proto.hidden(i).
virtual const void *hidden_weights(int i) const = 0;
// ** Access methods for repeated MatrixParams hidden_bias.
//
// Returns proto.hidden_bias_size().
virtual int hidden_bias_size() const = 0;
// Returns number of rows of proto.hidden_bias(i).
virtual int hidden_bias_num_rows(int i) const = 0;
// Returns number of columns of proto.hidden_bias(i).
virtual int hidden_bias_num_cols(int i) const = 0;
// Returns pointer to elements of proto.hidden_bias(i), in row-major order.
virtual const void *hidden_bias_weights(int i) const = 0;
// ** Access methods for optional MatrixParams softmax.
//
// Returns 1 if proto has optional field softmax, 0 otherwise.
virtual int softmax_size() const = 0;
// Returns number of rows of transpose(proto.softmax()).
virtual int softmax_num_rows(int i) const = 0;
// Returns number of columns of transpose(proto.softmax()).
virtual int softmax_num_cols(int i) const = 0;
// Returns pointer to elements of transpose(proto.softmax()), in row-major
// order.
virtual const void *softmax_weights(int i) const = 0;
// ** Access methods for optional MatrixParams softmax_bias.
//
// Returns 1 if proto has optional field softmax_bias, 0 otherwise.
virtual int softmax_bias_size() const = 0;
// Returns number of rows of proto.softmax_bias().
virtual int softmax_bias_num_rows(int i) const = 0;
// Returns number of columns of proto.softmax_bias().
virtual int softmax_bias_num_cols(int i) const = 0;
// Returns pointer to elements of proto.softmax_bias(), in row-major order.
virtual const void *softmax_bias_weights(int i) const = 0;
// ** Access methods for repeated int32 embedding_dim.
//
// Returns proto.embedding_dim_size().
virtual int embedding_dim_size() const = 0;
// Returns proto.embedding_dim(i).
virtual int embedding_dim(int i) const = 0;
// ** Access methods for repeated int32 embedding_num_features.
//
// Returns proto.embedding_num_features_size().
virtual int embedding_num_features_size() const = 0;
// Returns proto.embedding_num_features(i).
virtual int embedding_num_features(int i) const = 0;
// ** Access methods for repeated int32 embedding_features_domain_size.
//
// Returns proto.embedding_features_domain_size_size().
virtual int embedding_features_domain_size_size() const = 0;
// Returns proto.embedding_features_domain_size(i).
virtual int embedding_features_domain_size(int i) const = 0;
// ** Access methods for repeated int32 concat_offset.
//
// Returns proto.concat_offset(i).
virtual int concat_offset(int i) const = 0;
// Returns proto.concat_offset_size().
virtual int concat_offset_size() const = 0;
// ** Access methods for concat_layer_size.
//
// Returns proto.has_concat_layer_size().
virtual bool has_concat_layer_size() const = 0;
// Returns proto.concat_layer_size().
virtual int concat_layer_size() const = 0;
// ** Access methods for is_precomputed
//
// Returns proto.has_is_precomputed().
virtual bool has_is_precomputed() const = 0;
// Returns proto.is_precomputed().
virtual bool is_precomputed() const = 0;
private:
// Verifies, via CLD3_DCHECK, that index lies in [0, num_matrices).
// description names the kind of matrix being accessed; the checks below do
// not currently use it.
void CheckMatrixRange(int index, int num_matrices,
const string &description) const {
CLD3_DCHECK(index >= 0);
CLD3_DCHECK(index < num_matrices);
}
}; // class EmbeddingNetworkParams
} // namespace chrome_lang_id
#endif // EMBEDDING_NETWORK_PARAMS_H_

View File

@@ -0,0 +1,137 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "feature_extractor.h"
#include <string>
#include "feature_types.h"
#include "fml_parser.h"
#include "utils.h"
namespace chrome_lang_id {
constexpr FeatureValue GenericFeatureFunction::kNone;
// Out-of-line constructor and destructor for FeatureVector; both are empty.
FeatureVector::FeatureVector() {}
FeatureVector::~FeatureVector() {}
// Default constructor and destructor; both are empty.
GenericFeatureExtractor::GenericFeatureExtractor() {}
GenericFeatureExtractor::~GenericFeatureExtractor() {}
// Copy constructor: copies the descriptor and the vector of feature type
// pointers. Only the pointers are copied, not the FeatureType objects.
GenericFeatureExtractor::GenericFeatureExtractor(
const GenericFeatureExtractor &extractor)
: descriptor_(extractor.descriptor_),
feature_types_(extractor.feature_types_) {}
// Parses the feature specification in source into this extractor's descriptor
// using FMLParser, then initializes the top-level feature functions from it.
void GenericFeatureExtractor::Parse(const string &source) {
// Parse feature specification into descriptor.
FMLParser parser;
parser.Parse(source, mutable_descriptor());
// Initialize feature extractor from descriptor.
InitializeFeatureFunctions();
}
void GenericFeatureExtractor::InitializeFeatureTypes() {
// Register all feature types.
GetFeatureTypes(&feature_types_);
for (size_t i = 0; i < feature_types_.size(); ++i) {
FeatureType *ft = feature_types_[i];
ft->set_base(i);
// Check for feature space overflow.
CLD3_DCHECK(ft->GetDomainSize() >= 0);
}
std::vector<string> types_names;
GetFeatureTypeNames(&types_names);
CLD3_DCHECK(feature_types_.size() == types_names.size());
}
void GenericFeatureExtractor::GetFeatureTypeNames(
std::vector<string> *type_names) const {
for (size_t i = 0; i < feature_types_.size(); ++i) {
FeatureType *ft = feature_types_[i];
type_names->push_back(ft->name());
}
}
// Returns the domain size of this extractor, defined as the largest domain
// size of any registered feature type (0 when no type is registered or none
// has a positive domain size).
FeatureValue GenericFeatureExtractor::GetDomainSize() const {
  FeatureValue largest = 0;
  for (FeatureType *type : feature_types_) {
    const FeatureValue candidate = type->GetDomainSize();
    largest = (candidate > largest) ? candidate : largest;
  }
  return largest;
}
// Returns the value of the first parameter called name in this function's
// descriptor, or an empty string when no such parameter exists.
string GenericFeatureFunction::GetParameter(const string &name) const {
  const int num_parameters = descriptor_->parameter_size();
  int index = 0;
  while (index < num_parameters) {
    if (descriptor_->parameter(index).name() == name) {
      return descriptor_->parameter(index).value();
    }
    ++index;
  }
  // Not found.
  return "";
}
// Default constructor; empty.
GenericFeatureFunction::GenericFeatureFunction() {}
// Destructor: deletes feature_type_. Deleting a null pointer is a no-op, so
// this is safe when no feature type was ever set.
GenericFeatureFunction::~GenericFeatureFunction() { delete feature_type_; }
int GenericFeatureFunction::GetIntParameter(const string &name,
int default_value) const {
string value = GetParameter(name);
return value.empty() ? default_value
: utils::ParseUsing<int>(value, utils::ParseInt32);
}
bool GenericFeatureFunction::GetBoolParameter(const string &name,
bool default_value) const {
string value = GetParameter(name);
if (value.empty()) return default_value;
if (value == "true") return true;
if (value == "false") return false;
return false;
}
// Appends this function's single feature type to types; appends nothing when
// no feature type has been set.
void GenericFeatureFunction::GetFeatureTypes(
    std::vector<FeatureType *> *types) const {
  if (feature_type_ == nullptr) {
    return;
  }
  types->push_back(feature_type_);
}
// Returns the feature type produced by this function: the registered
// single feature type if there is one; otherwise the sole entry reported by
// GetFeatureTypes(), or null when that call reports zero or several types.
FeatureType *GenericFeatureFunction::GetFeatureType() const {
  if (feature_type_ == nullptr) {
    // No single type registered: ask the (virtual) GetFeatureTypes().
    std::vector<FeatureType *> candidates;
    GetFeatureTypes(&candidates);
    return (candidates.size() == 1) ? candidates[0] : nullptr;
  }
  return feature_type_;
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,633 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Generic feature extractor for extracting features from objects. The feature
// extractor can be used for extracting features from any object. The feature
// extractor and feature function classes are template classes that have to
// be instantiated for extracting feature from a specific object type.
//
// A feature extractor consists of a hierarchy of feature functions. Each
// feature function extracts one or more feature type and value pairs from the
// object.
//
// The feature extractor has a modular design where new feature functions can be
// registered as components. The feature extractor is initialized from a
// descriptor represented by a protocol buffer. The feature extractor can also
// be initialized from a text-based source specification of the feature
// extractor. Feature specification parsers can be added as components. By
// default the feature extractor can be read from an ASCII protocol buffer or in
// a simple feature modeling language (fml).
// A feature function is invoked with a focus. Nested feature function can be
// invoked with another focus determined by the parent feature function.
#ifndef FEATURE_EXTRACTOR_H_
#define FEATURE_EXTRACTOR_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <vector>
#include "base.h"
#include "cld_3/protos/feature_extractor.pb.h"
#include "feature_types.h"
#include "registry.h"
#include "script_span/stringpiece.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"
namespace chrome_lang_id {
// TODO(djweiss) Clean this up as well.
// Use the same type for feature values as is used for predicates.
typedef int64 Predicate;
typedef Predicate FeatureValue;
// Output feature model in FML format.
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output);
void ToFML(const FeatureFunctionDescriptor &function, string *output);
// A union used to represent discrete and continuous feature values. Being a
// union, both members share the same storage: a value is viewed either as a
// plain FeatureValue or as an (id, weight) pair.
union FloatFeatureValue {
public:
// Stores v as a discrete feature value.
explicit FloatFeatureValue(FeatureValue v) : discrete_value(v) {}
// Stores a continuous feature value: feature id i with weight w.
FloatFeatureValue(uint32 i, float w) {
value.id = i;
value.weight = w;
}
// Discrete view of the value.
FeatureValue discrete_value;
// Continuous view of the value: a feature id paired with a float weight.
struct IdWeight {
uint32 id;
float weight;
} value;
};
// An ordered collection of (feature type, feature value) pairs. Not copyable.
class FeatureVector {
 public:
  FeatureVector();
  ~FeatureVector();

  // Appends a (type, value) pair to the end of the vector.
  void add(FeatureType *type, FeatureValue value) {
    features_.emplace_back(type, value);
  }

  // Drops every stored pair.
  void clear() { features_.clear(); }

  // Number of stored pairs.
  int size() const { return static_cast<int>(features_.size()); }

  // Pre-allocates storage for n pairs.
  void reserve(int n) { features_.reserve(n); }

  // Feature type of the pair at position index. No bounds checking.
  FeatureType *type(int index) const { return features_[index].type; }

  // Feature value of the pair at position index. No bounds checking.
  FeatureValue value(int index) const { return features_[index].value; }

 private:
  // One (feature type, feature value) pair.
  struct Element {
    Element() : type(nullptr), value(-1) {}
    Element(FeatureType *t, FeatureValue v) : type(t), value(v) {}

    FeatureType *type;
    FeatureValue value;
  };

  // Backing storage for the pairs, in insertion order.
  std::vector<Element> features_;

  CLD3_DISALLOW_COPY_AND_ASSIGN(FeatureVector);
};
// The generic feature extractor is the type-independent part of a feature
// extractor. This holds the descriptor for the feature extractor and the
// collection of feature types used in the feature extractor. The feature
// types are not available until FeatureExtractor<>::Init() has been called.
class GenericFeatureExtractor {
public:
GenericFeatureExtractor();
virtual ~GenericFeatureExtractor();
// Copies the descriptor and the list of feature type pointers.
GenericFeatureExtractor(const GenericFeatureExtractor &extractor);
// Initializes the feature extractor from a source representation of the
// feature extractor. The first line is used for determining the feature
// specification language. If the first line starts with #! followed by a name
// then this name is used for instantiating a feature specification parser
// with that name. If the language cannot be detected this way it falls back
// to using the default language supplied.
// NOTE(review): the definition in feature_extractor.cc always parses with
// FMLParser; confirm whether the language detection described above still
// applies.
void Parse(const string &source);
// Returns the feature extractor descriptor.
const FeatureExtractorDescriptor &descriptor() const { return descriptor_; }
// Returns a mutable pointer to the feature extractor descriptor.
FeatureExtractorDescriptor *mutable_descriptor() { return &descriptor_; }
// Returns the number of feature types in the feature extractor. Invalid
// before Init() has been called.
int feature_types() const { return feature_types_.size(); }
// Returns all feature type names used by the extractor. The names are
// appended to the type_names vector. Invalid before Init() has been called.
void GetFeatureTypeNames(std::vector<string> *type_names) const;
// Returns a feature type used in the extractor. Invalid before Init() has
// been called. index is not bounds-checked.
const FeatureType *feature_type(int index) const {
return feature_types_[index];
}
// Returns the feature domain size of this feature extractor.
// NOTE: The way that domain size is calculated is, for some, unintuitive. It
// is the largest domain size of any feature type.
FeatureValue GetDomainSize() const;
protected:
// Initializes the feature types used by the extractor. Called from
// FeatureExtractor<>::Init().
void InitializeFeatureTypes();
private:
// Initializes the top-level feature functions.
virtual void InitializeFeatureFunctions() = 0;
// Returns all feature types used by the extractor. The feature types are
// added to the result array.
virtual void GetFeatureTypes(std::vector<FeatureType *> *types) const = 0;
// Descriptor for the feature extractor. This is a protocol buffer that
// contains all the information about the feature extractor. The feature
// functions are initialized from the information in the descriptor.
FeatureExtractorDescriptor descriptor_;
// All feature types used by the feature extractor. The collection of all the
// feature types describes the feature space of the feature set produced by
// the feature extractor. Not owned.
std::vector<FeatureType *> feature_types_;
};
// The generic feature function is the type-independent part of a feature
// function. Each feature function is associated with the descriptor that it is
// instantiated from. The feature types associated with this feature function
// will be established by the time FeatureExtractor<>::Init() completes.
class GenericFeatureFunction {
public:
// A feature value that represents the absence of a value.
static constexpr FeatureValue kNone = -1;
GenericFeatureFunction();
// Deletes the owned feature type, if any.
virtual ~GenericFeatureFunction();
// Sets up the feature function. NB: FeatureTypes of nested functions are not
// guaranteed to be available until Init().
virtual void Setup(TaskContext *context) {}
// Initializes the feature function. NB: The FeatureType of this function must
// be established when this method completes.
virtual void Init(TaskContext *context) {}
// Requests workspaces from a registry to obtain indices into a WorkspaceSet
// for any Workspace objects used by this feature function. NB: This will be
// called after Init(), so it can depend on resources and arguments.
virtual void RequestWorkspaces(WorkspaceRegistry *registry) {}
// Appends the feature types produced by the feature function to types. The
// default implementation appends feature_type(), if non-null. Invalid
// before Init() has been called.
virtual void GetFeatureTypes(std::vector<FeatureType *> *types) const;
// Returns the feature type for feature produced by this feature function. If
// the feature function produces features of different types this returns
// null. Invalid before Init() has been called.
virtual FeatureType *GetFeatureType() const;
// Returns the name of the registry used for creating the feature function.
// This can be used for checking if two feature functions are of the same
// kind.
virtual const char *RegistryName() const = 0;
// Returns the value of a named parameter in the feature function's
// descriptor, or an empty string if the descriptor has no parameter with
// that name.
string GetParameter(const string &name) const;
// Returns the named parameter parsed as an int, or default_value if the
// parameter is missing or empty.
int GetIntParameter(const string &name, int default_value) const;
// Returns the named parameter as a bool, or default_value if the parameter
// is missing or empty. Only the exact value "true" yields true; any other
// non-empty value yields false.
bool GetBoolParameter(const string &name, bool default_value) const;
// Returns the FML function description for the feature function, i.e. the
// name and parameters without the nested features.
string FunctionName() const {
string output;
ToFMLFunction(*descriptor_, &output);
return output;
}
// Returns the prefix for nested feature functions. This is the prefix of this
// feature function concatenated with the feature function name.
string SubPrefix() const {
return prefix_.empty() ? FunctionName() : prefix_ + "." + FunctionName();
}
// Returns/sets the feature extractor this function belongs to.
GenericFeatureExtractor *extractor() const { return extractor_; }
void set_extractor(GenericFeatureExtractor *extractor) {
extractor_ = extractor;
}
// Returns/sets the feature function descriptor.
FeatureFunctionDescriptor *descriptor() const { return descriptor_; }
void set_descriptor(FeatureFunctionDescriptor *descriptor) {
descriptor_ = descriptor;
}
// Returns a descriptive name for the feature function. The name is taken from
// the descriptor for the feature function. If that name is empty, the name is
// the FML representation of the feature, including the prefix. In both cases
// the result is passed through utils::RemoveWhitespaceContext() before being
// returned.
string name() const {
string output;
if (descriptor_->name().empty()) {
if (!prefix_.empty()) {
output.append(prefix_);
output.append(".");
}
ToFML(*descriptor_, &output);
} else {
output = descriptor_->name();
}
StringPiece stripped(output);
utils::RemoveWhitespaceContext(&stripped);
string stripped_output(stripped.data(), stripped.size());
return stripped_output;
}
// Returns the argument from the feature function descriptor. It defaults to
// 0 if the argument has not been specified.
int argument() const {
return descriptor_->has_argument() ? descriptor_->argument() : 0;
}
// Returns/sets function name prefix.
const string &prefix() const { return prefix_; }
void set_prefix(const string &prefix) { prefix_ = prefix; }
protected:
// Returns the feature type for single-type feature functions.
FeatureType *feature_type() const { return feature_type_; }
// Sets the feature type for single-type feature functions. This takes
// ownership of feature_type. Can only be called once.
void set_feature_type(FeatureType *feature_type) {
CLD3_DCHECK(feature_type_ == nullptr);
feature_type_ = feature_type;
}
private:
// Feature extractor this feature function belongs to. Not owned.
GenericFeatureExtractor *extractor_ = nullptr;
// Descriptor for feature function. Not owned.
FeatureFunctionDescriptor *descriptor_ = nullptr;
// Feature type for features produced by this feature function. If the
// feature function produces features of multiple feature types this is null
// and the feature function must return its feature types in
// GetFeatureTypes(). Owned.
FeatureType *feature_type_ = nullptr;
// Prefix used for sub-feature types of this function.
string prefix_;
};
// Feature function that extracts features from an object. Templated on two
// type arguments:
//
// OBJ: The "object" features are extracted from, e.g. a sentence. This should
// be a plain type, not a reference or pointer.
//
// ARGS: Zero or more types used to "index" into the part of the object that
// should be examined, e.g. an int token index for a sentence object. These
// should not be reference types.
template <class OBJ, class... ARGS>
class FeatureFunction
    : public GenericFeatureFunction,
      public RegisterableClass<FeatureFunction<OBJ, ARGS...> > {
 public:
  using Self = FeatureFunction<OBJ, ARGS...>;

  // Hook for preprocessing the object; called before Evaluate() or Compute()
  // is invoked on that object. Does nothing by default.
  virtual void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {}

  // Appends the features computed from the object and focus to result. The
  // default implementation calls Compute() and adds its value unless it is
  // kNone. Multi-valued feature functions must override this method.
  virtual void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
                        ARGS... args, FeatureVector *result) const {
    const FeatureValue computed = Compute(workspaces, object, args..., result);
    if (computed == kNone) {
      return;
    }
    result->add(feature_type(), computed);
  }

  // Returns a single feature value computed from the object and focus, or
  // kNone if there is none. Single-valued feature functions only need to
  // override this method. The default implementation returns kNone.
  virtual FeatureValue Compute(const WorkspaceSet &workspaces,
                               const OBJ &object, ARGS... args,
                               const FeatureVector *fv) const {
    return kNone;
  }

  // Creates the feature function registered under fd->type() and attaches it
  // to the given extractor, descriptor and prefix.
  // NOTE(review): the result of Create() is used without a null check --
  // confirm that Create() never returns null for an unknown type.
  static Self *Instantiate(GenericFeatureExtractor *extractor,
                           FeatureFunctionDescriptor *fd,
                           const string &prefix) {
    Self *function = Self::Create(fd->type());
    function->set_extractor(extractor);
    function->set_descriptor(fd);
    function->set_prefix(prefix);
    return function;
  }

  // Returns the name of the registry for the feature function.
  const char *RegistryName() const override { return Self::registry()->name; }

 private:
  // Special feature function class for resolving variable references. The type
  // of the feature function is used for resolving the variable reference. When
  // evaluated it will either get the feature value(s) from the variable portion
  // of the feature vector, if present, or otherwise it will call the referenced
  // feature extractor function directly to extract the feature(s).
  class Reference;
};
// Base class for feature functions that contain nested feature functions. The
// nested functions have type NES, which may differ from the type of the parent
// function. NB: NestedFeatureFunction makes sure that nested functions are set
// up and initialized during Setup() and Init(); once the nested features are
// done, the parent is handled via SetupNested() and InitNested().
// Alternatively, a derived class that overrides Setup() and Init() directly
// should call Parent::Setup(), Parent::Init(), etc. first.
//
// Note: NestedFeatureFunction cannot know how to call Preprocess, Evaluate, or
// Compute, since the nested functions may be of a different type.
template <class NES, class OBJ, class... ARGS>
class NestedFeatureFunction : public FeatureFunction<OBJ, ARGS...> {
 public:
  using Parent = NestedFeatureFunction<NES, OBJ, ARGS...>;

  // Deletes the owned nested functions.
  ~NestedFeatureFunction() override { utils::STLDeleteElements(&nested_); }

  // Appends the feature types of every nested function, in declaration order.
  void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
    // A nested feature function without nested features makes no sense.
    CLD3_DCHECK(!this->nested().empty());
    for (NES *child : nested_) {
      child->GetFeatureTypes(types);
    }
  }

  // Creates the nested functions, sets each of them up, then sets up this
  // function through SetupNested().
  void Setup(TaskContext *context) override {
    CreateNested(this->extractor(), this->descriptor(), &nested_,
                 this->SubPrefix());
    for (NES *child : nested_) {
      child->Setup(context);
    }
    SetupNested(context);
  }

  // Hook for setting up this NestedFeatureFunction itself. No-op by default.
  virtual void SetupNested(TaskContext *context) {}

  // Initializes every nested function, then this function through
  // InitNested().
  void Init(TaskContext *context) override {
    for (NES *child : nested_) {
      child->Init(context);
    }
    InitNested(context);
  }

  // Hook for initializing this NestedFeatureFunction itself. No-op by default.
  virtual void InitNested(TaskContext *context) {}

  // Forwards the workspace request to every nested function.
  void RequestWorkspaces(WorkspaceRegistry *registry) override {
    for (NES *child : nested_) {
      child->RequestWorkspaces(registry);
    }
  }

  // Returns the list of nested feature functions.
  const vector<NES *> &nested() const { return nested_; }

  // Instantiates one nested feature function per sub-descriptor of fd, in
  // order, and appends each of them to functions.
  static void CreateNested(GenericFeatureExtractor *extractor,
                           FeatureFunctionDescriptor *fd,
                           vector<NES *> *functions, const string &prefix) {
    const int num_features = fd->feature_size();
    for (int index = 0; index < num_features; ++index) {
      functions->push_back(
          NES::Instantiate(extractor, fd->mutable_feature(index), prefix));
    }
  }

 protected:
  // The nested feature functions, if any, in order of declaration in the
  // feature descriptor. Owned.
  vector<NES *> nested_;
};
// Base class for a nested feature function that takes nested features with the
// same signature as these features, i.e. a meta feature. For this class, we can
// provide preprocessing of the nested features.
template <class OBJ, class... ARGS>
class MetaFeatureFunction
    : public NestedFeatureFunction<FeatureFunction<OBJ, ARGS...>, OBJ,
                                   ARGS...> {
 public:
  // Runs Preprocess() on each nested feature function, in order.
  void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
    for (FeatureFunction<OBJ, ARGS...> *nested_function : this->nested()) {
      nested_function->Preprocess(workspaces, object);
    }
  }
};
// Template for a special type of locator: The locator of type
// FeatureFunction<OBJ, ARGS...> calls nested functions of type
// FeatureFunction<OBJ, IDX, ARGS...>, where the derived class DER is
// responsible for translating by providing the following:
//
// // Gets the new additional focus.
// IDX GetFocus(const WorkspaceSet &workspaces, const OBJ &object,
// ARGS... args) const;
//
// This is useful to e.g. add a token focus to a parser state based on some
// desired property of that state.
template <class DER, class OBJ, class IDX, class... ARGS>
class FeatureAddFocusLocator
    : public NestedFeatureFunction<FeatureFunction<OBJ, IDX, ARGS...>, OBJ,
                                   ARGS...> {
 public:
  // Runs Preprocess() on each nested feature function, in order.
  void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
    for (FeatureFunction<OBJ, IDX, ARGS...> *nested_function :
         this->nested()) {
      nested_function->Preprocess(workspaces, object);
    }
  }

  // Asks the derived class for the additional focus and evaluates every nested
  // function with that focus inserted in front of the original arguments.
  void Evaluate(const WorkspaceSet &workspaces, const OBJ &object, ARGS... args,
                FeatureVector *result) const override {
    const DER *derived = static_cast<const DER *>(this);
    IDX focus = derived->GetFocus(workspaces, object, args...);
    for (FeatureFunction<OBJ, IDX, ARGS...> *nested_function :
         this->nested()) {
      nested_function->Evaluate(workspaces, object, focus, args..., result);
    }
  }

  // Computes the value of the first nested feature function only, using the
  // focus provided by the derived class.
  FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
                       ARGS... args,
                       const FeatureVector *result) const override {
    const DER *derived = static_cast<const DER *>(this);
    IDX focus = derived->GetFocus(workspaces, object, args...);
    return this->nested().front()->Compute(workspaces, object, focus, args...,
                                           result);
  }
};
// CRTP feature locator class. This is a meta feature that modifies ARGS and
// then calls the nested feature functions with the modified ARGS. Note that in
// order for this template to work correctly, all of ARGS must be types for
// which the reference operator & can be interpreted as a pointer to the
// argument. The derived class DER must implement the UpdateArgs method which
// takes pointers to the ARGS arguments:
//
// // Updates the current arguments.
// void UpdateArgs(const WorkspaceSet &workspaces, const OBJ &object,
// ARGS *...args) const;
template <class DER, class OBJ, class... ARGS>
class FeatureLocator : public MetaFeatureFunction<OBJ, ARGS...> {
 public:
  // Collects the nested feature types, after checking (CLD3_DCHECK) that the
  // locator itself has no feature type of its own.
  void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
    CLD3_DCHECK(this->feature_type() == nullptr);
    MetaFeatureFunction<OBJ, ARGS...>::GetFeatureTypes(types);
  }

  // Lets the derived class rewrite the (by-value) arguments, then evaluates
  // every nested function with the rewritten arguments.
  void Evaluate(const WorkspaceSet &workspaces, const OBJ &object, ARGS... args,
                FeatureVector *result) const override {
    const DER *derived = static_cast<const DER *>(this);
    derived->UpdateArgs(workspaces, object, &args...);
    for (FeatureFunction<OBJ, ARGS...> *nested_function : this->nested()) {
      nested_function->Evaluate(workspaces, object, args..., result);
    }
  }

  // Lets the derived class rewrite the arguments, then returns the value
  // computed by the first nested function only.
  FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
                       ARGS... args,
                       const FeatureVector *result) const override {
    const DER *derived = static_cast<const DER *>(this);
    derived->UpdateArgs(workspaces, object, &args...);
    return this->nested().front()->Compute(workspaces, object, args...,
                                           result);
  }
};
// Feature extractor for extracting features from objects of a certain class.
// Template type parameters are as defined for FeatureFunction.
template <class OBJ, class... ARGS>
class FeatureExtractor : public GenericFeatureExtractor {
public:
// Feature function type for top-level functions in the feature extractor.
typedef FeatureFunction<OBJ, ARGS...> Function;
typedef FeatureExtractor<OBJ, ARGS...> Self;
// Feature locator type for the feature extractor.
template <class DER>
using Locator = FeatureLocator<DER, OBJ, ARGS...>;
// Initializes feature extractor.
FeatureExtractor() {}
~FeatureExtractor() override { utils::STLDeleteElements(&functions_); }
// Sets up the feature extractor. Note that only top-level functions exist
// until Setup() is called. This does not take ownership over the context,
// which must outlive this.
void Setup(TaskContext *context) {
for (Function *function : functions_) function->Setup(context);
}
// Initializes the feature extractor. Must be called after Setup(). This
// does not take ownership over the context, which must outlive this.
void Init(TaskContext *context) {
for (Function *function : functions_) function->Init(context);
this->InitializeFeatureTypes();
}
// Requests workspaces from the registry. Must be called after Init(), and
// before Preprocess(). Does not take ownership over registry. This should be
// the same registry used to initialize the WorkspaceSet used in Preprocess()
// and ExtractFeatures(). NB: This is a different ordering from that used in
// SentenceFeatureRepresentation style feature computation.
void RequestWorkspaces(WorkspaceRegistry *registry) {
for (auto *function : functions_) function->RequestWorkspaces(registry);
}
// Preprocesses the object using feature functions for the phase. Must be
// called before any calls to ExtractFeatures() on that object and phase.
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {
for (Function *function : functions_) {
function->Preprocess(workspaces, object);
}
}
// Extracts features from an object with a focus. This invokes all the
// top-level feature functions in the feature extractor. Only feature
// functions belonging to the specified phase are invoked.
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const {
result->reserve(this->feature_types());
// Extract features.
for (size_t i = 0; i < functions_.size(); ++i) {
functions_[i]->Evaluate(workspaces, object, args..., result);
}
}
private:
// Creates and initializes all feature functions in the feature extractor.
void InitializeFeatureFunctions() override {
// Create all top-level feature functions.
for (int i = 0; i < descriptor().feature_size(); ++i) {
FeatureFunctionDescriptor *fd = mutable_descriptor()->mutable_feature(i);
Function *function = Function::Instantiate(this, fd, "");
functions_.push_back(function);
}
}
// Collect all feature types used in the feature extractor.
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
for (size_t i = 0; i < functions_.size(); ++i) {
functions_[i]->GetFeatureTypes(types);
}
}
// Top-level feature functions (and variables) in the feature extractor.
// Owned.
std::vector<Function *> functions_;
};
} // namespace chrome_lang_id
#endif // FEATURE_EXTRACTOR_H_

View File

@@ -0,0 +1,50 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Protocol buffers for feature extractor.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package chrome_lang_id;
// A single named parameter of a feature function, stored as a pair of strings.
message Parameter {
// Parameter name.
optional string name = 1;
// Parameter value, kept as text.
optional string value = 2;
}
// Descriptor for a feature function. Descriptors form a tree: each one may hold
// nested sub-feature descriptors in the repeated `feature` field.
message FeatureFunctionDescriptor {
// Feature function type (the only required field).
required string type = 1;
// Feature function name.
optional string name = 2;
// Default argument for feature function; 0 when not set.
optional int32 argument = 3 [default = 0];
// Named parameters for feature descriptor.
repeated Parameter parameter = 4;
// Nested sub-feature function descriptors.
// NOTE(review): field numbers 5 and 6 are not used in this message; confirm
// whether they must stay reserved before reusing them.
repeated FeatureFunctionDescriptor feature = 7;
};
// Descriptor for a feature extractor: the list of its top-level feature
// function descriptors.
message FeatureExtractorDescriptor {
// Top-level feature functions for the extractor, one entry per function.
repeated FeatureFunctionDescriptor feature = 1;
};

View File

@@ -0,0 +1,72 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "feature_types.h"
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "base.h"
namespace chrome_lang_id {
// Creates a feature type called |name| with base 0. The type is flagged as
// continuous iff its name contains the substring "continuous".
FeatureType::FeatureType(const string &name)
    : name_(name), base_(0), is_continuous_(false) {
  is_continuous_ = (name_.find("continuous") != string::npos);
}

// Nothing to release.
FeatureType::~FeatureType() = default;
// Creates a type backed by |resource| plus the extra |values|. The largest
// representable value starts at resource->NumValues() - 1 and is raised to the
// largest key found in |values|. Every extra key is expected to be at least
// resource->NumValues() (checked with CLD3_DCHECK).
template <class Resource>
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
    const string &name, const Resource *resource,
    const std::map<FeatureValue, string> &values)
    : FeatureType(name), resource_(resource), values_(values) {
  max_value_ = resource->NumValues() - 1;
  for (const auto &extra : values) {
    const FeatureValue extra_value = extra.first;
    CLD3_DCHECK(extra_value >= resource->NumValues());
    if (extra_value > max_value_) {
      max_value_ = extra_value;
    }
  }
}
// Creates a type backed by |resource| only, by delegating to the three-argument
// constructor with an empty map of extra values.
template <class Resource>
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
    const string &name, const Resource *resource)
    : ResourceBasedFeatureType(name, resource,
                               std::map<FeatureValue, string>()) {}
// Creates an enum feature type from an explicit value -> name map. The domain
// size ends up one greater than the largest key (or stays 0 for an empty map).
// Keys are expected to be non-negative (checked with CLD3_DCHECK).
EnumFeatureType::EnumFeatureType(
    const string &name, const std::map<FeatureValue, string> &value_names)
    : FeatureType(name), value_names_(value_names) {
  for (const auto &entry : value_names) {
    const FeatureValue value = entry.first;
    CLD3_DCHECK(value >= 0);
    if (value + 1 > domain_size_) {
      domain_size_ = value + 1;
    }
  }
}
// Nothing to release beyond the members' own destructors.
EnumFeatureType::~EnumFeatureType() = default;
// Returns the name registered for |value|, or "<INVALID>" when |value| is not
// a key of the value-name map.
string EnumFeatureType::GetFeatureValueName(FeatureValue value) const {
  const auto entry = value_names_.find(value);
  return entry != value_names_.end() ? entry->second : string("<INVALID>");
}
// Returns the domain size computed by the constructor: one greater than the
// largest key in the value-name map, or 0 if the map was empty.
FeatureValue EnumFeatureType::GetDomainSize() const { return domain_size_; }
} // namespace chrome_lang_id

View File

@@ -0,0 +1,158 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common feature types for parser components.
#ifndef FEATURE_TYPES_H_
#define FEATURE_TYPES_H_
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "base.h"
namespace chrome_lang_id {
// TODO(djweiss) Clean this up as well.
// Use the same type for feature values as is used for predicates: both are
// int64.
typedef int64 Predicate;
typedef Predicate FeatureValue;
// Each feature value in a feature vector has a feature type. The feature type
// is used for converting feature type and value pairs to predicate values. The
// feature type can also return names for feature values and calculate the size
// of the feature value domain. The FeatureType class is abstract and must be
// specialized for the concrete feature types.
// Abstract base class for feature types: stores the type name, its "base"
// predicate and the continuous flag, and declares the value-name and
// domain-size queries that concrete types must implement.
class FeatureType {
public:
// Initializes a feature type with the given name. The base starts at 0 and the
// type is marked continuous iff the name contains "continuous".
explicit FeatureType(const string &name);
virtual ~FeatureType();
// Converts a feature value to a name.
virtual string GetFeatureValueName(FeatureValue value) const = 0;
// Returns the size of the feature values domain.
virtual int64 GetDomainSize() const = 0;
// Returns the feature type name.
const string &name() const { return name_; }
// Returns the "base" predicate of this feature type.
Predicate base() const { return base_; }
// Sets the "base" predicate of this feature type.
void set_base(Predicate base) { base_ = base; }
// Returns true iff this feature is continuous; see FloatFeatureValue.
bool is_continuous() const { return is_continuous_; }
private:
// Feature type name.
string name_;
// "Base" feature value: i.e. a "slot" in a global ordering of features.
Predicate base_;
// See doc for is_continuous().
bool is_continuous_;
};
// Templated generic resource based feature type. This feature type delegates
// look up of feature value names to an unknown resource class, which is not
// owned. Optionally, this type can also store a mapping of extra values which
// are not in the resource.
//
// Note: this class assumes that Resource->GetFeatureValueName() will return
// successfully for values ONLY in the range [0, Resource->NumValues()) Any
// feature value not in the extra value map and not in the above range of
// Resource will result in "<INVALID>" being returned.
template <class Resource>
class ResourceBasedFeatureType : public FeatureType {
 public:
  // Creates a new type from a name, a resource object and a map of special
  // (extra) values. Extra values must be greater than or equal to
  // resource->NumValues() to avoid collisions; the constructor checks this
  // with CLD3_DCHECK.
  ResourceBasedFeatureType(const string &name, const Resource *resource,
                           const std::map<FeatureValue, string> &values);

  // Creates a new type that has no special values.
  ResourceBasedFeatureType(const string &name, const Resource *resource);

  // Returns the name of |value|. The extra-values map takes precedence; next,
  // values in [0, resource->NumValues()) are looked up in the resource; any
  // other value yields "<INVALID>".
  string GetFeatureValueName(FeatureValue value) const override {
    const auto extra = values_.find(value);
    if (extra != values_.end()) {
      return extra->second;
    }
    if (value < 0 || value >= resource_->NumValues()) {
      return "<INVALID>";
    }
    return resource_->GetFeatureValueName(value);
  }

  // Returns the number of possible values for this feature type, i.e. one more
  // than the largest value this type can take (resource values and extra
  // values combined).
  FeatureValue GetDomainSize() const override { return max_value_ + 1; }

 protected:
  // Shared resource. Not owned.
  const Resource *resource_ = nullptr;

  // Largest value this feature can take.
  FeatureValue max_value_;

  // Names of the extra feature values that are not part of the resource.
  std::map<FeatureValue, string> values_;
};
// Feature type that is defined using an explicit map from FeatureValue to
// string values. This can reduce some of the boilerplate when defining
// features that generate enum values. Example usage:
//
// class BeverageSizeFeature : public FeatureFunction<Beverage>
// enum FeatureValue { SMALL, MEDIUM, LARGE }; // values for this feature
// void Init(TaskContext *context) override {
// set_feature_type(new EnumFeatureType("beverage_size",
// {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}});
// }
// [...]
// };
// Feature type whose values and their names are given by an explicit map.
class EnumFeatureType : public FeatureType {
public:
// Creates the type from its name and a map from feature value to value name.
EnumFeatureType(const string &name,
const std::map<FeatureValue, string> &value_names);
~EnumFeatureType() override;
// Returns the feature name for a given feature value.
string GetFeatureValueName(FeatureValue value) const override;
// Returns the number of possible values for this feature type. This is one
// greater than the largest value in the value_names map.
FeatureValue GetDomainSize() const override;
protected:
// Domain size: one greater than the largest value this feature could take
// (0 until a value name is registered).
FeatureValue domain_size_ = 0;
// Names of feature values.
std::map<FeatureValue, string> value_names_;
};
} // namespace chrome_lang_id
#endif // FEATURE_TYPES_H_

58
Telegram/ThirdParty/cld3/src/float16.h vendored Normal file
View File

@@ -0,0 +1,58 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef FLOAT16_H_
#define FLOAT16_H_
#include <string.h> // for memcpy
#include "base.h"
#include "casts.h"
namespace chrome_lang_id {
// Compact 16-bit encoding of floating point numbers. This
// representation uses 1 bit for the sign, 8 bits for the exponent and
// 7 bits for the mantissa. It is assumed that floats are in IEEE 754
// format so a float16 is just bits 16-31 of a single precision float.
//
// NOTE: The IEEE floating point standard defines a float16 format that
// is different than this format (it has fewer bits of exponent and more
// bits of mantissa). We don't use that format here because conversion
// to/from 32-bit floats is more complex for that format, and the
// conversion for this format is very simple.
//
// <---------float16------------>
// s e e e e e e e e f f f f f f f f f f f f f f f f f f f f f f f
// <------------------------------float-------------------------->
// 3 3 2 2 1 1 0
// 1 0 3 2 5 4 0
// Storage type for the compact 16-bit float encoding: a plain uint16.
typedef uint16 float16;
// Converts a 32-bit float to the compact 16-bit encoding by keeping only the
// upper 16 bits of its bit pattern. The dropped mantissa bits are truncated;
// no rounding is attempted.
static inline float16 Float32To16(float f) {
  const uint32 bits = lang_id_bit_cast<uint32>(f);
  const uint32 upper_half = bits >> 16;
  return upper_half & 0xffff;
}
// Expands the compact 16-bit encoding back to a 32-bit float: the 16 bits
// become the upper half of the float's bit pattern and the new low mantissa
// bits are filled with 0 (nothing smarter is attempted).
static inline float Float16To32(float16 f) {
  // Widen to uint32 before shifting. Shifting |f| directly would promote it to
  // a signed int, and any value with the top (sign) bit set would then
  // overflow that int when shifted left by 16.
  const uint32 bits = static_cast<uint32>(f) << 16;
  return lang_id_bit_cast<float>(bits);
}
} // namespace chrome_lang_id
#endif // FLOAT16_H_

View File

@@ -0,0 +1,308 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "fml_parser.h"
#include <ctype.h>
#include <string>
#include "base.h"
#include "utils.h"
namespace chrome_lang_id {
namespace {
// Returns true iff character c can appear at the beginning of an identifier:
// a letter, '_' or '/'.
inline bool IsValidCharAtStartOfIdentifier(char c) {
  // The <ctype.h> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behavior.
  return isalpha(static_cast<unsigned char>(c)) || (c == '_') || (c == '/');
}
// Returns true iff character c can appear inside an identifier: a letter, a
// digit, '_', '-' or '/'.
inline bool IsValidCharInsideIdentifier(char c) {
  // The <ctype.h> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behavior.
  return isalnum(static_cast<unsigned char>(c)) || (c == '_') || (c == '-') ||
         (c == '/');
}
// Returns true iff character c can appear at the beginning of a number: a
// digit or a '+' / '-' sign.
inline bool IsValidCharAtStartOfNumber(char c) {
  // The <ctype.h> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behavior.
  return isdigit(static_cast<unsigned char>(c)) || (c == '+') || (c == '-');
}
// Returns true iff character c can appear inside a number: a digit or '.'.
inline bool IsValidCharInsideNumber(char c) {
  // The <ctype.h> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behavior.
  return isdigit(static_cast<unsigned char>(c)) || (c == '.');
}
} // namespace
// Construction does no work; parser state is set up by Initialize(), which
// Parse() calls.
FMLParser::FMLParser() = default;

// Nothing to release beyond the members' own destructors.
FMLParser::~FMLParser() = default;
// Copies |source| into the parser, rewinds all position and line bookkeeping
// to the start of that copy, and reads the first input item.
void FMLParser::Initialize(const string &source) {
  source_ = source;

  // All positions start at the beginning of the copied text.
  current_ = source_.begin();
  line_start_ = current_;
  item_start_ = current_;

  // Line numbers are 1-based.
  item_line_number_ = 1;
  line_number_ = 1;

  // Prime the parser with the first item.
  NextItem();
}
// Advances to the next input character. When the character being left is a
// line break, the line number is incremented and the line start is moved to
// the new position.
void FMLParser::Next() {
  const bool leaving_line_break = (CurrentChar() == '\n');
  ++current_;
  if (leaving_line_break) {
    ++line_number_;
    line_start_ = current_;
  }
}
// Reads the next input item (token) and stores its type in item_type_ and,
// for NUMBER / STRING / NAME items, its text in item_text_. White space and
// '#' comments (which run to the end of the line) are skipped first. Any
// other character becomes a single-character item whose type is the
// (positive) character code. At the end of input the item type is END.
void FMLParser::NextItem() {
  // Skip white space and comments.
  while (!eos()) {
    if (CurrentChar() == '#') {
      // Skip comment.
      while (!eos() && CurrentChar() != '\n') Next();
    } else if (isspace(static_cast<unsigned char>(CurrentChar()))) {
      // Skip whitespace. The cast is needed because <ctype.h> classifiers are
      // undefined for negative plain char values.
      while (!eos() && isspace(static_cast<unsigned char>(CurrentChar()))) {
        Next();
      }
    } else {
      break;
    }
  }

  // Record start position for next item.
  item_start_ = current_;
  item_line_number_ = line_number_;

  // Check for end of input.
  if (eos()) {
    item_type_ = END;
    return;
  }

  // Parse number.
  if (IsValidCharAtStartOfNumber(CurrentChar())) {
    string::iterator start = current_;
    Next();
    while (!eos() && IsValidCharInsideNumber(CurrentChar())) Next();
    item_text_.assign(start, current_);
    item_type_ = NUMBER;
    return;
  }

  // Parse string.
  if (CurrentChar() == '"') {
    Next();
    string::iterator start = current_;
    // Stop at the end of input as well as at the closing quote, so that an
    // unterminated string can never read past the end of the source text.
    while (!eos() && CurrentChar() != '"') Next();
    item_text_.assign(start, current_);
    item_type_ = STRING;
    // An unterminated string is still reported through CLD3_DCHECK; when the
    // check is not fatal, the rest of the input is taken as the string.
    CLD3_DCHECK(!eos());
    if (!eos()) Next();  // Skip the closing quote.
    return;
  }

  // Parse identifier name.
  if (IsValidCharAtStartOfIdentifier(CurrentChar())) {
    string::iterator start = current_;
    while (!eos() && IsValidCharInsideIdentifier(CurrentChar())) {
      Next();
    }
    item_text_.assign(start, current_);
    item_type_ = NAME;
    return;
  }

  // Single character item. Convert through unsigned char so that the item type
  // is always positive: a negative plain char could otherwise be mistaken for
  // one of the negative item types (NAME, NUMBER, STRING).
  item_type_ = static_cast<unsigned char>(CurrentChar());
  Next();
}
void FMLParser::Parse(const string &source,
FeatureExtractorDescriptor *result) {
// Initialize parser.
Initialize(source);
while (item_type_ != END) {
// Parse either a parameter name or a feature.
CLD3_DCHECK(item_type_ == NAME);
string name = item_text_;
NextItem();
// Feature expected.
CLD3_DCHECK(static_cast<char>(item_type_) != '=');
// Parse feature.
FeatureFunctionDescriptor *descriptor = result->add_feature();
descriptor->set_type(name);
ParseFeature(descriptor);
}
}
// Parses what follows a feature type name and stores it in |result|: an
// optional parenthesized argument/parameter list, an optional ":name", and
// optional sub-features given either as ".sub" or as a "{ ... }" block.
// Sub-features are parsed recursively.
void FMLParser::ParseFeature(FeatureFunctionDescriptor *result) {
  // Parse argument and parameters.
  if (item_type_ == '(') {
    NextItem();
    ParseParameter(result);
    while (item_type_ == ',') {
      NextItem();
      ParseParameter(result);
    }
    CLD3_DCHECK(item_type_ == ')');
    NextItem();
  }

  // Parse feature name.
  if (item_type_ == ':') {
    NextItem();
    // Feature name expected.
    CLD3_DCHECK((item_type_ == NAME) || (item_type_ == STRING));
    string name = item_text_;
    NextItem();

    // Set feature name.
    result->set_name(name);
  }

  // Parse sub-features.
  if (item_type_ == '.') {
    // Parse dotted sub-feature.
    NextItem();
    CLD3_DCHECK(item_type_ == NAME);
    string type = item_text_;
    NextItem();

    // Parse sub-feature.
    FeatureFunctionDescriptor *subfeature = result->add_feature();
    subfeature->set_type(type);
    ParseFeature(subfeature);
  } else if (item_type_ == '{') {
    // Parse sub-feature block.
    NextItem();
    // Stop at END as well as at '}': once the input is exhausted NextItem()
    // no longer advances, so a missing '}' would otherwise keep adding
    // sub-features forever.
    while (item_type_ != '}' && item_type_ != END) {
      CLD3_DCHECK(item_type_ == NAME);
      string type = item_text_;
      NextItem();

      // Parse sub-feature.
      FeatureFunctionDescriptor *subfeature = result->add_feature();
      subfeature->set_type(type);
      ParseFeature(subfeature);
    }
    // Closing brace expected.
    CLD3_DCHECK(item_type_ == '}');
    NextItem();
  }
}
// Parses one entry of a parenthesized list: either a bare NUMBER, stored as
// the feature's default argument, or a "name = value" pair, appended to the
// feature's named parameters.
void FMLParser::ParseParameter(FeatureFunctionDescriptor *result) {
  CLD3_DCHECK((item_type_ == NUMBER) || (item_type_ == NAME));

  if (item_type_ == NUMBER) {
    // Bare number: the default argument of the feature.
    const int default_argument =
        utils::ParseUsing<int>(item_text_, utils::ParseInt32);
    NextItem();
    result->set_argument(default_argument);
    return;
  }

  // Otherwise the item is treated as a parameter name.
  const string parameter_name = item_text_;
  NextItem();
  CLD3_DCHECK(item_type_ == '=');
  NextItem();

  // Parameter value expected.
  CLD3_DCHECK(item_type_ < END);
  const string parameter_value = item_text_;
  NextItem();

  // Add parameter to feature.
  Parameter *added = result->add_parameter();
  added->set_name(parameter_name);
  added->set_value(parameter_value);
}
// Appends the fml text of a single feature function, without its
// sub-features, to |output|: the type, followed by a parenthesized list with
// the non-zero argument and the name="value" parameters. The parentheses are
// omitted when there is neither a non-zero argument nor any parameter.
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output) {
  output->append(function.type());

  const bool has_argument = (function.argument() != 0);
  if (!has_argument && !(function.parameter_size() > 0)) {
    return;
  }

  output->append("(");
  bool need_separator = false;
  if (has_argument) {
    output->append(Int64ToString(function.argument()));
    need_separator = true;
  }
  for (int index = 0; index < function.parameter_size(); ++index) {
    if (need_separator) output->append(",");
    const auto &parameter = function.parameter(index);
    output->append(parameter.name());
    output->append("=");
    output->append("\"");
    output->append(parameter.value());
    output->append("\"");
    need_separator = true;
  }
  output->append(")");
}
// Appends the fml text of |function| and, recursively, of its sub-features to
// |output|. A single sub-feature is written in dotted form; several
// sub-features are written as a space-separated " { ... } " block.
void ToFML(const FeatureFunctionDescriptor &function, string *output) {
  ToFMLFunction(function, output);

  const int num_subfeatures = function.feature_size();
  if (num_subfeatures < 1) {
    return;
  }
  if (num_subfeatures == 1) {
    output->append(".");
    ToFML(function.feature(0), output);
    return;
  }

  output->append(" { ");
  for (int index = 0; index < num_subfeatures; ++index) {
    if (index != 0) output->append(" ");
    ToFML(function.feature(index), output);
  }
  output->append(" } ");
}
// Appends the fml text of every top-level feature of |extractor| to |output|,
// one feature per line.
void ToFML(const FeatureExtractorDescriptor &extractor, string *output) {
  const int num_features = extractor.feature_size();
  for (int index = 0; index < num_features; ++index) {
    ToFML(extractor.feature(index), output);
    output->append("\n");
  }
}
// Returns the fml text of |function| (including its sub-features) as a new
// string.
string AsFML(const FeatureFunctionDescriptor &function) {
  string fml;
  ToFML(function, &fml);
  return fml;
}
// Returns the fml text of all top-level features of |extractor| as a new
// string, one feature per line.
string AsFML(const FeatureExtractorDescriptor &extractor) {
  string fml;
  ToFML(extractor, &fml);
  return fml;
}
// Removes every double-quote character from |fml_string|, in place.
void StripFML(string *fml_string) {
  string::size_type quote_pos = fml_string->find('"');
  while (quote_pos != string::npos) {
    fml_string->erase(quote_pos, 1);
    // Continue from the same index: it now holds the character that followed
    // the removed quote.
    quote_pos = fml_string->find('"', quote_pos);
  }
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,123 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Feature modeling language (fml) parser.
//
// BNF grammar for fml:
//
// <feature model> ::= { <feature extractor> }
//
// <feature extractor> ::= <extractor spec> |
// <extractor spec> '.' <feature extractor> |
// <extractor spec> '{' { <feature extractor> } '}'
//
// <extractor spec> ::= <extractor type>
// [ '(' <parameter list> ')' ]
// [ ':' <extractor name> ]
//
// <parameter list> ::= ( <parameter> | <argument> ) { ',' <parameter> }
//
// <parameter> ::= <parameter name> '=' <parameter value>
//
// <extractor type> ::= NAME
// <extractor name> ::= NAME | STRING
// <argument> ::= NUMBER
// <parameter name> ::= NAME
// <parameter value> ::= NUMBER | STRING | NAME
#ifndef FML_PARSER_H_
#define FML_PARSER_H_
#include <string>
#include "base.h"
#include "cld_3/protos/feature_extractor.pb.h"
namespace chrome_lang_id {
// Parser for the feature modeling language (fml). Keeps a copy of the source
// text, a cursor into it, and the type and text of the current input item.
class FMLParser {
public:
// Parses fml specification into feature extractor descriptor.
void Parse(const string &source, FeatureExtractorDescriptor *result);
FMLParser();
~FMLParser();
private:
// Initializes the parser with the source text.
void Initialize(const string &source);
// Moves to the next input character.
void Next();
// Moves to the next input item.
void NextItem();
// Parses a feature descriptor.
void ParseFeature(FeatureFunctionDescriptor *result);
// Parses a parameter specification.
void ParseParameter(FeatureFunctionDescriptor *result);
// Returns true if end of source input has been reached.
bool eos() const { return current_ == source_.end(); }
// Returns current character. Other methods should access the current
// character through this method (instead of using *current_ directly): this
// method performs extra safety checks.
char CurrentChar() const {
// CLD3_DCHECK that we are reading from inside the string.
CLD3_DCHECK(current_ >= source_.begin());
CLD3_DCHECK(current_ < source_.end());
return *current_;
}
// Item types. END is 0 and the other named types are negative, so that they
// can be told apart from single-character items (see item_type_).
enum ItemTypes {
END = 0,
NAME = -1,
NUMBER = -2,
STRING = -3,
};
// Source text.
string source_;
// Current input position.
string::iterator current_;
// Line number for current input position.
int line_number_;
// Start position for current item.
string::iterator item_start_;
// Start position for current line.
string::iterator line_start_;
// Line number for current item.
int item_line_number_;
// Item type for current item. If this is positive it is interpreted as a
// character. If it is negative it is interpreted as an item type.
int item_type_;
// Text for current item.
string item_text_;
};
} // namespace chrome_lang_id
#endif // FML_PARSER_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,178 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef LANG_ID_NN_PARAMS_H_
#define LANG_ID_NN_PARAMS_H_
#include "base.h"
#include "embedding_network_params.h"
#include "float16.h"
namespace chrome_lang_id {
class LangIdNNParams : public EmbeddingNetworkParams {
public:
~LangIdNNParams() override {}
// Access methods for embeddings:
int embeddings_size() const override { return 6; }
int embeddings_num_rows(int i) const override {
return kEmbeddingsNumRows[i];
}
int embeddings_num_cols(int i) const override {
return kEmbeddingsNumCols[i];
}
const void *embeddings_weights(int i) const override {
return embeddings_weights_[i];
}
QuantizationType embeddings_quant_type(int i) const override {
return QuantizationType::UINT8;
}
const float16 *embeddings_quant_scales(int i) const override {
return embeddings_quant_scales_[i];
}
// Access methods for hidden:
int hidden_size() const override { return 1; }
int hidden_num_rows(int i) const override { return kHiddenNumRows[i]; }
int hidden_num_cols(int i) const override { return kHiddenNumCols[i]; }
const void *hidden_weights(int i) const override {
return hidden_weights_[i];
}
// Access methods for hidden_bias:
int hidden_bias_size() const override { return 1; }
int hidden_bias_num_rows(int i) const override {
return kHiddenBiasNumRows[i];
}
int hidden_bias_num_cols(int i) const override {
return kHiddenBiasNumCols[i];
}
const void *hidden_bias_weights(int i) const override {
return hidden_bias_weights_[i];
}
// Access methods for softmax:
int softmax_size() const override { return 1; }
int softmax_num_rows(int i) const override { return kSoftmaxNumRows[i]; }
int softmax_num_cols(int i) const override { return kSoftmaxNumCols[i]; }
const void *softmax_weights(int i) const override {
return softmax_weights_[i];
}
// Access methods for softmax_bias:
int softmax_bias_size() const override { return 1; }
int softmax_bias_num_rows(int i) const override {
return kSoftmaxBiasNumRows[i];
}
int softmax_bias_num_cols(int i) const override {
return kSoftmaxBiasNumCols[i];
}
const void *softmax_bias_weights(int i) const override {
return softmax_bias_weights_[i];
}
// Access methods for embedding_dim. One value per embedding space (6 in
// total); the index i is not range-checked.
int embedding_dim_size() const override { return 6; }
int32 embedding_dim(int i) const override { return kEmbeddingDimValues[i]; }
// Access methods for embedding_num_features (6 values, one per embedding
// space):
int embedding_num_features_size() const override { return 6; }
int32 embedding_num_features(int i) const override {
return kEmbeddingNumFeaturesValues[i];
}
// Access methods for embedding_features_domain_size (6 values, one per
// embedding space):
int embedding_features_domain_size_size() const override { return 6; }
int32 embedding_features_domain_size(int i) const override {
return kEmbeddingFeaturesDomainSizeValues[i];
}
// Access methods for concat_offset (6 values, one per embedding space):
int concat_offset_size() const override { return 6; }
int32 concat_offset(int i) const override { return kConcatOffsetValues[i]; }
// Access methods for concat_layer_size. The field is reported as present and
// its value is fixed at 80.
bool has_concat_layer_size() const override { return true; }
int32 concat_layer_size() const override { return 80; }
// Access methods for is_precomputed. The field is reported as absent and its
// value reads as false.
bool has_is_precomputed() const override { return false; }
bool is_precomputed() const override { return false; }
private:
// All static arrays below are only declared here (without a size); their
// definitions are not part of this class body.
// Private fields for embeddings:
static const int kEmbeddingsNumRows[];
static const int kEmbeddingsNumCols[];
static const uint8 kEmbeddingsWeights0[];
static const uint8 kEmbeddingsWeights1[];
static const uint8 kEmbeddingsWeights2[];
static const uint8 kEmbeddingsWeights3[];
static const uint8 kEmbeddingsWeights4[];
static const uint8 kEmbeddingsWeights5[];
// Per-instance lookup table: index i -> weight table of embedding matrix i.
const void *embeddings_weights_[6] = {
kEmbeddingsWeights0, kEmbeddingsWeights1, kEmbeddingsWeights2,
kEmbeddingsWeights3, kEmbeddingsWeights4, kEmbeddingsWeights5};
static const float16 kEmbeddingsQuantScales0[];
static const float16 kEmbeddingsQuantScales1[];
static const float16 kEmbeddingsQuantScales2[];
static const float16 kEmbeddingsQuantScales3[];
static const float16 kEmbeddingsQuantScales4[];
static const float16 kEmbeddingsQuantScales5[];
// Per-instance lookup table: index i -> quantization scales of embedding
// matrix i.
const float16 *embeddings_quant_scales_[6] = {
kEmbeddingsQuantScales0, kEmbeddingsQuantScales1,
kEmbeddingsQuantScales2, kEmbeddingsQuantScales3,
kEmbeddingsQuantScales4, kEmbeddingsQuantScales5};
// Private fields for hidden:
static const int kHiddenNumRows[];
static const int kHiddenNumCols[];
static const float kHiddenWeights0[];
const void *hidden_weights_[1] = {kHiddenWeights0};
// Private fields for hidden_bias:
static const int kHiddenBiasNumRows[];
static const int kHiddenBiasNumCols[];
static const float kHiddenBiasWeights0[];
const void *hidden_bias_weights_[1] = {kHiddenBiasWeights0};
// Private fields for softmax:
static const int kSoftmaxNumRows[];
static const int kSoftmaxNumCols[];
static const float kSoftmaxWeights0[];
const void *softmax_weights_[1] = {kSoftmaxWeights0};
// Private fields for softmax_bias:
static const int kSoftmaxBiasNumRows[];
static const int kSoftmaxBiasNumCols[];
static const float kSoftmaxBiasWeights0[];
const void *softmax_bias_weights_[1] = {kSoftmaxBiasWeights0};
// Private fields for embedding_dim:
static const int32 kEmbeddingDimValues[];
// Private fields for embedding_num_features:
static const int32 kEmbeddingNumFeaturesValues[];
// Private fields for embedding_features_domain_size:
static const int32 kEmbeddingFeaturesDomainSizeValues[];
// Private fields for concat_offset:
static const int32 kConcatOffsetValues[];
}; // class LangIdNNParams
} // namespace chrome_lang_id
#endif // LANG_ID_NN_PARAMS_H_

View File

@@ -0,0 +1,165 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "language_identifier_features.h"
#include <sstream>
#include <unordered_map>
#include <utility>
#include <vector>
#include "base.h"
#include "feature_extractor.h"
#include "feature_types.h"
#include "script_span/generated_ulscript.h"
#include "script_span/getonescriptspan.h"
#include "sentence_features.h"
#include "task_context.h"
#include "unicodetext.h"
#include "utils.h"
namespace chrome_lang_id {
// Creates a numeric feature type called 'name'. 'size' is stored and later
// returned unchanged by GetDomainSize().
NumericFeatureType::NumericFeatureType(const string &name, FeatureValue size)
: FeatureType(name), size_(size) {}
// Returns the textual name of a numeric feature value: the empty string for
// negative values, otherwise the result of Int64ToString(value).
string NumericFeatureType::GetFeatureValueName(FeatureValue value) const {
  if (value < 0) {
    return "";
  }
  return Int64ToString(value);
}
// Returns the domain size that was passed to the constructor.
FeatureValue NumericFeatureType::GetDomainSize() const { return size_; }
// Reads the feature function descriptor parameters into the member fields.
// The second argument of each Get*Parameter() call is the default that is
// passed for the parameter. 'context' is not used here.
void ContinuousBagOfNgramsFunction::Setup(TaskContext *context) {
// Parameters in the feature function descriptor.
include_terminators_ = GetBoolParameter("include_terminators", false);
include_spaces_ = GetBoolParameter("include_spaces", false);
use_equal_ngram_weight_ = GetBoolParameter("use_equal_weight", false);
ngram_id_dimension_ = GetIntParameter("id_dim", 10000);
ngram_size_ = GetIntParameter("size", 3);
}
// Installs a NumericFeatureType named after this function whose domain size is
// ngram_id_dimension_ (the "id_dim" parameter read in Setup()). 'context' is
// not used here.
// NOTE(review): the feature type is allocated with new and handed to
// set_feature_type(); presumably ownership is transferred there -- confirm.
void ContinuousBagOfNgramsFunction::Init(TaskContext *context) {
set_feature_type(new NumericFeatureType(name(), ngram_id_dimension_));
}
void ContinuousBagOfNgramsFunction::Evaluate(const WorkspaceSet &workspaces,
const Sentence &sentence,
FeatureVector *result) const {
// Include terminators for each token. Tokens are discovered by splitting the
// text on spaces.
std::vector<string> chars;
utils::GetUTF8Chars(sentence.text(), &chars);
if (include_terminators_) {
std::vector<string> new_chars{"^"};
for (size_t index = 0; index < chars.size(); ++index) {
if (chars.at(index) == " ") {
new_chars.push_back("$");
new_chars.push_back(" ");
new_chars.push_back("^");
} else {
new_chars.push_back(chars.at(index));
}
}
new_chars.push_back("$");
chars.swap(new_chars);
}
// Find the char ngram counts.
std::unordered_map<string, int> char_ngram_counts;
int count_sum = 0;
for (int start = 0; start <= static_cast<int>(chars.size()) - ngram_size_;
++start) {
string char_ngram;
int index;
for (index = 0; index < ngram_size_; ++index) {
const string &current_char = chars.at(start + index);
if (current_char == " " && !include_spaces_) {
break;
}
char_ngram.append(current_char);
}
if (index == ngram_size_) {
char_ngram_counts[char_ngram]++;
++count_sum;
}
}
// Populate the feature vector.
const float equal_weight = 1.0 / char_ngram_counts.size();
const float norm = static_cast<float>(count_sum);
for (const auto &ngram_and_count : char_ngram_counts) {
const float weight =
use_equal_ngram_weight_ ? equal_weight : ngram_and_count.second / norm;
FloatFeatureValue value(
utils::Hash32WithDefaultSeed(ngram_and_count.first) %
ngram_id_dimension_,
weight);
result->add(feature_type(), value.discrete_value);
}
}
// Returns the script of the sentence text as a feature value. Any script other
// than ULScript_Hani is returned as reported by the script scanner. For
// ULScript_Hani the codepoints of the span are inspected: if Hangul codepoints
// outnumber the remaining (non-space) codepoints, NUM_ULSCRIPTS is returned to
// denote Korean; otherwise ULScript_Hani is returned. 'workspaces' and
// 'result' are not used.
FeatureValue ScriptFeature::Compute(const WorkspaceSet &workspaces,
                                    const Sentence &sentence,
                                    const FeatureVector *result) const {
  const string &text = sentence.text();
  CLD2::ScriptScanner ss(text.c_str(), text.size(),
                         /*is_plain_text=*/true);

  // GetOneScriptSpan() is called only once because of the assumption that the
  // input contains one script. This function also cleans up the input (e.g.,
  // removes digits, punctuation).
  // TODO(abakalov): Extract the clean-up and script detection code out of
  // GetOneScriptSpan() because we don't have to iterate over the whole text,
  // just look at the first codepoint after clean-up.
  CLD2::LangSpan script_span;
  ss.GetOneScriptSpan(&script_span);
  const CLD2::ULScript ulscript = script_span.ulscript;
  if (ulscript != CLD2::ULScript_Hani) {
    return ulscript;
  }

  // True when the codepoint lies in one of the ranges associated with Hangul.
  const auto is_hangul = [](chrome_lang_id::char32 codepoint) {
    return (codepoint >= 0x1100 && codepoint <= 0x11FF) ||  // Hangul Jamo
           (codepoint >= 0xA960 && codepoint <= 0xA97F) ||  // Jamo Extended A
           (codepoint >= 0xD7B0 && codepoint <= 0xD7FF) ||  // Jamo Extended B
           (codepoint >= 0x3130 && codepoint <= 0x318F) ||  // Compatibility Jamo
           (codepoint >= 0xFFA0 && codepoint <= 0xFFDC) ||  // Halfwidth Jamo
           (codepoint >= 0xAC00 && codepoint <= 0xD7AF);    // Hangul Syllables
  };

  // Out of the codepoints captured by ULScript_Hani, separately count those
  // in Hangul (Korean script) and those in a script other than Hangul. Spaces
  // are counted in neither bucket.
  int hangul_total = 0;
  int other_total = 0;
  UnicodeText unicode_text;
  unicode_text.PointToUTF8(script_span.text, script_span.text_bytes);
  for (chrome_lang_id::char32 codepoint : unicode_text) {
    if (codepoint == 0x20) {
      continue;
    }
    if (is_hangul(codepoint)) {
      ++hangul_total;
    } else {
      ++other_total;
    }
  }

  if (hangul_total > other_total) {
    return static_cast<FeatureValue>(CLD2::NUM_ULSCRIPTS);
  }
  return static_cast<FeatureValue>(CLD2::ULScript_Hani);
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,116 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
#define LANGUAGE_IDENTIFIER_FEATURES_H_
#include <string>
#include "feature_extractor.h"
#include "feature_types.h"
#include "script_span/generated_ulscript.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"
namespace chrome_lang_id {
// Feature type for numeric features. The type is described by a name and a
// domain size that is fixed at construction.
class NumericFeatureType : public FeatureType {
public:
// Initializes the numeric feature type with its name and domain size.
NumericFeatureType(const string &name, FeatureValue size);
// Returns the textual name of a numeric feature value.
string GetFeatureValueName(FeatureValue value) const override;
// Returns the number of feature values.
FeatureValue GetDomainSize() const override;
private:
// Domain size reported by GetDomainSize().
FeatureValue size_;
};
// Class for computing continuous char ngram features.
// Feature function descriptor parameters:
//   include_terminators(bool, false):
//     If 'true', then splits the text based on spaces to get tokens, adds "^"
//     to the beginning of each token, and adds "$" to the end of each token.
//   include_spaces(bool, false):
//     If 'true', then includes char ngrams containing spaces.
//   use_equal_weight(bool, false):
//     If 'true', then weighs each unique ngram by 1.0 / (number of unique
//     ngrams in the input). Otherwise, weighs each unique ngram by (ngram
//     count) / (total number of ngrams).
//   id_dim(int, 10000):
//     The integer id of each char ngram is computed as follows:
//     Hash32WithDefaultSeed(char ngram) % id_dim.
//   size(int, 3):
//     Only ngrams of this size will be extracted.
//
// The member fields start at the defaults listed above, so the object holds
// well-defined values even before Setup() has been called.
class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
 public:
  // Reads the descriptor parameters into the member fields.
  void Setup(TaskContext *context) override;

  // Installs the numeric feature type used by this function.
  void Init(TaskContext *context) override;

  // Appends the features computed from the focus to the feature vector.
  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
                FeatureVector *result) const override;

 private:
  // If 'true', then splits the text based on spaces to get tokens, adds "^" to
  // the beginning of each token, and adds "$" to the end of each token.
  bool include_terminators_ = false;

  // If 'true', then includes char ngrams containing spaces.
  bool include_spaces_ = false;

  // If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
  // in the input). Otherwise, weighs each unique ngram by (ngram count) /
  // (total number of ngrams).
  bool use_equal_ngram_weight_ = false;

  // The integer id of each char ngram is computed as follows:
  // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
  int ngram_id_dimension_ = 10000;

  // Only ngrams of size ngram_size_ will be extracted.
  int ngram_size_ = 3;
};
// Feature that reports the script of a piece of text. Supported scripts are
// the values of chrome_lang_id::CLD2::ULScript, recognized with the script
// detection code ported from CLD2. ULScript_Hani is split in two: non-Korean
// text is reported as ULScript_Hani, while Korean text (Hangul) is reported as
// NUM_ULSCRIPTS. The input is assumed to be (1) interchange valid UTF8 and
// (2) written in a single chrome_lang_id::CLD2::ULScript.
class ScriptFeature : public WholeSentenceFeature {
 public:
  void Init(TaskContext *context) override {
    // One extra value on top of NUM_ULSCRIPTS, because ULScript_Hani is split
    // into two feature values (see the class description).
    const FeatureValue num_script_values =
        chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1;
    set_feature_type(new NumericFeatureType(name(), num_script_values));
  }

  // Computes the script feature value for the sentence and returns it.
  FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
                       const FeatureVector *result) const override;
};
} // namespace chrome_lang_id
#endif // LANGUAGE_IDENTIFIER_FEATURES_H_

View File

@@ -0,0 +1,261 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cmath>
#include <iostream>
#include <vector>
#include <set>
#include "base.h"
#include "feature_extractor.h"
#include "language_identifier_features.h"
#include "nnet_language_identifier.h"
#include "script_span/generated_ulscript.h"
#include "cld_3/protos/sentence.pb.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"
namespace chrome_lang_id {
namespace language_identifier_features_test {
// Factory passed to the registrar for "continuous-bag-of-ngrams": returns a
// newly allocated ContinuousBagOfNgramsFunction.
static WholeSentenceFeature *cbog_factory() {
return new ContinuousBagOfNgramsFunction;
}
// Factory passed to the registrar for "script": returns a newly allocated
// ScriptFeature.
static WholeSentenceFeature *sf_factory() { return new ScriptFeature; }
// Class for calculating the feature weights and ids.
class FeatureIdWeightCalculator {
public:
explicit FeatureIdWeightCalculator(TaskContext *context) {
if (WholeSentenceFeature::registry() == nullptr) {
// Create registry for our WholeSentenceFeature(s).
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
"sentence feature function", "WholeSentenceFeature", __FILE__,
__LINE__);
}
// Register our WholeSentenceFeature(s).
// Register ContinuousBagOfNgramsFunction feature function.
static WholeSentenceFeature::Registry::Registrar cbog_registrar(
WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
"ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);
// Register Script feature function.
static WholeSentenceFeature::Registry::Registrar sf_registrar(
WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
__LINE__, sf_factory);
feature_extractor_.Setup(context);
feature_extractor_.Init(context);
}
// Assumes that a single feature is specified and extracts it.
void ExtractOnlyFeature(Sentence *sentence,
std::vector<FeatureVector> *features) {
CLD3_CHECK(features->size() == 1);
WorkspaceSet workspace;
workspace.Reset(workspace_registry_);
feature_extractor_.Preprocess(&workspace, sentence);
feature_extractor_.ExtractFeatures(workspace, *sentence, features);
CLD3_CHECK(features->size() == 1);
}
// Returns a map from feature value id to feature value weight.
std::unordered_map<int, float> GetFloatFeatureValIdsAndWeights(
Sentence *sentence) {
std::vector<FeatureVector> feature_vectors(1); // one feature space
ExtractOnlyFeature(sentence, &feature_vectors);
const FeatureVector &feature_vector = feature_vectors.at(0);
// Save the (feature value id, feature value weight) pairs to a map.
std::unordered_map<int, float> feature_id_weight;
for (int index = 0; index < feature_vector.size(); ++index) {
const FloatFeatureValue feature_value =
FloatFeatureValue(feature_vector.value(index));
feature_id_weight[feature_value.value.id] = feature_value.value.weight;
}
return feature_id_weight;
}
// Returns the feature value ids.
std::set<int> GetFeatureValueIds(Sentence *sentence) {
std::vector<FeatureVector> feature_vectors(1); // one feature space
ExtractOnlyFeature(sentence, &feature_vectors);
const FeatureVector &feature_vector = feature_vectors.at(0);
std::set<int> ids;
for (int index = 0; index < feature_vector.size(); ++index) {
ids.insert(feature_vector.value(index));
}
return ids;
}
private:
// The registry of shared workspaces in the feature extractor.
WorkspaceRegistry workspace_registry_;
LanguageIdEmbeddingFeatureExtractor feature_extractor_;
};
// Extracts the features described by 'features' from 'sentence' and compares
// them with the expectation: the i-th expected ngram must hash (modulo
// 'id_dim') to an extracted feature id whose weight is within a small epsilon
// of the i-th expected weight. Prints a diagnostic for every mismatch and
// returns "true" only if everything matches.
bool ExtractAndCheckFeatures(const string &features, const int id_dim,
                             const std::vector<string> &expected_char_ngrams,
                             const std::vector<float> &expected_weights,
                             Sentence *sentence) {
  TaskContext context;
  context.SetParameter("language_identifier_features", features);
  FeatureIdWeightCalculator calc(&context);

  // Get the feature ids and the corresponding weights.
  const std::unordered_map<int, float> actual =
      calc.GetFloatFeatureValIdsAndWeights(sentence);
  const size_t num_expected = expected_char_ngrams.size();
  if (actual.size() != num_expected) {
    std::cout << " Failure" << std::endl;
    std::cout << " Number of expected feature ids: " << num_expected
              << std::endl;
    std::cout << " Number of extracted feature ids: " << actual.size()
              << std::endl;
    return false;
  }

  // Specifies how close two float values should be to be considered equal.
  const float epsilon = 0.0001f;
  bool all_match = true;
  for (size_t i = 0; i < num_expected; ++i) {
    const int expected_id =
        utils::Hash32WithDefaultSeed(expected_char_ngrams.at(i)) % id_dim;

    // The id must be present ...
    const auto found = actual.find(expected_id);
    if (found == actual.end()) {
      std::cout << " Failure" << std::endl;
      std::cout << " Feature id " << expected_id << " is missing" << std::endl;
      all_match = false;
      continue;
    }

    // ... and its weight must be close enough to the expected one.
    const float expected_weight = expected_weights.at(i);
    if (std::abs(found->second - expected_weight) > epsilon) {
      std::cout << " Failure" << std::endl;
      std::cout << " Different weight for feature id " << expected_id
                << ": expected weight " << expected_weight
                << ", actual weight " << found->second << std::endl;
      all_match = false;
    }
  }

  if (all_match) {
    std::cout << " Success!" << std::endl;
  }
  return all_match;
}
// Checks bigram extraction with terminators when every distinct ngram gets the
// same weight. The input "aa aab" yields five distinct bigrams, so each one is
// expected to weigh 1/5. Returns "true" if the test is successful and "false"
// otherwise.
bool TestExtractFeaturesWithEqualWeight() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // The integer id of each char ngram is computed as follows:
  // utils::Hash32WithDefaultSeed(char ngram) % id_dim.
  const int id_dim = 100;

  // Feature function descriptor handed to the extractor.
  string features = "continuous-bag-of-ngrams(id_dim=";
  features += std::to_string(id_dim);
  features +=
      ",size=2,include_terminators=true,include_spaces=false,"
      "use_equal_weight=true)";

  Sentence sentence;
  sentence.set_text("aa aab");

  const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
  const std::vector<float> expected_weights(expected_char_ngrams.size(), 0.2f);
  return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
                                 expected_weights, &sentence);
}
// Checks bigram extraction with terminators when every ngram is weighted by
// its normalized count. The input "aa aab" yields seven bigrams in total:
// "^a" and "aa" occur twice (2/7 each), "a$", "ab" and "b$" once (1/7 each).
// Returns "true" if the test is successful and "false" otherwise.
bool TestExtractFeaturesWithNonEqualWeight() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // The integer id of each char ngram is computed as follows:
  // utils::Hash32WithDefaultSeed(char ngram) % id_dim.
  const int id_dim = 100;

  // Feature function descriptor handed to the extractor.
  string features = "continuous-bag-of-ngrams(id_dim=";
  features += std::to_string(id_dim);
  features +=
      ",size=2,include_terminators=true,include_spaces=false,"
      "use_equal_weight=false)";

  Sentence sentence;
  sentence.set_text("aa aab");

  const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
  const std::vector<float> expected_weights{0.1428f, 0.1428f, 0.2857f, 0.2857f,
                                            0.1428f};
  return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
                                 expected_weights, &sentence);
}
// Tests the feature Script.
bool TestScriptFeature() {
std::cout << "Running " << __FUNCTION__ << std::endl;
bool test_successful = true;
TaskContext context;
context.SetParameter("language_identifier_features", "script");
FeatureIdWeightCalculator calc(&context);
// Check the script of the English sentence.
Sentence sentence;
sentence.set_text("food");
std::set<int> feature_val_ids = calc.GetFeatureValueIds(&sentence);
if (feature_val_ids.size() != 1 ||
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Latin) == 0) {
test_successful = false;
std::cout << " Failure for input: " << sentence.text() << std::endl;
}
// Check the script of a Chinese sentence.
sentence.set_text("");
feature_val_ids = calc.GetFeatureValueIds(&sentence);
if (feature_val_ids.size() != 1 ||
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Hani) == 0) {
test_successful = false;
std::cout << " Failure for input: " << sentence.text() << std::endl;
}
// Check the script of a Korean sentence.
sentence.set_text("워드");
feature_val_ids = calc.GetFeatureValueIds(&sentence);
if (feature_val_ids.size() != 1 ||
feature_val_ids.count(chrome_lang_id::CLD2::NUM_ULSCRIPTS) == 0) {
test_successful = false;
std::cout << " Failure for input: " << sentence.text() << std::endl;
}
if (test_successful) {
std::cout << " Success!" << std::endl;
}
return test_successful;
}
} // namespace language_identifier_features_test
} // namespace chrome_lang_id
// Runs the feature extraction tests.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::language_identifier_features_test::
TestExtractFeaturesWithEqualWeight() &&
chrome_lang_id::language_identifier_features_test::
TestExtractFeaturesWithNonEqualWeight() &&
chrome_lang_id::language_identifier_features_test::TestScriptFeature();
return tests_successful ? 0 : 1;
}

View File

@@ -0,0 +1,54 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <iostream>
#include <string>
#include "base.h"
#include "nnet_language_identifier.h"
using chrome_lang_id::NNetLanguageIdentifier;
// Runs a neural net model for language identification.
int main(int argc, char **argv) {
NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
/*max_num_bytes=*/1000);
const std::vector<std::string> texts{"This text is written in English.",
"Text in deutscher Sprache verfasst."};
for (const std::string &text : texts) {
const NNetLanguageIdentifier::Result result = lang_id.FindLanguage(text);
std::cout << "text: " << text << std::endl
<< " language: " << result.language << std::endl
<< " probability: " << result.probability << std::endl
<< " reliable: " << result.is_reliable << std::endl
<< " proportion: " << result.proportion << std::endl
<< std::endl;
}
const std::string &text =
"This piece of text is in English. Този текст е на Български.";
std::cout << "text: " << text << std::endl;
const std::vector<NNetLanguageIdentifier::Result> results =
lang_id.FindTopNMostFreqLangs(text, /*num_langs*/ 3);
for (const NNetLanguageIdentifier::Result &result : results) {
std::cout << " language: " << result.language << std::endl
<< " probability: " << result.probability << std::endl
<< " reliable: " << result.is_reliable << std::endl
<< " proportion: " << result.proportion << std::endl
<< std::endl;
}
return 0;
}

View File

@@ -0,0 +1,254 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cmath>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "base.h"
#include "nnet_lang_id_test_data.h"
#include "nnet_language_identifier.h"
namespace chrome_lang_id {
namespace nnet_lang_id_test {
// Runs the model on one sample text per supported language and checks that the
// predicted language equals the gold language. Every misclassification is
// printed. Returns "true" if all predictions are correct and "false"
// otherwise.
// TODO(abakalov): Add a test for random input that should be labeled as
// "unknown" due to low confidence.
bool TestPredictions() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // (gold language, sample text) pairs used for testing.
  const std::vector<std::pair<std::string, std::string>> gold_lang_text = {
      {"af", NNetLangIdTestData::kTestStrAF},
      {"ar", NNetLangIdTestData::kTestStrAR},
      {"az", NNetLangIdTestData::kTestStrAZ},
      {"be", NNetLangIdTestData::kTestStrBE},
      {"bg", NNetLangIdTestData::kTestStrBG},
      {"bn", NNetLangIdTestData::kTestStrBN},
      {"bs", NNetLangIdTestData::kTestStrBS},
      {"ca", NNetLangIdTestData::kTestStrCA},
      {"ceb", NNetLangIdTestData::kTestStrCEB},
      {"cs", NNetLangIdTestData::kTestStrCS},
      {"cy", NNetLangIdTestData::kTestStrCY},
      {"da", NNetLangIdTestData::kTestStrDA},
      {"de", NNetLangIdTestData::kTestStrDE},
      {"el", NNetLangIdTestData::kTestStrEL},
      {"en", NNetLangIdTestData::kTestStrEN},
      {"eo", NNetLangIdTestData::kTestStrEO},
      {"es", NNetLangIdTestData::kTestStrES},
      {"et", NNetLangIdTestData::kTestStrET},
      {"eu", NNetLangIdTestData::kTestStrEU},
      {"fa", NNetLangIdTestData::kTestStrFA},
      {"fi", NNetLangIdTestData::kTestStrFI},
      {"fil", NNetLangIdTestData::kTestStrFIL},
      {"fr", NNetLangIdTestData::kTestStrFR},
      {"ga", NNetLangIdTestData::kTestStrGA},
      {"gl", NNetLangIdTestData::kTestStrGL},
      {"gu", NNetLangIdTestData::kTestStrGU},
      {"ha", NNetLangIdTestData::kTestStrHA},
      {"hi", NNetLangIdTestData::kTestStrHI},
      {"hmn", NNetLangIdTestData::kTestStrHMN},
      {"hr", NNetLangIdTestData::kTestStrHR},
      {"ht", NNetLangIdTestData::kTestStrHT},
      {"hu", NNetLangIdTestData::kTestStrHU},
      {"hy", NNetLangIdTestData::kTestStrHY},
      {"id", NNetLangIdTestData::kTestStrID},
      {"ig", NNetLangIdTestData::kTestStrIG},
      {"is", NNetLangIdTestData::kTestStrIS},
      {"it", NNetLangIdTestData::kTestStrIT},
      {"iw", NNetLangIdTestData::kTestStrIW},
      {"ja", NNetLangIdTestData::kTestStrJA},
      {"jv", NNetLangIdTestData::kTestStrJV},
      {"ka", NNetLangIdTestData::kTestStrKA},
      {"kk", NNetLangIdTestData::kTestStrKK},
      {"km", NNetLangIdTestData::kTestStrKM},
      {"kn", NNetLangIdTestData::kTestStrKN},
      {"ko", NNetLangIdTestData::kTestStrKO},
      {"la", NNetLangIdTestData::kTestStrLA},
      {"lo", NNetLangIdTestData::kTestStrLO},
      {"lt", NNetLangIdTestData::kTestStrLT},
      {"lv", NNetLangIdTestData::kTestStrLV},
      {"mg", NNetLangIdTestData::kTestStrMG},
      {"mi", NNetLangIdTestData::kTestStrMI},
      {"mk", NNetLangIdTestData::kTestStrMK},
      {"ml", NNetLangIdTestData::kTestStrML},
      {"mn", NNetLangIdTestData::kTestStrMN},
      {"mr", NNetLangIdTestData::kTestStrMR},
      {"ms", NNetLangIdTestData::kTestStrMS},
      {"mt", NNetLangIdTestData::kTestStrMT},
      {"my", NNetLangIdTestData::kTestStrMY},
      {"ne", NNetLangIdTestData::kTestStrNE},
      {"nl", NNetLangIdTestData::kTestStrNL},
      {"no", NNetLangIdTestData::kTestStrNO},
      {"ny", NNetLangIdTestData::kTestStrNY},
      {"pa", NNetLangIdTestData::kTestStrPA},
      {"pl", NNetLangIdTestData::kTestStrPL},
      {"pt", NNetLangIdTestData::kTestStrPT},
      {"ro", NNetLangIdTestData::kTestStrRO},
      {"ru", NNetLangIdTestData::kTestStrRU},
      {"si", NNetLangIdTestData::kTestStrSI},
      {"sk", NNetLangIdTestData::kTestStrSK},
      {"sl", NNetLangIdTestData::kTestStrSL},
      {"so", NNetLangIdTestData::kTestStrSO},
      {"sq", NNetLangIdTestData::kTestStrSQ},
      {"sr", NNetLangIdTestData::kTestStrSR},
      {"st", NNetLangIdTestData::kTestStrST},
      {"su", NNetLangIdTestData::kTestStrSU},
      {"sv", NNetLangIdTestData::kTestStrSV},
      {"sw", NNetLangIdTestData::kTestStrSW},
      {"ta", NNetLangIdTestData::kTestStrTA},
      {"te", NNetLangIdTestData::kTestStrTE},
      {"tg", NNetLangIdTestData::kTestStrTG},
      {"th", NNetLangIdTestData::kTestStrTH},
      {"tr", NNetLangIdTestData::kTestStrTR},
      {"uk", NNetLangIdTestData::kTestStrUK},
      {"ur", NNetLangIdTestData::kTestStrUR},
      {"uz", NNetLangIdTestData::kTestStrUZ},
      {"vi", NNetLangIdTestData::kTestStrVI},
      {"yi", NNetLangIdTestData::kTestStrYI},
      {"yo", NNetLangIdTestData::kTestStrYO},
      {"zh", NNetLangIdTestData::kTestStrZH},
      {"zu", NNetLangIdTestData::kTestStrZU}};

  NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
                                 /*max_num_bytes=*/1000);

  // Predict the language of every sample and report each mismatch.
  int mismatches = 0;
  for (const std::pair<std::string, std::string> &gold : gold_lang_text) {
    const std::string &gold_lang = gold.first;
    const std::string &sample = gold.second;
    const NNetLanguageIdentifier::Result prediction =
        lang_id.FindLanguage(sample);
    if (prediction.language == gold_lang) {
      continue;
    }
    ++mismatches;
    std::cout << " Misclassification: " << std::endl;
    std::cout << " Text: " << sample << std::endl;
    std::cout << " Expected language: " << gold_lang << std::endl;
    std::cout << " Predicted language: " << prediction.language << std::endl;
  }

  if (mismatches != 0) {
    std::cout << " Failure: " << mismatches << " wrong predictions"
              << std::endl;
    return false;
  }
  std::cout << " Success!" << std::endl;
  return true;
}
// Runs the model on a text that mixes English and Bulgarian (two different
// scripts) and checks, for the top three languages, the reported byte
// proportions and the byte ranges of the detected spans. Returns "true" if the
// test is successful and "false" otherwise.
bool TestMultipleLanguagesInInput() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Text containing snippets in English and Bulgarian.
  const std::string text =
      "This piece of text is in English. Този текст е на Български.";

  // Expected language spans in the input text, corresponding respectively to
  // Bulgarian and English. Their byte sizes define the expected proportions.
  const std::string expected_bg_span = " Този текст е на Български ";
  const std::string expected_en_span = " This piece of text is in English ";
  const float expected_byte_sum =
      static_cast<float>(expected_bg_span.size() + expected_en_span.size());

  // Number of languages to query for and the expected byte proportions.
  const int num_queried_langs = 3;
  const std::unordered_map<string, float> expected_lang_proportions{
      {"bg", expected_bg_span.size() / expected_byte_sum},
      {"en", expected_en_span.size() / expected_byte_sum},
      {NNetLanguageIdentifier::kUnknown, 0.0}};

  NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
                                 /*max_num_bytes=*/1000);
  const std::vector<NNetLanguageIdentifier::Result> results =
      lang_id.FindTopNMostFreqLangs(text, num_queried_langs);
  if (results.size() != expected_lang_proportions.size()) {
    std::cout << " Failure" << std::endl;
    std::cout << " Wrong number of languages: expected "
              << expected_lang_proportions.size() << ", obtained "
              << results.size() << std::endl;
    return false;
  }

  // Iterate over the results and check that the correct proportions are
  // returned for the expected languages.
  const float epsilon = 0.00001f;
  for (const NNetLanguageIdentifier::Result &result : results) {
    const auto expected = expected_lang_proportions.find(result.language);
    if (expected == expected_lang_proportions.end()) {
      std::cout << " Failure" << std::endl;
      std::cout << " Incorrect language: " << result.language << std::endl;
      return false;
    }
    if (std::abs(result.proportion - expected->second) > epsilon) {
      std::cout << " Failure" << std::endl;
      std::cout << " Language " << result.language << ": expected proportion "
                << expected->second << ", got " << result.proportion
                << std::endl;
      return false;
    }

    // Skip over undefined language.
    if (result.language == "und") {
      continue;
    }

    if (result.byte_ranges.size() != 1) {
      std::cout << " Should only detect one span containing "
                << result.language << std::endl;
      return false;
    }

    // Check that specified byte ranges for language are correct.
    const int start_index = result.byte_ranges[0].start_index;
    const int end_index = result.byte_ranges[0].end_index;
    const std::string byte_ranges_text =
        text.substr(start_index, end_index - start_index);
    if (result.language == "bg") {
      if (byte_ranges_text != "Този текст е на Български.") {
        std::cout << " Incorrect byte ranges returned for Bulgarian "
                  << std::endl;
        return false;
      }
    } else if (result.language == "en") {
      if (byte_ranges_text != "This piece of text is in English. ") {
        std::cout << " Incorrect byte ranges returned for English "
                  << std::endl;
        return false;
      }
    } else {
      std::cout << " Got language other than English or Bulgarian "
                << std::endl;
      return false;
    }
  }

  std::cout << " Success!" << std::endl;
  return true;
}
} // namespace nnet_lang_id_test
} // namespace chrome_lang_id
// Runs tests for the language identification model.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::nnet_lang_id_test::TestPredictions() &&
chrome_lang_id::nnet_lang_id_test::TestMultipleLanguagesInInput();
return tests_successful ? 0 : 1;
}

View File

@@ -0,0 +1,529 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "nnet_lang_id_test_data.h"
namespace chrome_lang_id {
const char *const NNetLangIdTestData::kTestStrAF =
"Dit is 'n kort stukkie van die teks wat gebruik sal word vir die toets "
"van die akkuraatheid van die nuwe benadering.";
const char *const NNetLangIdTestData::kTestStrAR = "احتيالية بيع أي حساب";
const char *const NNetLangIdTestData::kTestStrAZ =
" a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri "
"keçirilib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd "
"peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında "
"ölüb";
const char *const NNetLangIdTestData::kTestStrBE =
" а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым часам "
"нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе";
const char *const NNetLangIdTestData::kTestStrBG =
" а дума попада в състояние на изпитание ключовите думи с предсказана "
"малко под то изискване на страниците за търсене в";
const char *const NNetLangIdTestData::kTestStrBN =
"গ্যালারির ৩৮ বছর পূর্তিতে মূল্যছাড় অর্থনীতি বিএনপির ওয়াক আউট তপন"
" চৌধুরী হারবাল অ্যাসোসিয়েশনের সভাপতি আন্তর্জাতিক পরামর্শক "
"বোর্ড দিয়ে শরিয়াহ্ ইনন্ডেক্স করবে "
"সিএসই মালিকপক্ষের কান্না, শ্রমিকের অনিশ্চয়তা মতিঝিলে সমাবেশ নিষিদ্ধ: "
"এফবিসিসিআইয়ের ধন্যবাদ বিনোদন বিশেষ প্রতিবেদন বাংলালিংকের গ্র্যান্ডমাস্টার "
"সিজন-৩ ব্রাজিলে বিশ্বকাপ ফুটবল আয়োজনবিরোধী বিক্ষোভ দেশের নিরাপত্তার"
" চেয়ে অনেক বেশি সচেতন । প্রার্থীদের দক্ষতা ও যোগ্যতার"
" পাশাপাশি তারা জাতীয় ইস্যুগুলোতে প্রাধান্য দিয়েছেন । ” পাঁচটি সিটিতে ২০"
" লাখ ভোটারদের দিয়ে জাতীয় নির্বাচনে ৮ কোটি ভোটারদের"
" সঙ্গে তুলনা করা যাবে কি একজন দর্শকের এমন প্রশ্নে জবাবে আব্দুল্লাহ "
"আল নোমান বলেন , “ এই পাঁচটি সিটি কর্পোরেশন নির্বাচন দেশের পাঁচটি বড়"
" বিভাগের প্রতিনিধিত্ব করছে । এছাড়া এখানকার ভোটার রা সবাই সচেতন । তারা";
const char *const NNetLangIdTestData::kTestStrBS =
"Novi predsjednik Mešihata Islamske zajednice u Srbiji (IZuS) i muftija "
"dr. Mevlud ef. Dudić izjavio je u intervjuu za Anadolu Agency (AA) kako "
"je uvjeren da će doći do vraćanja jedinstva među muslimanima i unutar "
"Islamske zajednice na prostoru Sandžaka, te da je njegova ruka pružena za "
"povratak svih u okrilje Islamske zajednice u Srbiji nakon skoro sedam "
"godina podjela u tom dijelu Srbije. Dudić je za predsjednika Mešihata IZ "
"u Srbiji izabran 4. januara, a zvanična inauguracija će biti obavljena u "
"prvoj polovini februara. Kako se očekuje, prisustvovat će joj i "
"reisu-l-ulema Islamske zajednice u Srbiji Husein ef. Kavazović koji će i "
"zvanično promovirati Dudića u novog prvog čovjeka IZ u Srbiji. Dudić će "
"danas boraviti u prvoj zvaničnoj posjeti reisu Kavazoviću, što je njegov "
"privi simbolični potez nakon imenovanja. ";
const char *const NNetLangIdTestData::kTestStrCA =
"al final en un únic lloc nhorabona l correu electrònic està concebut com "
"a eina de productivitat aleshores per què perdre el temps arxivant "
"missatges per després intentar recordar on els veu desar i per què heu d "
"eliminar missatges importants per l";
const char *const NNetLangIdTestData::kTestStrCEB =
"Ang Sugbo usa sa mga labing ugmad nga lalawigan sa nasod. Kini ang sentro "
"sa komersyo, edukasyon ug industriya sa sentral ug habagatang dapit sa "
"kapupod-an. Ang mipadayag sa Sugbo isip ikapito nga labing nindot nga "
"pulo sa , ang nag-inusarang pulo sa Pilipinas nga napasidunggan sa maong "
"magasin sukad pa sa tuig";
const char *const NNetLangIdTestData::kTestStrCS =
" a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře "
"nelze načíst systémový profil jednotky smoot okud používáte pro určení "
"polokoule značky z západ nebo v východ používejte nezáporné hodnoty "
"zeměpisné délky nelze";
const char *const NNetLangIdTestData::kTestStrCY =
" a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi "
"fydd yn cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu "
"cadw mewn cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am "
"gmail mae n gwasanaeth gwebost";
const char *const NNetLangIdTestData::kTestStrDA =
" a z tallene og punktummer der er tilladte log ud angiv den ønskede "
"adgangskode igen november gem personlige oplysninger kontrolspørgsmål det "
"sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de "
"tegn du kan se i billedet nedenfor";
const char *const NNetLangIdTestData::kTestStrDE =
" abschnitt ordner aktivieren werden die ordnereinstellungen im "
"farbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben "
"optional n diesem schritt geben sie für jedesfeld aus dem datenset den "
"typ an ieser schritt ist optional eldtypen";
const char *const NNetLangIdTestData::kTestStrEL =
" ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις "
"κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε";
const char *const NNetLangIdTestData::kTestStrEN =
" a backup credit card by visiting your billing preferences page or visit "
"the adwords help centre for more details https adwords google com support "
"bin answer py answer hl en we were unable to process the payment of for "
"your outstanding google adwords";
const char *const NNetLangIdTestData::kTestStrEO =
" a jarcento refoje per enmetado de koncerna pastro tiam de reformita "
"konfesio ekde refoje ekzistis luteranaj komunumanoj tamen tiuj fondis "
"propran komunumon nur en ambaŭ apartenis ekde al la evangela eklezio en "
"prusio resp ties rejnlanda provinceklezio en";
const char *const NNetLangIdTestData::kTestStrES =
" a continuación haz clic en el botón obtener ruta también puedes "
"desplazarte hasta el final de la página para cambiar tus opciones de "
"búsqueda gráfico y detalles ésta es una lista de los vídeos que te "
"recomendamos nuestras recomendaciones se basan";
const char *const NNetLangIdTestData::kTestStrET =
" a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt "
"heaks kiidetud on sinu kohustuseks see krediidilimiit";
const char *const NNetLangIdTestData::kTestStrEU =
" a den eraso bat honen kontra hortaz eragiketa bakarrik behar dituen "
"eraso batek aes apurtuko luke nahiz eta oraingoz eraso bideraezina izan "
"gaur egungo teknologiaren mugak direla eta oraingoz kezka hauek alde "
"batera utzi daitezke orain arteko indar";
const char *const NNetLangIdTestData::kTestStrFA =
" آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل "
"بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با";
const char *const NNetLangIdTestData::kTestStrFI =
" a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia "
"käyttötietoja kuten virheraportteja käytetään google desktopin "
"parantamiseen etsi näyttää mukautettuja uutisia google desktop "
"keskivaihto leikkaa voit kaksoisnapsauttaa";
const char *const NNetLangIdTestData::kTestStrFIL =
"Ito ay isang maikling piraso ng teksto na ito ay gagamitin para sa "
"pagsubok ang kawastuhan ng mga bagong diskarte.";
const char *const NNetLangIdTestData::kTestStrFR =
" a accès aux collections et aux frontaux qui lui ont été attribués il "
"peut consulter et modifier ses collections et exporter des configurations "
"de collection toutefois il ne peut pas créer ni supprimer des collections "
"enfin il a accès aux fonctions";
const char *const NNetLangIdTestData::kTestStrGA =
" a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail "
"breise a chur leis na cinn a cuardaíodh cheana chun an cuardach a "
"bheachtú nó a chúngú má chuirtear focal breise isteach aimseofar fo aicme "
"ar leith de na torthaí a fuarthas";
const char *const NNetLangIdTestData::kTestStrGL =
" debe ser como mínimo taranto tendas de venda polo miúdo cociñas "
"servizos bordado canadá viaxes parques de vehículos de recreo hotel "
"oriental habitación recibir unha postal no enderezo indicado "
"anteriormente";
const char *const NNetLangIdTestData::kTestStrGU =
" આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા"
" હાલનો જ સંદેશ વિષયની";
const char *const NNetLangIdTestData::kTestStrHA =
" a cikin a kan sakamako daga sakwannin a kan sakamako daga sakwannin daga "
"ranar zuwa a kan sakamako daga guda daga ranar zuwa a kan sakamako daga "
"shafukan daga ranar zuwa a kan sakamako daga guda a cikin last hour a kan "
"sakamako daga guda daga kafar";
const char *const NNetLangIdTestData::kTestStrHI =
" ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने"
" विज्ञापनों का अधिकतम लाभ";
const char *const NNetLangIdTestData::kTestStrHMN =
"Qhov no yog ib tug luv luv daim ntawv nyeem uas yuav siv tau rau kev soj "
"ntsuam qhov tseeb ntawm tus tshiab mus kom ze.";
const char *const NNetLangIdTestData::kTestStrHR =
"Posljednja dva vladara su Kijaksar (Κυαξαρης; 625-585 prije Krista), "
"fraortov sin koji će proširiti teritorij Medije i Astijag. Kijaksar je "
"imao kćer ili unuku koja se zvala Amitis a postala je ženom "
"Nabukodonosora II. kojoj je ovaj izgradio Viseće vrtove Babilona. "
"Kijaksar je modernizirao svoju vojsku i uništio Ninivu 612. prije Krista. "
"Naslijedio ga je njegov sin, posljednji medijski kralj, Astijag, kojega "
"je detronizirao (srušio sa vlasti) njegov unuk Kir Veliki. Zemljom su "
"zavladali Perzijanci. Hrvatska je zemlja situacija u Europi. Ona ima "
"bogatu kulturu i ukusna jela.";
const char *const NNetLangIdTestData::kTestStrHT =
" ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta "
"fèt pou li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo "
"marye kit yo pa marye tout manman ki fè pitit leta fèt pou ba yo konkoul "
"menm jan tou pou timoun piti ak pou";
const char *const NNetLangIdTestData::kTestStrHU =
" a felhasználóim a google azonosító szöveget ikor látják a felhasználóim "
"a google azonosító szöveget felhasználók a google azonosító szöveget "
"fogják látni minden tranzakció után ha a vásárlását regisztrációját "
"oldalunk";
const char *const NNetLangIdTestData::kTestStrHY =
" ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ "
"քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է";
const char *const NNetLangIdTestData::kTestStrID =
"berdiri setelah pengurusnya yang berusia 83 tahun, Fayzrahman Satarov, "
"mendeklarasikan diri sebagai nabi dan rumahnya sebagai negara Islam "
"Satarov digambarkan sebagai mantan ulama Islam tahun 1970-an. "
"Pengikutnya didorong membaca manuskripnya dan kebanyakan dilarang "
"meninggalkan tempat persembunyian bawah tanah di dasar gedung delapan "
"lantai mereka. Jaksa membuka penyelidikan kasus kriminal pada kelompok "
"itu dan menyatakan akan membubarkan kelompok kalau tetap melakukan "
"kegiatan ilegal seperti mencegah anggotanya mencari bantuan medis atau "
"pendidikan. Sampai sekarang pihak berwajib belum melakukan penangkapan "
"meskipun polisi mencurigai adanya tindak kekerasan pada anak. Pengadilan "
"selanjutnya akan memutuskan apakah anak-anak diizinkan tetap tinggal "
"dengan orang tua mereka. Kazan yang berada sekitar 800 kilometer di timur "
"Moskow merupakan wilayah Tatarstan yang";
const char *const NNetLangIdTestData::kTestStrIG =
"Chineke bụ aha ọzọ ndï omenala Igbo kpọro Chukwu. Mgbe ndị bekee bịara, "
"ha mee ya nke ndi Christian. N'echiche ndi ekpere chi Omenala Ndi Igbo, "
"Christianity, Judaism, ma Islam, Chineke nwere ọtụtụ utu aha, ma nwee "
"nanị otu aha. Ụzọ abụọ e si akpọ aha ahụ bụ Jehovah ma Ọ bụ Yahweh. Na "
"ọtụtụ Akwụkwọ Nsọ, e wepụla aha Chineke ma jiri utu aha bụ Onyenwe Anyị "
"ma ọ bụ Chineke dochie ya. Ma mgbe e dere akwụkwọ nsọ, aha ahụ bụ Jehova "
"pụtara nime ya, ihe dị ka ugboro pụkụ asaa(7,000).";
const char *const NNetLangIdTestData::kTestStrIS =
" a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa "
"byggja upp aðallista yfir ný leitarorð fyrir auglýsingahópana og skoða "
"ítarleg gögn um árangur leitarorða eins og samkeppni auglýsenda og "
"leitarmagn er krafist notkun";
const char *const NNetLangIdTestData::kTestStrIT =
" a causa di un intervento di manutenzione del sistema fino alle ore circa "
"ora legale costa del pacifico del novembre le campagne esistenti "
"continueranno a essere pubblicate come di consueto anche durante questo "
"breve periodo di inattività ci scusiamo per";
const char *const NNetLangIdTestData::kTestStrIW =
" או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך "
"ב";
const char *const NNetLangIdTestData::kTestStrJA =
" このペ ジでは アカウントに指定された予算の履歴を一覧にしています "
"それぞれの項目には 予算額と特定期間のステ タスが表示されます "
"現在または今後の予算を設定するには";
const char *const NNetLangIdTestData::kTestStrJV =
"Iki Piece cendhak teks sing bakal digunakake kanggo Testing akurasi "
"pendekatan anyar.";
const char *const NNetLangIdTestData::kTestStrKA =
" ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში "
"გადაინაცვლებს ორი უჯრით";
const char *const NNetLangIdTestData::kTestStrKK =
" а билердің өзіне рұқсат берілмеген егер халық талап етсе ғана хан "
"келісім берген өздеріңіз білесіздер қр қыл мыс тық кодексінде жазаның";
const char *const NNetLangIdTestData::kTestStrKM =
"នេះគឺជាបំណែកខ្លីនៃអត្ថបទដែលនឹងត្រូវបានប្រើសម្រាប់ការធ្វើតេស្តភាពត្រឹមត្រូវ"
"នៃវិធីសាស្រ្តថ្មីនេះ។";
const char *const NNetLangIdTestData::kTestStrKN =
" ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ"
" ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ";
const char *const NNetLangIdTestData::kTestStrKO =
" 개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 "
"부여사용자에게 프로필 리포트에 액세스할 수 있는 권한을 부여하시려면 가용 "
"프로필 상자에서 프로필 이름을 선택한 다음";
const char *const NNetLangIdTestData::kTestStrLA =
" a deo qui enim nocendi causa mentiri solet si iam consulendi causa "
"mentiatur multum profecit sed aliud est quod per se ipsum laudabile "
"proponitur aliud quod in deterioris comparatione praeponitur aliter enim "
"gratulamur cum sanus est homo aliter cum melius";
const char *const NNetLangIdTestData::kTestStrLO =
" ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ"
" ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້";
const char *const NNetLangIdTestData::kTestStrLT =
" a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių "
"pasimokiau penki dalykai be kurių negaliu gyventi mano miegamajame tu "
"surasi ideali pora išsilavinimas aukštoji mokykla koledžas universitetas "
"pagrindinis laipsnis metai";
const char *const NNetLangIdTestData::kTestStrLV =
" a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas "
"saistītas ar sezonas izpārdošanu speciālajām atlaidēm u c ir parastas un "
"atslēgvārdi kas ir populāri noteiktos laika posmos šajā laikā saņems "
"lielāku klikšķu";
const char *const NNetLangIdTestData::kTestStrMG =
" amporisihin i ianao mba hijery ny dika teksta ranofotsiny an ity "
"lahatsoratra ity tsy ilaina ny opérateur efa karohina daholo ny teny "
"rehetra nosoratanao ampiasao anaovana dokambarotra i google telugu datin "
"ny takelaka fikarohana sary renitakelak i";
const char *const NNetLangIdTestData::kTestStrMI =
" haere ki te kainga o o haere ki te kainga o o haere ki te kainga o te "
"rapunga ahua o haere ki te kainga o ka tangohia he ki to rapunga kaore au "
"mohio te tikanga whakatiki o te ra he whakaharuru te pai rapunga a te "
"rapunga ahua a e kainga o nga awhina o te";
const char *const NNetLangIdTestData::kTestStrMK =
" гласовите коалицијата на вмро дпмне како партија со најмногу освоени "
"гласови ќе добие евра а на сметката на коализијата за македонија";
const char *const NNetLangIdTestData::kTestStrML =
" അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് "
"നിന്നു എഴുന്നേറ്റു";
const char *const NNetLangIdTestData::kTestStrMN =
" а боловсронгуй болгох орон нутгийн ажил үйлсийг уялдуулж зохицуулах "
"дүрэм журам боловсруулах орон нутгийн өмч хөрөнгө санхүүгийн";
const char *const NNetLangIdTestData::kTestStrMR =
"हैदराबाद उच्चार ऐका (सहाय्य·माहिती)तेलुगू: హైదరాబాదు , उर्दू:"
" حیدر آباد हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर"
" आहे. हैदराबादची लोकसंख्या ७७ लाख ४० हजार ३३४ आहे. मोत्यांचे शहर"
" अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक, सांस्कृतिक आणि "
"स्थापत्यशास्त्रीय वारसा लाभला आहे. १९९० नंतर शिक्षण आणि माहिती तंत्रज्ञान"
" त्याचप्रमाणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची"
" वाढ शहरात झाली. दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे"
" हैदराबाद हे केंद्र आहे";
const char *const NNetLangIdTestData::kTestStrMS =
"pengampunan beramai-ramai supaya mereka pulang ke rumah masing-masing. "
"Orang-orang besarnya enggan mengiktiraf sultan yang dilantik oleh Belanda "
"sebagai Yang DiPertuan Selangor. Orang ramai pula tidak mahu menjalankan "
"perniagaan bijih timah dengan Belanda, selagi raja yang berhak tidak "
"ditabalkan. Perdagang yang lain dibekukan terus kerana untuk membalas "
"jasa beliau yang membantu Belanda menentang Riau, Johor dan Selangor. Di "
"antara tiga orang Sultan juga dipandang oleh rakyat sebagai seorang "
"sultan yang paling gigih. 1 | 2 SULTAN Sebagai ganti Sultan Ibrahim "
"ditabalkan Raja Muhammad iaitu Raja Muda. Walaupun baginda bukan anak "
"isteri pertama bergelar Sultan Muhammad bersemayam di Kuala Selangor "
"juga. Pentadbiran baginda yang lemah itu menyebabkan Kuala Selangor "
"menjadi sarang ioleh Cina di Lukut tidak diambil tindakan, sedangkan "
"baginda sendiri banyak berhutang kepada 1";
const char *const NNetLangIdTestData::kTestStrMT =
" ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il "
"buttuna ikteb żid numri tfittxijja tal kotba mur print home kotba minn "
"pagni ghal pagna minn ghall ktieb ta aċċessa stieden habib iehor grazzi "
"it tim tal gruppi google";
const char *const NNetLangIdTestData::kTestStrMY =
" တက္ကသုိလ္ မ္ဟ ပ္ရန္ လာ္ရပီးေနာက္ န္ဟစ္ အရ္ဝယ္ ဦးသန္ ့သည္ ပန္"
" းတနော္ အမ္ယုိးသား ေက္ယာင္ း";
const char *const NNetLangIdTestData::kTestStrNE =
"अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू";
const char *const NNetLangIdTestData::kTestStrNL =
" a als volgt te werk om een configuratiebestand te maken sitemap gen py "
"ebruik filters om de s op te geven die moeten worden toegevoegd of "
"uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s "
"bevatten voor een bepaalde opmaaktaal dit";
const char *const NNetLangIdTestData::kTestStrNO =
" a er obligatorisk tidsforskyvning plassering av katalogsøk "
"planinformasjon loggfilbane gruppenavn kontoinformasjon passord domene "
"gruppeinformasjon alle kampanjesporing alternativ bruker grupper "
"oppgaveplanlegger oppgavehistorikk kontosammendrag antall";
const char *const NNetLangIdTestData::kTestStrNY =
"Boma ndi gawo la dziko lomwe linapangidwa ndi cholinga chothandiza "
"ntchito yolamulira. Kuŵalako kulikuunikabe mandita, Edipo nyima "
"unalephera kugonjetsa kuŵalako.";
const char *const NNetLangIdTestData::kTestStrPA =
" ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ"
" ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ";
const char *const NNetLangIdTestData::kTestStrPL =
" a australii będzie widział inne reklamy niż użytkownik z kanady "
"kierowanie geograficzne sprawia że reklamy są lepiej dopasowane do "
"użytkownika twojej strony oznacza to także że możesz nie zobaczyć "
"wszystkich reklam które są wyświetlane na";
const char *const NNetLangIdTestData::kTestStrPT =
" a abit prevê que a entrada desses produtos estrangeiros no mercado "
"têxtil e vestuário do brasil possa reduzir os preços em cerca de a partir "
"de má notícia para os empresários que terão que lutar para garantir suas "
"margens de lucro mas boa notícia";
const char *const NNetLangIdTestData::kTestStrRO =
" a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai "
"atunci când pe site ul dvs survine o acţiune dorită site urile negative "
"nu pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare "
"sau constructoare să";
const char *const NNetLangIdTestData::kTestStrRU =
" а неправильный формат идентификатора дн назад";
const char *const NNetLangIdTestData::kTestStrSI =
" අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි "
"් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද";
const char *const NNetLangIdTestData::kTestStrSK =
" a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte "
"prispôsobiť uložte ju ako šablónu a pokračujte v úprave vyberte si jednu "
"z možností nižšie a kliknite na tlačidlo uložiť kampaň nastavenia kampane "
"môžete ľubovoľne";
const char *const NNetLangIdTestData::kTestStrSL =
" adsense stanje prijave za google adsense google adsense račun je bil "
"začasno zamrznjen pozdravljeni hvala za vaše zanimanje v google adsense "
"po pregledu vaše prijavnice so naši strokovnjaki ugotovili da spletna "
"stran ki je trenutno povezana z vašim";
const char *const NNetLangIdTestData::kTestStrSO =
" a oo maanta bogga koobaad ugu qoran yahey beesha caalamka laakiin si "
"kata oo beesha caalamku ula guntato soomaaliya waxa aan shaki ku jirin in "
"aakhirataanka dadka soomaalida oo kaliya ay yihiin ku soomaaliya ka saari "
"kara dhibka ay ku jirto";
const char *const NNetLangIdTestData::kTestStrSQ =
" a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës zoti thaçi "
"prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyrisht se do "
"të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur "
"dhe sovran të";
const char *const NNetLangIdTestData::kTestStrSR =
"балчак балчак на мапи србије уреди демографија у насељу балчак живи "
"пунолетна становника а просечна старост становништва износи година";
const char *const NNetLangIdTestData::kTestStrST =
" bang ba nang le thahasello matshwao a sehlooho thuto e thehilweng hodima "
"diphetho ke tsela ya ho ruta le ho ithuta e totobatsang hantle seo "
"baithuti ba lokelang ho se fihlella ntlhatheo eo e sebetsang ka yona ke "
"ya hore titjhere o hlakisa pele seo";
const char *const NNetLangIdTestData::kTestStrSU =
"Nu ngatur kahirupan warga, keur kapentingan pamarentahan diatur ku RT, RW "
"jeung Kepala Dusun, sedengkeun urusan adat dipupuhuan ku Kuncen jeung "
"kepala adat. Sanajan Kampung Kuta teu pati anggang jeung lembur sejenna "
"nu aya di wewengkon Desa Pasir Angin, tapi boh wangunan imah atawa "
"tradisi kahirupan masarakatna nenggang ti nu lian.";
const char *const NNetLangIdTestData::kTestStrSV =
" a bort objekt från google desktop post äldst meny öretag dress etaljer "
"alternativ för vad är inne yaste google skrivbord plugin program för "
"nyheter google visa nyheter som är anpassade efter de artiklar som du "
"läser om du till exempel läser";
const char *const NNetLangIdTestData::kTestStrSW =
" a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na "
"kushiriki mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi "
"lugha hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka "
"udhibitisho wa neno kwa haraka fikia";
const char *const NNetLangIdTestData::kTestStrTA =
" அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும்"
" உள்ளது தொகு";
const char *const NNetLangIdTestData::kTestStrTE =
" ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు "
"విశ్వదాభిరామ వినర వేమ";
const char *const NNetLangIdTestData::kTestStrTG =
" адолат ва инсондӯстиро бар фашизм нажодпарастӣ ва адоват тарҷеҳ додааст "
"чоп кунед ба дигарон фиристед чоп кунед ба дигарон фиристед";
const char *const NNetLangIdTestData::kTestStrTH =
" กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา "
"ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา";
const char *const NNetLangIdTestData::kTestStrTR =
" a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için "
"günlük bütçenizi gözden geçirebileceğiniz yeri arıyorsanız kampanya "
"yönetimi ne gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi "
"tıklayın sunumu";
const char *const NNetLangIdTestData::kTestStrUK =
" а більший бюджет щоб забезпечити собі максимум прибутків від переходів "
"відстежуйте свої об яви за датою географічним розташуванням";
const char *const NNetLangIdTestData::kTestStrUR =
" آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں اگر "
"آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو";
const char *const NNetLangIdTestData::kTestStrUZ =
" abadiylashtirildi aqsh ayol prezidentga tayyormi markaziy osiyo afg "
"onistonga qanday yordam berishi mumkin ukrainada o zbekistonlik "
"muhojirlar tazyiqdan shikoyat qilmoqda gruziya va ukraina hozircha natoga "
"qabul qilinmaydi afg oniston o zbekistonni g";
const char *const NNetLangIdTestData::kTestStrVI =
" adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín"
" dụng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt"
" hàng dạng cấp dữ liệu ác minh trang web của bạn để xem";
const char *const NNetLangIdTestData::kTestStrYI =
"אן פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט "
"אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער";
const char *const NNetLangIdTestData::kTestStrYO =
" abinibi han ikawe alantakun le ni opolopo ede abinibi ti a to lesese bi "
"eniyan to fe lo se fe lati se atunse jowo mo pe awon oju iwe itakunagbaye "
"miran ti ako ni oniruru ede abinibi le faragba nipa atunse ninu se iwadi "
"blogs ni ori itakun agbaye ti e ba";
const char *const NNetLangIdTestData::kTestStrZH =
"产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 "
"对于要提交的图书 我确认 我是版权所有者或已得到版权所有者的授权 "
"要更改您的国家 地区 请在此表的最上端更改您的";
const char *const NNetLangIdTestData::kTestStrZU =
" ana engu uma inkinga iqhubeka siza ubike kwi isexwayiso ngenxa yephutha "
"lomlekeleli sikwazi ukubuyisela emuva kuphela imiphumela engaqediwe "
"ukuthola imiphumela eqediwe zama ukulayisha kabusha leli khasi emizuzwini "
"engu uma inkinga iqhubeka siza uthumele";
} // namespace chrome_lang_id

View File

@@ -0,0 +1,117 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef NNET_LANG_ID_TEST_DATA_H_
#define NNET_LANG_ID_TEST_DATA_H_
namespace chrome_lang_id {
// Holder for the sample texts used by the language identification tests.
// The class only declares static string constants; their values are defined
// in the source file that includes this header.
class NNetLangIdTestData {
public:
// Pieces of text in different languages. Each constant is named kTestStr
// followed by a language code (for example, kTestStrEN holds English text and
// kTestStrBG holds Bulgarian text). All of them are NUL-terminated C strings.
static const char *const kTestStrAF;
static const char *const kTestStrAR;
static const char *const kTestStrAZ;
static const char *const kTestStrBE;
static const char *const kTestStrBG;
static const char *const kTestStrBN;
static const char *const kTestStrBS;
static const char *const kTestStrCA;
static const char *const kTestStrCEB;
static const char *const kTestStrCS;
static const char *const kTestStrCY;
static const char *const kTestStrDA;
static const char *const kTestStrDE;
static const char *const kTestStrEL;
static const char *const kTestStrEN;
static const char *const kTestStrEO;
static const char *const kTestStrES;
static const char *const kTestStrET;
static const char *const kTestStrEU;
static const char *const kTestStrFA;
static const char *const kTestStrFI;
static const char *const kTestStrFIL;
static const char *const kTestStrFR;
static const char *const kTestStrGA;
static const char *const kTestStrGL;
static const char *const kTestStrGU;
static const char *const kTestStrHA;
static const char *const kTestStrHI;
static const char *const kTestStrHMN;
static const char *const kTestStrHR;
static const char *const kTestStrHT;
static const char *const kTestStrHU;
static const char *const kTestStrHY;
static const char *const kTestStrID;
static const char *const kTestStrIG;
static const char *const kTestStrIS;
static const char *const kTestStrIT;
static const char *const kTestStrIW;
static const char *const kTestStrJA;
static const char *const kTestStrJV;
static const char *const kTestStrKA;
static const char *const kTestStrKK;
static const char *const kTestStrKM;
static const char *const kTestStrKN;
static const char *const kTestStrKO;
static const char *const kTestStrLA;
static const char *const kTestStrLO;
static const char *const kTestStrLT;
static const char *const kTestStrLV;
static const char *const kTestStrMG;
static const char *const kTestStrMI;
static const char *const kTestStrMK;
static const char *const kTestStrML;
static const char *const kTestStrMN;
static const char *const kTestStrMR;
static const char *const kTestStrMS;
static const char *const kTestStrMT;
static const char *const kTestStrMY;
static const char *const kTestStrNE;
static const char *const kTestStrNL;
static const char *const kTestStrNO;
static const char *const kTestStrNY;
static const char *const kTestStrPA;
static const char *const kTestStrPL;
static const char *const kTestStrPT;
static const char *const kTestStrRO;
static const char *const kTestStrRU;
static const char *const kTestStrSI;
static const char *const kTestStrSK;
static const char *const kTestStrSL;
static const char *const kTestStrSO;
static const char *const kTestStrSQ;
static const char *const kTestStrSR;
static const char *const kTestStrST;
static const char *const kTestStrSU;
static const char *const kTestStrSV;
static const char *const kTestStrSW;
static const char *const kTestStrTA;
static const char *const kTestStrTE;
static const char *const kTestStrTG;
static const char *const kTestStrTH;
static const char *const kTestStrTR;
static const char *const kTestStrUK;
static const char *const kTestStrUR;
static const char *const kTestStrUZ;
static const char *const kTestStrVI;
static const char *const kTestStrYI;
static const char *const kTestStrYO;
static const char *const kTestStrZH;
static const char *const kTestStrZU;
};
} // namespace chrome_lang_id
#endif // NNET_LANG_ID_TEST_DATA_H_

View File

@@ -0,0 +1,386 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "nnet_language_identifier.h"
#include <math.h>
#include <algorithm>
#include <limits>
#include <string>
#include "base.h"
#include "embedding_network.h"
#include "registry.h"
#include "relevant_script_feature.h"
#include "script_span/generated_ulscript.h"
#include "script_span/getonescriptspan.h"
#include "script_span/text_processing.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"
namespace chrome_lang_id {
namespace {
// Per-language accumulator that is updated as text subsequences of the same
// script are processed.
struct LangChunksStats {
  // Sum of probabilities across subsequences.
  float prob_sum = 0.0f;

  // Total number of bytes attributed to the language.
  int byte_sum = 0;

  // Number of chunks attributed to the language.
  int num_chunks = 0;

  // Byte ranges that the language applies to.
  std::vector<NNetLanguageIdentifier::SpanInfo> byte_ranges;
};
// Ordering predicate for (name, value) pairs: pairs with a larger value come
// first; pairs with equal values are ordered by ascending name.
bool OrderBySecondDescending(const std::pair<string, float> &x,
                             const std::pair<string, float> &y) {
  if (x.second != y.second) {
    return x.second > y.second;
  }
  return x.first < y.first;
}
// Returns "true" if the language prediction is reliable, i.e. its probability
// reaches the threshold that applies to the predicted language, and "false"
// otherwise. "hr" and "bs" use their own threshold; all other languages share
// the general one.
bool ResultIsReliable(const string &language, float probability) {
  const bool is_hr_or_bs = (language == "hr") || (language == "bs");
  const float threshold =
      is_hr_or_bs ? NNetLanguageIdentifier::kReliabilityHrBsThreshold
                  : NNetLanguageIdentifier::kReliabilityThreshold;
  return probability >= threshold;
}
// Finds the number of interchange-valid bytes to process: the length of the
// interchange-valid UTF8 span at the start of |text|, examining at most
// kMaxNumInputBytesToConsider bytes.
int FindNumValidBytesToProcess(const string &text) {
  // The size of the input may not fit into an int. If it does not, focus on
  // the first std::numeric_limits<int>::max() bytes.
  const size_t int_max = static_cast<size_t>(std::numeric_limits<int>::max());
  const int clamped_size =
      static_cast<int>(std::min<size_t>(text.size(), int_max));

  // Truncate the input if it is too long, then measure the span containing
  // interchange-valid UTF8.
  const int num_bytes_to_scan = std::min(
      NNetLanguageIdentifier::kMaxNumInputBytesToConsider, clamped_size);
  return CLD2::SpanInterchangeValid(text.c_str(), num_bytes_to_scan);
}
} // namespace
// Definitions of the static constants declared in NNetLanguageIdentifier.
// Byte limits that the default constructor passes to the two-argument
// constructor.
const int NNetLanguageIdentifier::kMinNumBytesToConsider = 140;
const int NNetLanguageIdentifier::kMaxNumBytesToConsider = 700;
// Upper bound on the number of input bytes examined by
// FindNumValidBytesToProcess().
const int NNetLanguageIdentifier::kMaxNumInputBytesToConsider = 10000;
// Number of snippets used when max_num_bytes_ is larger than this value (see
// the two-argument constructor).
const int NNetLanguageIdentifier::kNumSnippets = 5;
// Language string reported when no prediction is made.
const char NNetLanguageIdentifier::kUnknown[] = "und";
// Minimum probability for a reliable prediction; "hr" and "bs" use the second,
// lower threshold (see ResultIsReliable()).
const float NNetLanguageIdentifier::kReliabilityThreshold = 0.7f;
const float NNetLanguageIdentifier::kReliabilityHrBsThreshold = 0.5f;
// Returns the fixed argument prefix string of this feature extractor.
// NOTE(review): presumably used to look up the extractor's parameters in the
// TaskContext -- confirm against EmbeddingFeatureExtractor.
const string LanguageIdEmbeddingFeatureExtractor::ArgPrefix() const {
return "language_identifier";
}
// Default constructor: delegates to the two-argument constructor with the
// default byte limits kMinNumBytesToConsider and kMaxNumBytesToConsider.
NNetLanguageIdentifier::NNetLanguageIdentifier()
: NNetLanguageIdentifier(kMinNumBytesToConsider, kMaxNumBytesToConsider) {}
// Factory functions passed to the registrars created in the
// NNetLanguageIdentifier constructor. Each call returns a newly allocated
// feature function object of the corresponding type.
static WholeSentenceFeature *cbog_factory() {
return new ContinuousBagOfNgramsFunction;
}
static WholeSentenceFeature *rsf_factory() { return new RelevantScriptFeature; }
static WholeSentenceFeature *sf_factory() { return new ScriptFeature; }
// Builds an identifier that needs at least |min_num_bytes| bytes to make a
// prediction and uses at most |max_num_bytes| bytes for it. Requires
// max_num_bytes > 0 and 0 <= min_num_bytes < max_num_bytes (enforced with
// CLD3_CHECK). Also creates the WholeSentenceFeature registry if it does not
// exist yet, registers the three feature functions, and sets up and
// initializes the feature extractor from the TaskContextParams.
NNetLanguageIdentifier::NNetLanguageIdentifier(int min_num_bytes,
int max_num_bytes)
: num_languages_(TaskContextParams::GetNumLanguages()),
network_(&nn_params_),
min_num_bytes_(min_num_bytes),
max_num_bytes_(max_num_bytes) {
CLD3_CHECK(max_num_bytes_ > 0);
CLD3_CHECK(min_num_bytes_ >= 0);
CLD3_CHECK(min_num_bytes_ < max_num_bytes_);
// Use a single snippet when the byte budget does not exceed kNumSnippets;
// otherwise split the budget evenly (integer division) across kNumSnippets
// snippets.
num_snippets_ = (max_num_bytes_ <= kNumSnippets) ? 1 : kNumSnippets;
snippet_size_ = max_num_bytes_ / num_snippets_;
if (WholeSentenceFeature::registry() == nullptr) {
// Create registry for our WholeSentenceFeature(s).
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
"sentence feature function", "WholeSentenceFeature", __FILE__,
__LINE__);
}
// Register our WholeSentenceFeature(s). The registrars are function-local
// statics, so each is constructed only the first time this constructor runs.
// Register ContinuousBagOfNgramsFunction feature function.
static WholeSentenceFeature::Registry::Registrar cbog_registrar(
WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
"ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);
// Register RelevantScriptFeature feature function.
static WholeSentenceFeature::Registry::Registrar rsf_registrar(
WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
"RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);
// Register ScriptFeature feature function.
static WholeSentenceFeature::Registry::Registrar sf_registrar(
WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
__LINE__, sf_factory);
// Get the model parameters, set up and initialize the model.
TaskContext context;
TaskContextParams::ToTaskContext(&context);
Setup(&context);
Init(&context);
}
// Nothing to release explicitly; the members clean up after themselves.
NNetLanguageIdentifier::~NNetLanguageIdentifier() = default;
// Forwards the setup step to the feature extractor, using |context|.
void NNetLanguageIdentifier::Setup(TaskContext *context) {
feature_extractor_.Setup(context);
}
// Initializes the feature extractor from |context| and then lets it request
// its workspaces from workspace_registry_, which GetFeatures() later uses to
// reset a WorkspaceSet.
void NNetLanguageIdentifier::Init(TaskContext *context) {
feature_extractor_.Init(context);
feature_extractor_.RequestWorkspaces(&workspace_registry_);
}
// Runs the feature extractor over |sentence| and stores the result in
// |features|.
void NNetLanguageIdentifier::GetFeatures(
    Sentence *sentence, std::vector<FeatureVector> *features) const {
  // Workspace set for this call, reset against the workspace registry.
  WorkspaceSet workspaces;
  workspaces.Reset(workspace_registry_);

  // Preprocess first, then extract using the same workspace set.
  feature_extractor_.Preprocess(&workspaces, sentence);
  feature_extractor_.ExtractFeatures(workspaces, *sentence, features);
}
// Returns the language name corresponding to the given id. |language_id| must
// lie in [0, num_languages_); both bounds are enforced with CLD3_CHECK before
// the name is looked up in TaskContextParams.
string NNetLanguageIdentifier::GetLanguageName(int language_id) const {
CLD3_CHECK(language_id >= 0);
CLD3_CHECK(language_id < num_languages_);
return TaskContextParams::language_names(language_id);
}
// Predicts the language of |text| after cleaning it and squeezing out
// repetitive content. Returns a default-constructed Result when fewer than
// min_num_bytes_ bytes remain after either of those steps.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguage(
    const string &text) {
  const int num_valid_bytes = FindNumValidBytesToProcess(text);

  // Iterate over the input with ScriptScanner to clean up the text (e.g.,
  // removing digits, punctuation, brackets).
  // TODO(abakalov): Extract the code that does the clean-up out of
  // ScriptScanner.
  CLD2::ScriptScanner scanner(text.c_str(), num_valid_bytes,
                              /*is_plain_text=*/true);
  CLD2::LangSpan span;
  string cleaned_text;
  while (scanner.GetOneScriptSpanLower(&span)) {
    // Each span has spaces at the beginning and the end, so no delimiter is
    // needed between spans.
    cleaned_text.append(span.text, span.text_bytes);
  }
  if (static_cast<int>(cleaned_text.size()) < min_num_bytes_) {
    return Result();
  }

  // The squeeze step needs a non-const char*, so work on a NUL-terminated
  // copy of the cleaned text.
  std::vector<char> buffer(cleaned_text.begin(), cleaned_text.end());
  buffer.push_back('\0');

  // Remove repetitive chunks or ones containing mostly spaces.
  const int default_chunk_size = 0;  // Use the default.
  char *const buffer_begin = buffer.data();
  const int squeezed_length = CLD2::CheapSqueezeInplace(
      buffer_begin, buffer.size() - 1, default_chunk_size);
  if (squeezed_length < min_num_bytes_) {
    return Result();
  }

  return FindLanguageOfValidUTF8(
      SelectTextGivenBeginAndSize(buffer_begin, squeezed_length));
}
// Predicts the language of |text|, which is assumed to be interchange-valid
// UTF8: extracts features, scores them with the network, picks the best
// scoring language and converts its score into a probability.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguageOfValidUTF8(
    const string &text) {
  // Wrap the input text in a Sentence.
  Sentence sentence;
  sentence.set_text(text);

  // Extract the features and score them.
  // TODO(salcianu): reuse vector<FeatureVector>.
  std::vector<FeatureVector> features(feature_extractor_.NumEmbeddings());
  GetFeatures(&sentence, &features);
  EmbeddingNetwork::Vector scores;
  network_.ComputeFinalScores(features, &scores);

  // Find the highest score; on ties the lowest index is kept.
  int best_id = -1;
  float best_score = -std::numeric_limits<float>::infinity();
  for (size_t id = 0; id < scores.size(); ++id) {
    if (scores[id] > best_score) {
      best_score = scores[id];
      best_id = static_cast<int>(id);
    }
  }

  // Compute the probability of the best language. Every score is shifted by
  // the best score before exponentiation.
  float shifted_exp_sum = 0.0;
  for (size_t id = 0; id < scores.size(); ++id) {
    shifted_exp_sum += exp(scores[id] - best_score);
  }
  const float log_sum_exp = best_score + log(shifted_exp_sum);

  Result result;
  result.probability = exp(best_score - log_sum_exp);
  result.language = GetLanguageName(best_id);
  result.is_reliable = ResultIsReliable(result.language, result.probability);
  result.proportion = 1.0;
  return result;
}
std::vector<NNetLanguageIdentifier::Result>
NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
int num_langs) {
std::vector<Result> results;
// Truncate the input text if it is too long and find the span containing
// interchange-valid UTF8.
const int num_valid_bytes = FindNumValidBytesToProcess(text);
if (num_valid_bytes == 0) {
while (num_langs-- > 0) {
results.emplace_back();
}
return results;
}
// Process each subsequence of the same script.
CLD2::ScriptScanner ss(text.c_str(), num_valid_bytes, /*is_plain_text=*/true);
CLD2::LangSpan script_span;
std::unordered_map<string, LangChunksStats> lang_stats;
int total_num_bytes = 0;
int chunk_size = 0; // Use the default.
while (ss.GetOneScriptSpanLower(&script_span)) {
const int num_original_span_bytes = script_span.text_bytes;
// Remove repetitive chunks or ones containing mostly spaces.
const int new_length = CLD2::CheapSqueezeInplace(
script_span.text, script_span.text_bytes, chunk_size);
script_span.text_bytes = new_length;
if (script_span.text_bytes < min_num_bytes_) {
continue;
}
total_num_bytes += num_original_span_bytes;
const string selected_text = SelectTextGivenScriptSpan(script_span);
Result result = FindLanguageOfValidUTF8(selected_text);
string language = result.language;
lang_stats[language].byte_sum += num_original_span_bytes;
lang_stats[language].prob_sum +=
result.probability * num_original_span_bytes;
lang_stats[language].num_chunks++;
// Add SpanInfo. Start and end indices are relative to original input.
lang_stats[language].byte_ranges.push_back(SpanInfo(
ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability));
}
// Sort the languages based on the number of bytes associated with them.
// TODO(abakalov): Consider alternative possibly more efficient portable
// approaches for finding the top N languages. Given that on average, there
// aren't that many languages in the input, it's likely that the benefits will
// be negligible (if any).
std::vector<std::pair<string, float>> langs_and_byte_counts;
for (const auto &entry : lang_stats) {
langs_and_byte_counts.emplace_back(entry.first, entry.second.byte_sum);
}
std::sort(langs_and_byte_counts.begin(), langs_and_byte_counts.end(),
OrderBySecondDescending);
const float byte_sum = static_cast<float>(total_num_bytes);
const int num_langs_to_save =
std::min(num_langs, static_cast<int>(langs_and_byte_counts.size()));
for (int indx = 0; indx < num_langs_to_save; ++indx) {
Result result;
const string &language = langs_and_byte_counts.at(indx).first;
const LangChunksStats &stats = lang_stats.at(language);
result.language = language;
result.probability = stats.prob_sum / stats.byte_sum;
result.proportion = stats.byte_sum / byte_sum;
result.is_reliable = ResultIsReliable(language, result.probability);
result.byte_ranges = stats.byte_ranges;
results.push_back(result);
}
int padding_size = num_langs - langs_and_byte_counts.size();
while (padding_size-- > 0) {
results.emplace_back();
}
return results;
}
// Convenience overload: selects the text used for prediction from the bytes
// described by |script_span| (its text pointer and text_bytes length) by
// forwarding to SelectTextGivenBeginAndSize().
string NNetLanguageIdentifier::SelectTextGivenScriptSpan(
const CLD2::LangSpan &script_span) {
return SelectTextGivenBeginAndSize(script_span.text, script_span.text_bytes);
}
// Produces the string used for a prediction from the |text_size| bytes that
// start at |text_begin|. Inputs of at most max_num_bytes_ bytes are copied
// unchanged. Longer inputs are reduced to num_snippets_ snippets that are
// equally spread out throughout the input, each followed by a single space.
string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
    const char *text_begin, int text_size) {
  string output_text;

  // Short enough: use the whole input.
  if (text_size <= max_num_bytes_) {
    output_text.append(text_begin, text_size);
    return output_text;
  }

  // Number of bytes between the snippets.
  const int num_skip_bytes =
      (text_size - max_num_bytes_) / (num_snippets_ + 1);

  // |cursor| marks the end of the previously emitted snippet.
  const char *cursor = text_begin;
  for (int snippet = 0; snippet < num_snippets_; ++snippet) {
    // SpanInterchangeValid is used to find both offsets so that a character is
    // never split in two.
    const char *const snippet_begin =
        cursor + CLD2::SpanInterchangeValid(cursor, num_skip_bytes);
    const int actual_snippet_size =
        CLD2::SpanInterchangeValid(snippet_begin, snippet_size_);
    output_text.append(snippet_begin, actual_snippet_size);
    output_text.append(" ");
    cursor = snippet_begin + actual_snippet_size;
  }
  return output_text;
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,191 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef NNET_LANGUAGE_IDENTIFIER_H_
#define NNET_LANGUAGE_IDENTIFIER_H_
#include <string>
#include "base.h"
#include "embedding_feature_extractor.h"
#include "embedding_network.h"
#include "lang_id_nn_params.h"
#include "language_identifier_features.h"
#include "script_span/getonescriptspan.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "task_context_params.h"
#include "cld_3/protos/task_spec.pb.h"
#include "workspace.h"
namespace chrome_lang_id {
// Specialization of the EmbeddingFeatureExtractor for extracting from
// (Sentence, int). It only overrides ArgPrefix(), whose definition returns the
// prefix "language_identifier".
class LanguageIdEmbeddingFeatureExtractor
: public EmbeddingFeatureExtractor<WholeSentenceExtractor, Sentence> {
public:
// Argument prefix of this extractor.
const string ArgPrefix() const override;
};
// Class for detecting the language of a document.
class NNetLanguageIdentifier {
public:
// Holds probability that Span, specified by start/end indices, is a given
// language. The language is not stored here; it can be found in Result, which
// holds a vector of SpanInfo.
struct SpanInfo {
SpanInfo(int start_index_val, int end_index_val, float probability_val)
: start_index(start_index_val),
end_index(end_index_val),
probability(probability_val) {}
int start_index = -1;
int end_index = -1;
float probability = 0.0;
};
// Information about a predicted language. A default-constructed Result
// describes an unknown language (kUnknown, probability 0, not reliable).
struct Result {
string language = kUnknown;
float probability = 0.0; // Language probability.
bool is_reliable = false; // Whether the prediction is reliable.
// Proportion of bytes associated with the language. If FindLanguage is
// called, this variable is set to 1.
float proportion = 0.0;
// Specifies the byte ranges that |language| applies to.
std::vector<SpanInfo> byte_ranges;
};
// The default constructor uses kMinNumBytesToConsider and
// kMaxNumBytesToConsider as the byte limits; the two-argument constructor
// takes them explicitly.
NNetLanguageIdentifier();
NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes);
~NNetLanguageIdentifier();
// Finds the most likely language for the given text, along with additional
// information (e.g., probability). The prediction is based on the first N
// bytes where N is the minimum between the number of interchange valid UTF8
// bytes and max_num_bytes_. If N is less than min_num_bytes_ long, then this
// function returns kUnknown.
Result FindLanguage(const string &text);
// Splits the input text (up to the first byte, if any, that is not
// interchange valid UTF8) into spans based on the script, predicts a language
// for each span, and returns a vector storing the top num_langs most frequent
// languages along with additional information (e.g., proportions). The number
// of bytes considered for each span is the minimum between the size of the
// span and max_num_bytes_. If more languages are requested than what is
// available in the input, then for those cases kUnknown is returned. Also, if
// the size of the span is less than min_num_bytes_ long, then the span is
// skipped. If the input text is too long, only the first
// kMaxNumInputBytesToConsider bytes are processed.
std::vector<Result> FindTopNMostFreqLangs(const string &text, int num_langs);
// String returned when a language is unknown or prediction cannot be made.
static const char kUnknown[];
// Min number of bytes needed to make a prediction if the default constructor
// is called.
static const int kMinNumBytesToConsider;
// Max number of bytes to consider to make a prediction if the default
// constructor is called.
static const int kMaxNumBytesToConsider;
// Max number of input bytes to process.
static const int kMaxNumInputBytesToConsider;
// Predictions with probability greater than or equal to this threshold are
// marked as reliable. This threshold was optimized on a set of text segments
// extracted from wikipedia, and results in an overall precision, recall,
// and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
static const float kReliabilityThreshold;
// Reliability threshold for the languages hr and bs.
static const float kReliabilityHrBsThreshold;
private:
// Sets up and initializes the model.
void Setup(TaskContext *context);
void Init(TaskContext *context);
// Extract features from sentence. On return, FeatureVector features[i]
// contains the features for the embedding space #i.
void GetFeatures(Sentence *sentence,
std::vector<FeatureVector> *features) const;
// Finds the most likely language for the given text. Assumes that the text is
// interchange valid UTF8.
Result FindLanguageOfValidUTF8(const string &text);
// Returns the language name corresponding to the given id.
string GetLanguageName(int language_id) const;
// Concatenates snippets of text equally spread out throughout the input if
// the size of the input is greater than the maximum number of bytes needed to
// make a prediction. The resulting string is used for language
// identification.
string SelectTextGivenScriptSpan(const CLD2::LangSpan &script_span);
string SelectTextGivenBeginAndSize(const char *text_begin, int text_size);
// Number of languages.
const int num_languages_;
// Typed feature extractor for embeddings.
LanguageIdEmbeddingFeatureExtractor feature_extractor_;
// The registry of shared workspaces in the feature extractor.
WorkspaceRegistry workspace_registry_;
// Parameters for the neural networks. Declared before network_, which the
// constructor initializes with a pointer to this member.
LangIdNNParams nn_params_;
// Neural network to use for scoring.
EmbeddingNetwork network_;
// This feature function is not relevant to this class. Adding this variable
// ensures that the features are linked.
ContinuousBagOfNgramsFunction ngram_function_;
// Minimum number of bytes needed to make a prediction. If the default
// constructor is called, this variable is equal to kMinNumBytesToConsider.
int min_num_bytes_;
// Maximum number of bytes to use to make a prediction. If the default
// constructor is called, this variable is equal to kMaxNumBytesToConsider.
int max_num_bytes_;
// Number of snippets to concatenate to produce the string used for language
// identification. If max_num_bytes_ <= kNumSnippets (i.e., the maximum number
// of bytes needed to make a prediction is smaller or equal to the number of
// default snippets), then this variable is equal to 1. Otherwise, it is set
// to kNumSnippets.
int num_snippets_;
// The string used to make a prediction is created by concatenating
// num_snippets_ snippets of size snippet_size_ = (max_num_bytes_ /
// num_snippets_) that are equally spread out throughout the input.
int snippet_size_;
// Default number of snippets to concatenate to produce the string used for
// language identification. For the actual number of snippets, see
// num_snippets_.
static const int kNumSnippets;
};
} // namespace chrome_lang_id
#endif // NNET_LANGUAGE_IDENTIFIER_H_

View File

@@ -0,0 +1,28 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "registry.h"
namespace chrome_lang_id {
// Head of the global linked list of all component registries. Empty until the
// first registry is added with RegistryMetadata::Register().
RegistryMetadata *global_registry_list = nullptr;
// Adds |registry| to the front of the global registry list: the new entry is
// linked to the current head and then becomes the head itself.
void RegistryMetadata::Register(RegistryMetadata *registry) {
registry->set_link(global_registry_list);
global_registry_list = registry;
}
} // namespace chrome_lang_id

242
Telegram/ThirdParty/cld3/src/registry.h vendored Normal file
View File

@@ -0,0 +1,242 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Registry for component registration. These classes can be used for creating
// registries of components conforming to the same interface. This is useful for
// making a component-based architecture where the specific implementation
// classes can be selected at runtime. There is support for both class-based and
// instance based registries.
//
// Example:
// function.h:
//
// class Function : public RegisterableInstance<Function> {
// public:
// virtual double Evaluate(double x) = 0;
// };
//
// #define REGISTER_FUNCTION(type, component)
// REGISTER_INSTANCE_COMPONENT(Function, type, component);
//
// function.cc:
//
// REGISTER_INSTANCE_REGISTRY("function", Function);
//
// class Cos : public Function {
// public:
// double Evaluate(double x) { return cos(x); }
// };
//
// class Exp : public Function {
// public:
// double Evaluate(double x) { return exp(x); }
// };
//
// REGISTER_FUNCTION("cos", Cos);
// REGISTER_FUNCTION("exp", Exp);
//
// Function *f = Function::Lookup("cos");
// double result = f->Evaluate(arg);
#ifndef REGISTRY_H_
#define REGISTRY_H_
#include <string.h>
#include <string>
#include "base.h"
namespace chrome_lang_id {
// Metadata describing a component: its name, the name of its class, and the
// code location where it was registered. Metadata objects can be chained into
// a singly linked list through link()/set_link(). The strings are stored as
// the pointers passed in; they are not copied.
class ComponentMetadata {
 public:
  // Creates metadata that is not linked to any other metadata object.
  ComponentMetadata(const char *name, const char *class_name, const char *file,
                    int line)
      : name_(name),
        class_name_(class_name),
        file_(file),
        line_(line),
        link_(nullptr) {}

  // Getters.
  const char *name() const { return name_; }
  const char *class_name() const { return class_name_; }
  const char *file() const { return file_; }
  int line() const { return line_; }

  // Next metadata object in the list, or null when there is none.
  ComponentMetadata *link() const { return link_; }
  void set_link(ComponentMetadata *link) { link_ = link; }

 private:
  // Component name.
  const char *name_;

  // Name of the class implementing the component.
  const char *class_name_;

  // Code file and line where the component was registered.
  const char *file_;
  int line_;

  // Link to the next metadata object in the list.
  ComponentMetadata *link_;
};
// The master registry contains all registered component registries. A registry
// is not registered in the master registry until the first component of that
// type is registered.
//
// A RegistryMetadata carries the same name/class/file/line information as any
// ComponentMetadata; it describes a registry rather than a single component.
class RegistryMetadata : public ComponentMetadata {
public:
RegistryMetadata(const char *name, const char *class_name, const char *file,
int line)
: ComponentMetadata(name, class_name, file, line) {}
// Registers a component registry in the master registry.
static void Register(RegistryMetadata *registry);
};
// Registry for components. An object can be registered with a type name in the
// registry, and the named objects can be retrieved with Lookup(). Registered
// components are kept in a linked list. The struct has no constructor so that
// a registry can be statically initialized and does not depend on
// initialization order.
template <class T>
struct ComponentRegistry {
  typedef ComponentRegistry<T> Self;

  // Component registration class. Constructing a Registrar registers the
  // component.
  class Registrar : public ComponentMetadata {
   public:
    // Registers a new component by linking itself into the component list of
    // |registry|.
    Registrar(Self *registry, const char *type, const char *class_name,
              const char *file, int line, T *object)
        : ComponentMetadata(type, class_name, file, line), object_(object) {
      // The first component registered for a registry also registers the
      // registry itself in the master registry.
      const bool is_first_component = (registry->components == nullptr);
      if (is_first_component) {
        RegistryMetadata::Register(
            new RegistryMetadata(registry->name, registry->class_name,
                                 registry->file, registry->line));
      }

      // Push this component onto the front of the registry's list.
      set_link(registry->components);
      registry->components = this;
    }

    // Returns component type.
    const char *type() const { return name(); }

    // Returns component object.
    T *object() const { return object_; }

    // Returns the next component in the component list.
    Registrar *next() const { return static_cast<Registrar *>(link()); }

   private:
    // Component object.
    T *object_;
  };

  // Finds the registrar for the named component in the registry. Walks the
  // component list until a registrar with a matching type is found; the
  // result is checked with CLD3_DCHECK to be non-null.
  const Registrar *GetComponent(const char *type) const {
    Registrar *match = components;
    for (; match != nullptr; match = match->next()) {
      if (strcmp(type, match->type()) == 0) {
        break;
      }
    }
    CLD3_DCHECK(match != nullptr);
    return match;
  }

  // Finds a named component in the registry.
  T *Lookup(const char *type) const { return GetComponent(type)->object(); }
  T *Lookup(const string &type) const { return Lookup(type.c_str()); }

  // Textual description of the kind of components in the registry.
  const char *name;

  // Base class name of component type.
  const char *class_name;

  // File and line where the registry is defined.
  const char *file;
  int line;

  // Linked list of registered components.
  Registrar *components;
};
// Base class for registerable class-based components.
template <class T>
class RegisterableClass {
public:
// Factory function type.
typedef T *(Factory)();
// Registry type.
typedef ComponentRegistry<Factory> Registry;
// Should be called before any call to Create() or registry(), i.e., before
// using the registration mechanism to register and or instantiate subclasses
// of T.
static void CreateRegistry(
const char *name,
const char *class_name,
const char *file,
int line) {
registry_ = new Registry();
registry_->name = name;
registry_->class_name = class_name;
registry_->file = file;
registry_->line = line;
registry_->components = nullptr;
}
// Should be called when one is done using the registration mechanism for
// class T.
static void DeleteRegistry() {
delete registry_;
registry_ = nullptr;
}
// Creates a new component instance.
static T *Create(const string &type) { return registry()->Lookup(type)(); }
// Returns registry for class.
static Registry *registry() { return registry_; }
private:
// Registry for class.
static Registry *registry_;
};
// Base class for registerable instance-based components. As declared here it
// only names the registry type and declares the per-class registry object; no
// accessor for the registry is provided in this class.
template <class T>
class RegisterableInstance {
public:
// Registry type.
typedef ComponentRegistry<T> Registry;
private:
// Registry for class.
static Registry registry_;
};
} // namespace chrome_lang_id
#endif // REGISTRY_H_

View File

@@ -0,0 +1,89 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "relevant_script_feature.h"
#include <ctype.h>
#include <string>
#include "feature_extractor.h"
#include "feature_types.h"
#include "language_identifier_features.h"
#include "script_detector.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"
namespace chrome_lang_id {
// Setup step of the feature function. This feature needs no setup, so
// |context| is not used.
void RelevantScriptFeature::Setup(TaskContext *context) {
// Nothing.
}
// Installs the feature type of this feature: a numeric feature type named
// after the feature and created with kNumRelevantScripts. |context| is not
// used.
void RelevantScriptFeature::Init(TaskContext *context) {
set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts));
}
// Appends to |result| one feature per script that occurs in the text of
// |sentence|: the feature value pairs the script id with the fraction of
// counted characters that belong to that script. Non-alphabetic single-byte
// characters are not counted. |workspaces| is not used.
void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
                                     const Sentence &sentence,
                                     FeatureVector *result) const {
  const string &text = sentence.text();

  // We expect kNumRelevantScripts to be small, so we stack-allocate the array
  // of counts. Still, if that changes, we want to find out.
  static_assert(
      kNumRelevantScripts < 25,
      "switch counts to vector<int>: too big for stack-allocated int[]");

  // counts[s] is the number of characters with script s.
  // Note: {} "value-initializes" the array to zero.
  int counts[kNumRelevantScripts]{};
  int total_count = 0;

  const char *const text_end = text.data() + text.size();
  const char *curr = text.data();
  while (curr < text_end) {
    // Length of the current character, computed once per character.
    const int num_bytes = utils::OneCharLen(curr);

    // If a partial UTF-8 character is encountered, break out of the loop. The
    // check compares lengths so that no pointer past the end is formed.
    if (num_bytes > text_end - curr) {
      break;
    }
    const char *const char_begin = curr;
    curr += num_bytes;

    // Skip spaces, numbers, punctuation, and all other non-alpha ASCII
    // characters: these characters are used in so many languages, they do not
    // communicate language-related information.
    // isalpha() has undefined behavior for negative arguments other than EOF,
    // so the byte is converted to unsigned char first.
    if ((num_bytes == 1) &&
        !isalpha(static_cast<unsigned char>(*char_begin))) {
      continue;
    }

    Script script = GetScript(char_begin, num_bytes);
    CLD3_DCHECK(script >= 0);
    CLD3_DCHECK(script < kNumRelevantScripts);
    counts[static_cast<int>(script)]++;
    total_count++;
  }

  // Emit one weighted feature per script that was seen. total_count is
  // positive whenever some count is positive, so the division is safe.
  for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
    const int count = counts[script_id];
    if (count > 0) {
      const float weight = static_cast<float>(count) / total_count;
      FloatFeatureValue value(script_id, weight);
      result->add(feature_type(), value.discrete_value);
    }
  }
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,49 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef RELEVANT_SCRIPT_FEATURE_H_
#define RELEVANT_SCRIPT_FEATURE_H_
#include "feature_extractor.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"
namespace chrome_lang_id {
// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
// script (see below): each such feature indicates the script and the ratio of
// UTF8 characters in that script, in the given sentence.
//
// What is a relevant script? Recognizing all 100+ Unicode scripts would
// require too much code size and runtime. Instead, we focus only on a few
// scripts that communicate a lot of language information: e.g., the use of
// Hiragana characters almost always indicates Japanese, so Hiragana is a
// "relevant" script for us. The Latin script is used by dozens of languages,
// so Latin is not relevant in this context.
class RelevantScriptFeature : public WholeSentenceFeature {
public:
// Overrides of the base-class hooks that receive the TaskContext; they are
// only declared here, their definitions are not part of this header.
void Setup(TaskContext *context) override;
void Init(TaskContext *context) override;
// Appends the features computed from the sentence to the feature vector.
// |result| is the output parameter; the method itself is const.
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
FeatureVector *result) const override;
};
} // namespace chrome_lang_id
#endif // RELEVANT_SCRIPT_FEATURE_H_

View File

@@ -0,0 +1,259 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include "feature_extractor.h"
#include "feature_types.h"
#include "relevant_script_feature.h"
#include "script_detector.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "utils.h"
#include "workspace.h"
namespace chrome_lang_id {
namespace relevant_script_feature_test {
namespace {
// Returns true when the absolute difference between the expected and the
// actual feature value is strictly below the 0.0001 tolerance.
bool FeatureValuesNear(float expected_value, float actual_value) {
  const float difference = expected_value - actual_value;
  return std::abs(difference) < 0.0001;
}
// Compares two id -> weight maps: they must hold the same number of entries,
// and every expected id must be present in |actual_features| with a weight
// that FeatureValuesNear() accepts. The verdict for |test_input| is printed on
// stdout and returned.
bool FeaturesNear(const string &test_input,
                  const std::map<int, float> &expected_features,
                  const std::map<int, float> &actual_features) {
  bool all_match = expected_features.size() == actual_features.size();
  // The loop stops at the first mismatch; nothing is compared at all when the
  // sizes already differ.
  for (auto expected = expected_features.begin();
       all_match && expected != expected_features.end(); ++expected) {
    const auto actual = actual_features.find(expected->first);
    all_match = actual != actual_features.end() &&
                FeatureValuesNear(expected->second, actual->second);
  }
  std::cout << (all_match ? " Success for input: " : " Failure for input: ")
            << test_input << std::endl;
  return all_match;
}
// Reports on stdout, and returns, whether no feature at all was extracted for
// |input|.
bool CheckFeaturesEmpty(const string &input,
                        const std::map<int, float> &actual_features) {
  const bool no_features = actual_features.empty();
  std::cout << (no_features ? " Success for input: " : " Failure for input: ")
            << input << std::endl;
  return no_features;
}
} // namespace
// Factory callback passed to the WholeSentenceFeature registrar: returns a
// newly heap-allocated RelevantScriptFeature.
// NOTE(review): ownership of the returned object is presumably taken by the
// code that invokes the factory -- confirm against the registry.
static WholeSentenceFeature *rsf_factory() { return new RelevantScriptFeature; }
class RelevantScriptFeatureExtractor {
public:
RelevantScriptFeatureExtractor() {
if (WholeSentenceFeature::registry() == nullptr) {
// Create registry for our WholeSentenceFeature(s).
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
"sentence feature function", "WholeSentenceFeature", __FILE__,
__LINE__);
}
// Register our WholeSentenceFeature(s).
// Register RelevantScriptFeature feature function.
static WholeSentenceFeature::Registry::Registrar rsf_registrar(
WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
"RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);
feature_extractor_.Parse("continuous-bag-of-relevant-scripts");
TaskContext context;
feature_extractor_.Setup(&context);
feature_extractor_.Init(&context);
feature_extractor_.RequestWorkspaces(&workspace_registry_);
}
// Returns "true" if feature extraction is successful, and "false" otherwise.
bool Extract(const string &text, std::map<int, float> *float_features) {
float_features->clear();
if (text.empty()) {
return true;
}
Sentence sentence;
sentence.set_text(text);
workspace_.Reset(workspace_registry_);
feature_extractor_.Preprocess(&workspace_, &sentence);
FeatureVector feature_vector;
feature_extractor_.ExtractFeatures(workspace_, sentence, &feature_vector);
for (int index = 0; index < feature_vector.size(); ++index) {
const FloatFeatureValue value =
FloatFeatureValue(feature_vector.value(index));
if (float_features->count(value.value.id) != 0) {
std::cout << " Failure: duplicate feature" << std::endl;
return false;
}
float_features->emplace(value.value.id, value.value.weight);
}
return true;
}
private:
WorkspaceSet workspace_;
WholeSentenceExtractor feature_extractor_;
// The registry of shared workspaces in the feature extractor.
WorkspaceRegistry workspace_registry_;
};
// Runs the extractor on ordinary inputs (ASCII only, Katakana only, and
// mixtures of both) and checks the per-script weights. Every case is executed
// even after a failure; the return value is true only if all of them passed.
bool TestCommonCases() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  RelevantScriptFeatureExtractor extractor;
  std::map<int, float> float_features;
  bool test_successful = true;

  // ASCII letters only: everything is attributed to the one-byte bucket.
  string input = "just some plain text";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 1.00}},
                    float_features)) {
    test_successful = false;
  }

  // Katakana only.
  input = "ヸヂ゠ヂ";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptKatakana, 1.00}},
                    float_features)) {
    test_successful = false;
  }

  // 4 Latin letters mixed with 4 Katakana letters. The input must really
  // contain four characters of each script, otherwise the 0.5 / 0.5 split
  // expected below cannot hold.
  input = "ヸtヂe゠xtヂ";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.5},
                            {chrome_lang_id::kScriptKatakana, 0.5}},
                    float_features)) {
    test_successful = false;
  }

  // 12 ASCII letters and 4 Katakana letters; the digits, punctuation and
  // spaces in between are not expected to be counted.
  input = "just some 121212%^^( ヸヂ゠ヂ text";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.75},
                            {chrome_lang_id::kScriptKatakana, 0.25}},
                    float_features)) {
    test_successful = false;
  }
  return test_successful;
}
// Runs the extractor on degenerate inputs: empty text, text without any
// letters, and a single non-ASCII character alone or surrounded by characters
// that are expected to be ignored. Every case is executed even after a
// failure; the return value is true only if all of them passed.
bool TestCornerCases() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  RelevantScriptFeatureExtractor extractor;
  std::map<int, float> float_features;
  bool test_successful = true;

  // Empty string.
  string input = "";
  if (!extractor.Extract(input, &float_features) ||
      !CheckFeaturesEmpty(input, float_features)) {
    test_successful = false;
  }

  // Only whitespaces.
  input = " ";
  if (!extractor.Extract(input, &float_features) ||
      !CheckFeaturesEmpty(input, float_features)) {
    test_successful = false;
  }

  // Only numbers and punctuation.
  input = "12----)(";
  if (!extractor.Extract(input, &float_features) ||
      !CheckFeaturesEmpty(input, float_features)) {
    test_successful = false;
  }

  // Only numbers, punctuation, and spaces.
  input = "12--- - ) ( ";
  if (!extractor.Extract(input, &float_features) ||
      !CheckFeaturesEmpty(input, float_features)) {
    test_successful = false;
  }

  // One UTF8 character by itself: U+3041 (Hiragana), written as explicit
  // UTF-8 bytes. An empty literal here would yield no features at all and
  // could never satisfy the Hiragana expectation.
  input = "\xE3\x81\x81";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
                    float_features)) {
    test_successful = false;
  }
  input = "ה";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptHebrew, 1.00}},
                    float_features)) {
    test_successful = false;
  }

  // One UTF8 character with some numbers / punctuation / spaces: character at
  // one extremity or in the middle.
  input = "1234ゟ";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
                    float_features)) {
    test_successful = false;
  }
  input = "ゟ12-(";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
                    float_features)) {
    test_successful = false;
  }
  input = "8*1ゟ12----";
  if (!extractor.Extract(input, &float_features) ||
      !FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
                    float_features)) {
    test_successful = false;
  }
  return test_successful;
}
} // namespace relevant_script_feature_test
} // namespace chrome_lang_id
// Runs the feature extraction tests.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::relevant_script_feature_test::TestCommonCases() &&
chrome_lang_id::relevant_script_feature_test::TestCornerCases();
return tests_successful ? 0 : 1;
}

View File

@@ -0,0 +1,156 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SCRIPT_DETECTOR_H_
#define SCRIPT_DETECTOR_H_
namespace chrome_lang_id {

// Unicode scripts we care about. To keep the code compact and fast, only a
// handful of scripts that give a strong hint about the language of the text
// are told apart (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Buckets for all the Unicode scripts that are not detected individually:
  // one bucket per UTF-8 encoded length (1, 2, 3 or 4 bytes), since that
  // length is already known. kScriptOtherUtf8OneByte means ~Latin and
  // kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Keep this entry last: it equals the number of real entries above and is
  // used as the upper bound when iterating over the scripts.
  kNumRelevantScripts,
};

// Returns true iff low <= value <= hi (both bounds inclusive).
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  if (!(value >= low)) {
    return false;
  }
  return value <= hi;
}

// Returns Script for the UTF8 character that starts at address p.
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
// Any num_bytes outside 1..4 yields kScriptError.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  if (num_bytes == 1) {
    return kScriptOtherUtf8OneByte;
  }

  if (num_bytes == 2) {
    // A 2-byte UTF8 character carries 11 bits of payload, which always fits
    // in an unsigned int (at least 16 bits wide).
    const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);

    // Block boundaries, in increasing order. Greek ends right before
    // Cyrillic starts and Hebrew ends right before Arabic starts, so those
    // two "end" values never need to be spelled out.
    const unsigned int kGreekStart = 0x370;
    const unsigned int kCyrillicStart = 0x400;
    const unsigned int kCyrillicEnd = 0x4FF;
    const unsigned int kHebrewStart = 0x590;
    const unsigned int kArabicStart = 0x600;
    const unsigned int kArabicEnd = 0x6FF;

    // Walk the boundaries from low to high; the first one the code point
    // falls under decides the script.
    if (codepoint < kGreekStart) return kScriptOtherUtf8TwoBytes;
    if (codepoint < kCyrillicStart) return kScriptGreek;
    if (codepoint <= kCyrillicEnd) return kScriptCyrillic;
    if (codepoint < kHebrewStart) return kScriptOtherUtf8TwoBytes;
    if (codepoint < kArabicStart) return kScriptHebrew;
    if (codepoint <= kArabicEnd) return kScriptArabic;
    return kScriptOtherUtf8TwoBytes;
  }

  if (num_bytes == 3) {
    // A 3-byte UTF8 character carries 16 bits of payload, which still fits
    // in an unsigned int.
    const unsigned int codepoint =
        ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);

    // Katakana starts right after Hiragana ends, so its start value never
    // needs to be spelled out.
    const unsigned int kHangulJamoStart = 0x1100;
    const unsigned int kHangulJamoEnd = 0x11FF;
    const unsigned int kHiraganaStart = 0x3041;
    const unsigned int kHiraganaEnd = 0x309F;
    const unsigned int kKatakanaEnd = 0x30FF;

    if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
      return kScriptHangulJamo;
    }
    if (codepoint < kHiraganaStart) return kScriptOtherUtf8ThreeBytes;
    if (codepoint <= kHiraganaEnd) return kScriptHiragana;
    if (codepoint <= kKatakanaEnd) return kScriptKatakana;
    return kScriptOtherUtf8ThreeBytes;
  }

  if (num_bytes == 4) {
    return kScriptOtherUtf8FourBytes;
  }

  return kScriptError;
}

// Returns Script for the UTF8 character that starts at address p. Same as the
// overload above, but for callers that hold "char *" pointers: the bytes are
// reinterpreted as unsigned char so that the bit manipulation does not depend
// on whether plain char is signed on the target platform.
inline Script GetScript(const char *p, int num_bytes) {
  const unsigned char *const bytes = reinterpret_cast<const unsigned char *>(p);
  return GetScript(bytes, num_bytes);
}

}  // namespace chrome_lang_id
#endif // SCRIPT_DETECTOR_H_

View File

@@ -0,0 +1,161 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "script_detector.h"
#include <iostream>
#include "utils.h"
namespace chrome_lang_id {
namespace script_detector_test {
// Test convenience wrapper: classifies the UTF8 character starting at |p|,
// with its byte length taken from utils::OneCharLen().
Script GetScript(const char *p) {
  return chrome_lang_id::GetScript(p, utils::OneCharLen(p));
}
// Prints " Success" or " Failure" on stdout according to |status| and hands
// |status| back unchanged, so it can wrap a return expression.
bool PrintAndReturnStatus(bool status) {
  std::cout << (status ? " Success" : " Failure") << std::endl;
  return status;
}
// Checks that Greek characters are classified as kScriptGreek and that plain
// ASCII characters are not.
bool TestGreekScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  // The first two samples are the first / last character of the Greek and
  // Coptic script.
  const char *const greek_samples[] = {"Ͱ", "Ͽ", "δ", "Θ", "Δ"};
  // Negative tests.
  const char *const non_greek_samples[] = {"a", "0"};

  bool passed = true;
  for (const char *sample : greek_samples) {
    passed = passed && (kScriptGreek == GetScript(sample));
  }
  for (const char *sample : non_greek_samples) {
    passed = passed && (kScriptGreek != GetScript(sample));
  }
  return PrintAndReturnStatus(passed);
}
// Checks that a few Cyrillic characters are classified as kScriptCyrillic.
bool TestCyrillicScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  const char *const cyrillic_samples[] = {"Ѐ", "ӿ", "ш", "Б", "Ӱ"};

  bool passed = true;
  for (const char *sample : cyrillic_samples) {
    passed = passed && (kScriptCyrillic == GetScript(sample));
  }
  return PrintAndReturnStatus(passed);
}
// Checks that a few Hebrew characters are classified as kScriptHebrew.
bool TestHebrewScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  const bool first_ok = kScriptHebrew == GetScript("֑");
  const bool second_ok = kScriptHebrew == GetScript("״");
  const bool third_ok = kScriptHebrew == GetScript("ד");
  const bool fourth_ok = kScriptHebrew == GetScript("ה");
  const bool fifth_ok = kScriptHebrew == GetScript("צ");
  return PrintAndReturnStatus(first_ok && second_ok && third_ok && fourth_ok &&
                              fifth_ok);
}
// Checks that two Arabic characters are classified as kScriptArabic.
bool TestArabicScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  const bool first_ok = kScriptArabic == GetScript("م");
  const bool second_ok = kScriptArabic == GetScript("خ");
  return PrintAndReturnStatus(first_ok && second_ok);
}
// Checks that characters from the Hangul Jamo block (U+1100..U+11FF) are
// classified as kScriptHangulJamo.
bool TestHangulJamoScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  // The samples are spelled as explicit UTF-8 byte sequences: an empty
  // literal would only make GetScript() classify the terminating NUL byte,
  // which can never be Hangul Jamo.
  static const char *const kSamples[] = {
      "\xE1\x84\x80",  // U+1100, first code point of the block.
      "\xE1\x87\xBF",  // U+11FF, last code point of the block.
      "\xE1\x84\x92",  // U+1112
      "\xE1\x85\xA1",  // U+1161
      "\xE1\x86\xA8",  // U+11A8
  };

  bool all_hangul_jamo = true;
  for (const char *sample : kSamples) {
    if (kScriptHangulJamo != GetScript(sample)) {
      all_hangul_jamo = false;
    }
  }
  return PrintAndReturnStatus(all_hangul_jamo);
}
// Checks that characters from the Hiragana range recognized by GetScript()
// (U+3041..U+309F) are classified as kScriptHiragana.
bool TestHiraganaScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  // The samples are spelled as explicit UTF-8 byte sequences: an empty
  // literal would only make GetScript() classify the terminating NUL byte,
  // which can never be Hiragana.
  static const char *const kSamples[] = {
      "\xE3\x81\x81",  // U+3041, first code point of the range.
      "\xE3\x82\x9F",  // U+309F, last code point of the range.
      "\xE3\x81\x93",  // U+3053
      "\xE3\x82\x84",  // U+3084
      "\xE3\x81\x9C",  // U+305C
  };

  bool all_hiragana = true;
  for (const char *sample : kSamples) {
    if (kScriptHiragana != GetScript(sample)) {
      all_hiragana = false;
    }
  }
  return PrintAndReturnStatus(all_hiragana);
}
// Checks that characters from the Katakana block (U+30A0..U+30FF) are
// classified as kScriptKatakana.
bool TestKatakanaScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  // The samples are spelled as explicit UTF-8 byte sequences: an empty
  // literal would only make GetScript() classify the terminating NUL byte,
  // which can never be Katakana.
  static const char *const kSamples[] = {
      "\xE3\x82\xA0",  // U+30A0, first code point of the block.
      "\xE3\x83\xBF",  // U+30FF, last code point of the block.
      "\xE3\x83\x82",  // U+30C2
      "\xE3\x83\x96",  // U+30D6
      "\xE3\x83\xB8",  // U+30F8
  };

  bool all_katakana = true;
  for (const char *sample : kSamples) {
    if (kScriptKatakana != GetScript(sample)) {
      all_katakana = false;
    }
  }
  return PrintAndReturnStatus(all_katakana);
}
// Checks the catch-all buckets: characters from scripts that are not detected
// individually must be classified by their UTF-8 length only. All groups are
// evaluated even after a failure; the combined verdict is printed and
// returned.
bool TestOtherScripts() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  bool test_successful = true;

  // One-byte (ASCII) characters.
  if (kScriptOtherUtf8OneByte != GetScript("^") ||
      kScriptOtherUtf8OneByte != GetScript("$")) {
    test_successful = false;
  }

  // Unrecognized 2-byte scripts. For info on the scripts mentioned below, see
  // http://www.unicode.org/charts/#scripts Note: the scripts below are uniquely
  // associated with a language. Still, the number of queries in those
  // languages is small and we didn't want to increase the code size and
  // latency, so (at least for now) we do not treat them specially.
  // The following three tests are, respectively, for Armenian, Syriac and
  // Thaana.
  if (kScriptOtherUtf8TwoBytes != GetScript("Ձ") ||
      kScriptOtherUtf8TwoBytes != GetScript("ܔ") ||
      kScriptOtherUtf8TwoBytes != GetScript("ށ")) {
    test_successful = false;
  }

  // Unrecognized 3-byte script: CJK Unified Ideographs: not uniquely associated
  // with a language. Spelled as explicit UTF-8 bytes (U+4E00 and U+9FA0): an
  // empty literal would be classified as a one-byte character and fail here.
  if (kScriptOtherUtf8ThreeBytes != GetScript("\xE4\xB8\x80") ||
      kScriptOtherUtf8ThreeBytes != GetScript("\xE9\xBE\xA0")) {
    test_successful = false;
  }

  // Unrecognized 4-byte script: CJK Unified Ideographs Extension C. Note:
  // there is a nice UTF-8 encoder / decoder at https://mothereff.in/utf-8
  if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAA\x9C\x94")) {
    test_successful = false;
  }

  // Unrecognized 4-byte script: CJK Unified Ideographs Extension E
  if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAB\xA0\xB5") ||
      kScriptOtherUtf8FourBytes != GetScript("\xF0\xAC\xBA\xA1")) {
    test_successful = false;
  }
  return PrintAndReturnStatus(test_successful);
}
} // namespace script_detector_test
} // namespace chrome_lang_id
// Runs the feature extraction tests.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::script_detector_test::TestGreekScript() &&
chrome_lang_id::script_detector_test::TestCyrillicScript() &&
chrome_lang_id::script_detector_test::TestHebrewScript() &&
chrome_lang_id::script_detector_test::TestArabicScript() &&
chrome_lang_id::script_detector_test::TestHangulJamoScript() &&
chrome_lang_id::script_detector_test::TestHiraganaScript() &&
chrome_lang_id::script_detector_test::TestKatakanaScript() &&
chrome_lang_id::script_detector_test::TestOtherScripts();
return tests_successful ? 0 : 1;
}

View File

@@ -0,0 +1,11 @@
The code in this directory identifies the scripts present in a given piece of
text along with the corresponding spans. The code was copied from
[CLD2](https://github.com/CLD2Owners/cld2) and was slightly refactored. It can
be further simplified and cleaned up.

View File

@@ -0,0 +1,55 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Routine that maps a Unicode code point to an interchange-valid one
//
#include "fixunicodevalue.h"
#include "integral_types.h"
namespace chrome_lang_id {
namespace CLD2 {
// Guarantees that the resulting output value is interchange valid:
//   00-FF          looked up in kMapFullMicrosoft1252OrSpace
//                  (space or MS CP1252 mapping)
//   D800-DFFF      surrogates     -> U+FFFD
//   FDD0-FDEF      non-characters -> U+FFFD
//   xxFFFE-xxFFFF  non-characters -> U+FFFD
// Negative values and values above 10FFFF also become U+FFFD; every other
// value is returned unchanged.
char32 FixUnicodeValue(char32 uv) {
  const uint32 code = static_cast<uint32>(uv);
  const char32 kReplacement = 0xFFFD;

  // One-byte range: table lookup.
  if (code < 0x0100) {
    return kMapFullMicrosoft1252OrSpace[code];
  }
  // Everything else below the surrogate range is already valid.
  if (code < 0xD800) {
    return uv;
  }

  // From here on code >= 0xD800. The value survives only if it lies in
  // E000..10FFFF (which excludes the surrogates, and negative inputs that
  // became huge after the cast) and is not one of the non-characters.
  const bool in_upper_valid_range = (0xE000 <= code) && (code <= 0x10FFFF);
  const bool is_non_character = ((0xFDD0 <= code) && (code <= 0xFDEF)) ||
                                ((code & 0x00FFFE) == 0xFFFE);
  if (in_upper_valid_range && !is_non_character) {
    return uv;
  }
  return kReplacement;
}
} // End namespace CLD2
} // End namespace chrome_lang_id

View File

@@ -0,0 +1,69 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Routine that maps a Unicode code point to an interchange-valid one
//
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
// code points. C0 and C1 control codes that are not interchange-valid
// are mapped to spaces.
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
#include "integral_types.h" // for char32
#include "port.h"
namespace chrome_lang_id {
namespace CLD2 {
// Map byte value 0000-00FF to char32
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
//
// Layout: 256 entries indexed directly by the byte value. The trailing
// "// 00", "// 40", "// 80", "// A0" and "// C0" markers give the index of the
// first entry on that row. Entries 0x20-0x7E and 0xA0-0xFF map to themselves;
// only the 0x80-0x9F rows hold values above 0xFF (e.g. 0x80 -> 0x20ac).
//
// Note: the table is defined "static const" in this header, so every
// translation unit that includes the header carries its own copy.
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
};
// Guarantees that the resulting output value is interchange valid
// 00-FF; map to spaces or MS CP1252
// D800-DFFF; surrogates
// FDD0-FDEF; non-characters
// xxFFFE-xxFFFF; non-characters
//
// Takes the code point by value and returns the (possibly replaced) code
// point; only declared here, the definition is not part of this header.
char32 FixUnicodeValue(char32 uv);
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_

View File

@@ -0,0 +1,296 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_entities.cc
// Machine generated. Do Not Edit.
//
// Declarations for HTML entities recognized by CLD2
//
#include "generated_ulscript.h" // for CharIntPair
namespace chrome_lang_id {
namespace CLD2 {
// Alphabetical order for binary search
extern const int kNameToEntitySize = 265;
extern const CharIntPair kNameToEntity[kNameToEntitySize] = {
{"AElig", 198},
{"AMP", 38},
{"Aacute", 193},
{"Acirc", 194},
{"Agrave", 192},
{"Alpha", 913},
{"Aring", 197},
{"Atilde", 195},
{"Auml", 196},
{"Beta", 914},
{"Ccaron", 268},
{"Ccedil", 199},
{"Chi", 935},
{"Dagger", 8225},
{"Delta", 916},
{"ETH", 208},
{"Eacute", 201},
{"Ecaron", 282},
{"Ecirc", 202},
{"Egrave", 200},
{"Epsilon", 917},
{"Eta", 919},
{"Euml", 203},
{"GT", 62},
{"Gamma", 915},
{"Iacute", 205},
{"Icirc", 206},
{"Igrave", 204},
{"Iota", 921},
{"Iuml", 207},
{"Kappa", 922},
{"LT", 60},
{"Lambda", 923},
{"Mu", 924},
{"Ntilde", 209},
{"Nu", 925},
{"OElig", 338},
{"Oacute", 211},
{"Ocirc", 212},
{"Ograve", 210},
{"Omega", 937},
{"Omicron", 927},
{"Oslash", 216},
{"Otilde", 213},
{"Ouml", 214},
{"Phi", 934},
{"Pi", 928},
{"Prime", 8243},
{"Psi", 936},
{"QUOT", 34},
{"Rcaron", 344},
{"Rho", 929},
{"Scaron", 352},
{"Sigma", 931},
{"THORN", 222},
{"Tau", 932},
{"Theta", 920},
{"Uacute", 218},
{"Ucirc", 219},
{"Ugrave", 217},
{"Upsilon", 933},
{"Uuml", 220},
{"Xi", 926},
{"Yacute", 221},
{"Yuml", 376},
{"Zeta", 918},
{"aacute", 225},
{"acirc", 226},
{"acute", 180},
{"aelig", 230},
{"agrave", 224},
{"alefsym", 8501},
{"alpha", 945},
{"amp", 38},
{"and", 8743},
{"ang", 8736},
{"apos", 39},
{"aring", 229},
{"asymp", 8776},
{"atilde", 227},
{"auml", 228},
{"bdquo", 8222},
{"beta", 946},
{"brvbar", 166},
{"bull", 8226},
{"cap", 8745},
{"ccaron", 269},
{"ccedil", 231},
{"cedil", 184},
{"cent", 162},
{"chi", 967},
{"circ", 710},
{"clubs", 9827},
{"cong", 8773},
{"copy", 169},
{"crarr", 8629},
{"cup", 8746},
{"curren", 164},
{"dArr", 8659},
{"dagger", 8224},
{"darr", 8595},
{"deg", 176},
{"delta", 948},
{"diams", 9830},
{"divide", 247},
{"eacute", 233},
{"ecaron", 283},
{"ecirc", 234},
{"egrave", 232},
{"emdash", 8212},
{"empty", 8709},
{"emsp", 8195},
{"endash", 8211},
{"ensp", 8194},
{"epsilon", 949},
{"equiv", 8801},
{"eta", 951},
{"eth", 240},
{"euml", 235},
{"euro", 8364},
{"exist", 8707},
{"fnof", 402},
{"forall", 8704},
{"frac12", 189},
{"frac14", 188},
{"frac34", 190},
{"frasl", 8260},
{"gamma", 947},
{"ge", 8805},
{"gt", 62},
{"hArr", 8660},
{"harr", 8596},
{"hearts", 9829},
{"hellip", 8230},
{"iacute", 237},
{"icirc", 238},
{"iexcl", 161},
{"igrave", 236},
{"image", 8465},
{"infin", 8734},
{"int", 8747},
{"iota", 953},
{"iquest", 191},
{"isin", 8712},
{"iuml", 239},
{"kappa", 954},
{"lArr", 8656},
{"lambda", 955},
{"lang", 9001},
{"laquo", 171},
{"larr", 8592},
{"lceil", 8968},
{"ldquo", 8220},
{"le", 8804},
{"lfloor", 8970},
{"lowast", 8727},
{"loz", 9674},
{"lrm", 8206},
{"lsaquo", 8249},
{"lsquo", 8216},
{"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
{"middot", 183},
{"minus", 8722},
{"mu", 956},
{"nabla", 8711},
{"nbsp", 160},
{"ndash", 8211},
{"ne", 8800},
{"ni", 8715},
{"not", 172},
{"notin", 8713},
{"nsub", 8836},
{"ntilde", 241},
{"nu", 957},
{"oacute", 243},
{"ocirc", 244},
{"oelig", 339},
{"ograve", 242},
{"oline", 8254},
{"omega", 969},
{"omicron", 959},
{"oplus", 8853},
{"or", 8744},
{"ordf", 170},
{"ordm", 186},
{"oslash", 248},
{"otilde", 245},
{"otimes", 8855},
{"ouml", 246},
{"para", 182},
{"part", 8706},
{"permil", 8240},
{"perp", 8869},
{"phi", 966},
{"pi", 960},
{"piv", 982},
{"plusmn", 177},
{"pound", 163},
{"prime", 8242},
{"prod", 8719},
{"prop", 8733},
{"psi", 968},
{"quot", 34},
{"rArr", 8658},
{"radic", 8730},
{"rang", 9002},
{"raquo", 187},
{"rarr", 8594},
{"rcaron", 345},
{"rceil", 8969},
{"rdquo", 8221},
{"real", 8476},
{"reg", 174},
{"rfloor", 8971},
{"rho", 961},
{"rlm", 8207},
{"rsaquo", 8250},
{"rsquo", 8217},
{"sbquo", 8218},
{"scaron", 353},
{"sdot", 8901},
{"sect", 167},
{"shy", 173},
{"sigma", 963},
{"sigmaf", 962},
{"sim", 8764},
{"spades", 9824},
{"sub", 8834},
{"sube", 8838},
{"sum", 8721},
{"sup", 8835},
{"sup1", 185},
{"sup2", 178},
{"sup3", 179},
{"supe", 8839},
{"szlig", 223},
{"tau", 964},
{"there4", 8756},
{"theta", 952},
{"thetasym", 977},
{"thinsp", 8201},
{"thorn", 254},
{"tilde", 732},
{"times", 215},
{"trade", 8482},
{"uArr", 8657},
{"uacute", 250},
{"uarr", 8593},
{"ucirc", 251},
{"ugrave", 249},
{"uml", 168},
{"upsih", 978},
{"upsilon", 965},
{"uuml", 252},
{"weierp", 8472},
{"xi", 958},
{"yacute", 253},
{"yen", 165},
{"yuml", 255},
{"zeta", 950},
{"zwj", 8205},
{"zwnj", 8204},
};
} // namespace CLD2
} // namespace chrome_lang_id

View File

@@ -0,0 +1,678 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_ulscript.cc
// Machine generated. Do Not Edit.
//
// Declarations for scripts recognized by CLD2
//
#include "generated_ulscript.h"
namespace chrome_lang_id {
namespace CLD2 {
// Subscripted by enum ULScript
extern const int kULScriptToNameSize = 102;
extern const char* const kULScriptToName[kULScriptToNameSize] = {
"Common", // 0 Zyyy
"Latin", // 1 Latn
"Greek", // 2 Grek
"Cyrillic", // 3 Cyrl
"Armenian", // 4 Armn
"Hebrew", // 5 Hebr
"Arabic", // 6 Arab
"Syriac", // 7 Syrc
"Thaana", // 8 Thaa
"Devanagari", // 9 Deva
"Bengali", // 10 Beng
"Gurmukhi", // 11 Guru
"Gujarati", // 12 Gujr
"Oriya", // 13 Orya
"Tamil", // 14 Taml
"Telugu", // 15 Telu
"Kannada", // 16 Knda
"Malayalam", // 17 Mlym
"Sinhala", // 18 Sinh
"Thai", // 19 Thai
"Lao", // 20 Laoo
"Tibetan", // 21 Tibt
"Myanmar", // 22 Mymr
"Georgian", // 23 Geor
"Hani", // 24 Hani
"Ethiopic", // 25 Ethi
"Cherokee", // 26 Cher
"Canadian_Aboriginal", // 27 Cans
"Ogham", // 28 Ogam
"Runic", // 29 Runr
"Khmer", // 30 Khmr
"Mongolian", // 31 Mong
"", // 32
"", // 33
"Bopomofo", // 34 Bopo
"", // 35
"Yi", // 36 Yiii
"Old_Italic", // 37 Ital
"Gothic", // 38 Goth
"Deseret", // 39 Dsrt
"Inherited", // 40 Zinh
"Tagalog", // 41 Tglg
"Hanunoo", // 42 Hano
"Buhid", // 43 Buhd
"Tagbanwa", // 44 Tagb
"Limbu", // 45 Limb
"Tai_Le", // 46 Tale
"Linear_B", // 47 Linb
"Ugaritic", // 48 Ugar
"Shavian", // 49 Shaw
"Osmanya", // 50 Osma
"Cypriot", // 51 Cprt
"Braille", // 52 Brai
"Buginese", // 53 Bugi
"Coptic", // 54 Copt
"New_Tai_Lue", // 55 Talu
"Glagolitic", // 56 Glag
"Tifinagh", // 57 Tfng
"Syloti_Nagri", // 58 Sylo
"Old_Persian", // 59 Xpeo
"Kharoshthi", // 60 Khar
"Balinese", // 61 Bali
"Cuneiform", // 62 Xsux
"Phoenician", // 63 Phnx
"Phags_Pa", // 64 Phag
"Nko", // 65 Nkoo
"Sundanese", // 66 Sund
"Lepcha", // 67 Lepc
"Ol_Chiki", // 68 Olck
"Vai", // 69 Vaii
"Saurashtra", // 70 Saur
"Kayah_Li", // 71 Kali
"Rejang", // 72 Rjng
"Lycian", // 73 Lyci
"Carian", // 74 Cari
"Lydian", // 75 Lydi
"Cham", // 76 Cham
"Tai_Tham", // 77 Lana
"Tai_Viet", // 78 Tavt
"Avestan", // 79 Avst
"Egyptian_Hieroglyphs", // 80 Egyp
"Samaritan", // 81 Samr
"Lisu", // 82 Lisu
"Bamum", // 83 Bamu
"Javanese", // 84 Java
"Meetei_Mayek", // 85 Mtei
"Imperial_Aramaic", // 86 Armi
"Old_South_Arabian", // 87 Sarb
"Inscriptional_Parthian", // 88 Prti
"Inscriptional_Pahlavi", // 89 Phli
"Old_Turkic", // 90 Orkh
"Kaithi", // 91 Kthi
"Batak", // 92 Batk
"Brahmi", // 93 Brah
"Mandaic", // 94 Mand
"Chakma", // 95 Cakm
"Meroitic_Cursive", // 96 Merc
"Meroitic_Hieroglyphs", // 97 Mero
"Miao", // 98 Plrd
"Sharada", // 99 Shrd
"Sora_Sompeng", // 100 Sora
"Takri", // 101 Takr
};
// Subscripted by enum ULScript
// Maps each ULScript value to its four-letter script code (for example
// index 1 -> "Latn"). The trailing comment on every row gives the index and
// the long script name. Rows 32, 33 and 35 hold the empty string; no script
// name is listed for them.
extern const int kULScriptToCodeSize = 102;
extern const char* const kULScriptToCode[kULScriptToCodeSize] = {
"Zyyy", // 0 Common
"Latn", // 1 Latin
"Grek", // 2 Greek
"Cyrl", // 3 Cyrillic
"Armn", // 4 Armenian
"Hebr", // 5 Hebrew
"Arab", // 6 Arabic
"Syrc", // 7 Syriac
"Thaa", // 8 Thaana
"Deva", // 9 Devanagari
"Beng", // 10 Bengali
"Guru", // 11 Gurmukhi
"Gujr", // 12 Gujarati
"Orya", // 13 Oriya
"Taml", // 14 Tamil
"Telu", // 15 Telugu
"Knda", // 16 Kannada
"Mlym", // 17 Malayalam
"Sinh", // 18 Sinhala
"Thai", // 19 Thai
"Laoo", // 20 Lao
"Tibt", // 21 Tibetan
"Mymr", // 22 Myanmar
"Geor", // 23 Georgian
"Hani", // 24 Hani
"Ethi", // 25 Ethiopic
"Cher", // 26 Cherokee
"Cans", // 27 Canadian_Aboriginal
"Ogam", // 28 Ogham
"Runr", // 29 Runic
"Khmr", // 30 Khmer
"Mong", // 31 Mongolian
"", // 32
"", // 33
"Bopo", // 34 Bopomofo
"", // 35
"Yiii", // 36 Yi
"Ital", // 37 Old_Italic
"Goth", // 38 Gothic
"Dsrt", // 39 Deseret
"Zinh", // 40 Inherited
"Tglg", // 41 Tagalog
"Hano", // 42 Hanunoo
"Buhd", // 43 Buhid
"Tagb", // 44 Tagbanwa
"Limb", // 45 Limbu
"Tale", // 46 Tai_Le
"Linb", // 47 Linear_B
"Ugar", // 48 Ugaritic
"Shaw", // 49 Shavian
"Osma", // 50 Osmanya
"Cprt", // 51 Cypriot
"Brai", // 52 Braille
"Bugi", // 53 Buginese
"Copt", // 54 Coptic
"Talu", // 55 New_Tai_Lue
"Glag", // 56 Glagolitic
"Tfng", // 57 Tifinagh
"Sylo", // 58 Syloti_Nagri
"Xpeo", // 59 Old_Persian
"Khar", // 60 Kharoshthi
"Bali", // 61 Balinese
"Xsux", // 62 Cuneiform
"Phnx", // 63 Phoenician
"Phag", // 64 Phags_Pa
"Nkoo", // 65 Nko
"Sund", // 66 Sundanese
"Lepc", // 67 Lepcha
"Olck", // 68 Ol_Chiki
"Vaii", // 69 Vai
"Saur", // 70 Saurashtra
"Kali", // 71 Kayah_Li
"Rjng", // 72 Rejang
"Lyci", // 73 Lycian
"Cari", // 74 Carian
"Lydi", // 75 Lydian
"Cham", // 76 Cham
"Lana", // 77 Tai_Tham
"Tavt", // 78 Tai_Viet
"Avst", // 79 Avestan
"Egyp", // 80 Egyptian_Hieroglyphs
"Samr", // 81 Samaritan
"Lisu", // 82 Lisu
"Bamu", // 83 Bamum
"Java", // 84 Javanese
"Mtei", // 85 Meetei_Mayek
"Armi", // 86 Imperial_Aramaic
"Sarb", // 87 Old_South_Arabian
"Prti", // 88 Inscriptional_Parthian
"Phli", // 89 Inscriptional_Pahlavi
"Orkh", // 90 Old_Turkic
"Kthi", // 91 Kaithi
"Batk", // 92 Batak
"Brah", // 93 Brahmi
"Mand", // 94 Mandaic
"Cakm", // 95 Chakma
"Merc", // 96 Meroitic_Cursive
"Mero", // 97 Meroitic_Hieroglyphs
"Plrd", // 98 Miao
"Shrd", // 99 Sharada
"Sora", // 100 Sora_Sompeng
"Takr", // 101 Takri
};
// Subscripted by enum ULScript
// Maps each ULScript value to the spelling of its enumerator as a string
// (for example index 1 -> "ULScript_Latin"). The trailing comment on every
// row gives the index and the four-letter code. Rows 32, 33 and 35 use the
// numeric form "ULScript_NN".
extern const int kULScriptToCNameSize = 102;
extern const char* const kULScriptToCName[kULScriptToCNameSize] = {
"ULScript_Common", // 0 Zyyy
"ULScript_Latin", // 1 Latn
"ULScript_Greek", // 2 Grek
"ULScript_Cyrillic", // 3 Cyrl
"ULScript_Armenian", // 4 Armn
"ULScript_Hebrew", // 5 Hebr
"ULScript_Arabic", // 6 Arab
"ULScript_Syriac", // 7 Syrc
"ULScript_Thaana", // 8 Thaa
"ULScript_Devanagari", // 9 Deva
"ULScript_Bengali", // 10 Beng
"ULScript_Gurmukhi", // 11 Guru
"ULScript_Gujarati", // 12 Gujr
"ULScript_Oriya", // 13 Orya
"ULScript_Tamil", // 14 Taml
"ULScript_Telugu", // 15 Telu
"ULScript_Kannada", // 16 Knda
"ULScript_Malayalam", // 17 Mlym
"ULScript_Sinhala", // 18 Sinh
"ULScript_Thai", // 19 Thai
"ULScript_Lao", // 20 Laoo
"ULScript_Tibetan", // 21 Tibt
"ULScript_Myanmar", // 22 Mymr
"ULScript_Georgian", // 23 Geor
"ULScript_Hani", // 24 Hani
"ULScript_Ethiopic", // 25 Ethi
"ULScript_Cherokee", // 26 Cher
"ULScript_Canadian_Aboriginal", // 27 Cans
"ULScript_Ogham", // 28 Ogam
"ULScript_Runic", // 29 Runr
"ULScript_Khmer", // 30 Khmr
"ULScript_Mongolian", // 31 Mong
"ULScript_32", // 32
"ULScript_33", // 33
"ULScript_Bopomofo", // 34 Bopo
"ULScript_35", // 35
"ULScript_Yi", // 36 Yiii
"ULScript_Old_Italic", // 37 Ital
"ULScript_Gothic", // 38 Goth
"ULScript_Deseret", // 39 Dsrt
"ULScript_Inherited", // 40 Zinh
"ULScript_Tagalog", // 41 Tglg
"ULScript_Hanunoo", // 42 Hano
"ULScript_Buhid", // 43 Buhd
"ULScript_Tagbanwa", // 44 Tagb
"ULScript_Limbu", // 45 Limb
"ULScript_Tai_Le", // 46 Tale
"ULScript_Linear_B", // 47 Linb
"ULScript_Ugaritic", // 48 Ugar
"ULScript_Shavian", // 49 Shaw
"ULScript_Osmanya", // 50 Osma
"ULScript_Cypriot", // 51 Cprt
"ULScript_Braille", // 52 Brai
"ULScript_Buginese", // 53 Bugi
"ULScript_Coptic", // 54 Copt
"ULScript_New_Tai_Lue", // 55 Talu
"ULScript_Glagolitic", // 56 Glag
"ULScript_Tifinagh", // 57 Tfng
"ULScript_Syloti_Nagri", // 58 Sylo
"ULScript_Old_Persian", // 59 Xpeo
"ULScript_Kharoshthi", // 60 Khar
"ULScript_Balinese", // 61 Bali
"ULScript_Cuneiform", // 62 Xsux
"ULScript_Phoenician", // 63 Phnx
"ULScript_Phags_Pa", // 64 Phag
"ULScript_Nko", // 65 Nkoo
"ULScript_Sundanese", // 66 Sund
"ULScript_Lepcha", // 67 Lepc
"ULScript_Ol_Chiki", // 68 Olck
"ULScript_Vai", // 69 Vaii
"ULScript_Saurashtra", // 70 Saur
"ULScript_Kayah_Li", // 71 Kali
"ULScript_Rejang", // 72 Rjng
"ULScript_Lycian", // 73 Lyci
"ULScript_Carian", // 74 Cari
"ULScript_Lydian", // 75 Lydi
"ULScript_Cham", // 76 Cham
"ULScript_Tai_Tham", // 77 Lana
"ULScript_Tai_Viet", // 78 Tavt
"ULScript_Avestan", // 79 Avst
"ULScript_Egyptian_Hieroglyphs", // 80 Egyp
"ULScript_Samaritan", // 81 Samr
"ULScript_Lisu", // 82 Lisu
"ULScript_Bamum", // 83 Bamu
"ULScript_Javanese", // 84 Java
"ULScript_Meetei_Mayek", // 85 Mtei
"ULScript_Imperial_Aramaic", // 86 Armi
"ULScript_Old_South_Arabian", // 87 Sarb
"ULScript_Inscriptional_Parthian", // 88 Prti
"ULScript_Inscriptional_Pahlavi", // 89 Phli
"ULScript_Old_Turkic", // 90 Orkh
"ULScript_Kaithi", // 91 Kthi
"ULScript_Batak", // 92 Batk
"ULScript_Brahmi", // 93 Brah
"ULScript_Mandaic", // 94 Mand
"ULScript_Chakma", // 95 Cakm
"ULScript_Meroitic_Cursive", // 96 Merc
"ULScript_Meroitic_Hieroglyphs", // 97 Mero
"ULScript_Miao", // 98 Plrd
"ULScript_Sharada", // 99 Shrd
"ULScript_Sora_Sompeng", // 100 Sora
"ULScript_Takri", // 101 Takr
};
// Subscripted by enum ULScript
// Maps each ULScript value to a ULScriptRType. The trailing comment on
// every row gives the index and the four-letter code.
// NOTE(review): the meaning of RTypeNone / RTypeOne / RTypeMany / RTypeCJK
// is not defined in this section; presumably it describes how many
// languages are recognized for the script -- confirm against the consumer.
extern const int kULScriptToRtypeSize = 102;
extern const ULScriptRType kULScriptToRtype[kULScriptToRtypeSize] = {
RTypeNone, // 0 Zyyy
RTypeMany, // 1 Latn
RTypeOne, // 2 Grek
RTypeMany, // 3 Cyrl
RTypeOne, // 4 Armn
RTypeMany, // 5 Hebr
RTypeMany, // 6 Arab
RTypeOne, // 7 Syrc
RTypeOne, // 8 Thaa
RTypeMany, // 9 Deva
RTypeMany, // 10 Beng
RTypeOne, // 11 Guru
RTypeOne, // 12 Gujr
RTypeOne, // 13 Orya
RTypeOne, // 14 Taml
RTypeOne, // 15 Telu
RTypeOne, // 16 Knda
RTypeOne, // 17 Mlym
RTypeOne, // 18 Sinh
RTypeOne, // 19 Thai
RTypeOne, // 20 Laoo
RTypeMany, // 21 Tibt
RTypeOne, // 22 Mymr
RTypeOne, // 23 Geor
RTypeCJK, // 24 Hani
RTypeMany, // 25 Ethi
RTypeOne, // 26 Cher
RTypeOne, // 27 Cans
RTypeNone, // 28 Ogam
RTypeNone, // 29 Runr
RTypeOne, // 30 Khmr
RTypeOne, // 31 Mong
RTypeNone, // 32
RTypeNone, // 33
RTypeNone, // 34 Bopo
RTypeNone, // 35
RTypeNone, // 36 Yiii
RTypeNone, // 37 Ital
RTypeNone, // 38 Goth
RTypeNone, // 39 Dsrt
RTypeNone, // 40 Zinh
RTypeOne, // 41 Tglg
RTypeNone, // 42 Hano
RTypeNone, // 43 Buhd
RTypeNone, // 44 Tagb
RTypeOne, // 45 Limb
RTypeNone, // 46 Tale
RTypeNone, // 47 Linb
RTypeNone, // 48 Ugar
RTypeNone, // 49 Shaw
RTypeNone, // 50 Osma
RTypeNone, // 51 Cprt
RTypeNone, // 52 Brai
RTypeNone, // 53 Bugi
RTypeNone, // 54 Copt
RTypeNone, // 55 Talu
RTypeNone, // 56 Glag
RTypeNone, // 57 Tfng
RTypeNone, // 58 Sylo
RTypeNone, // 59 Xpeo
RTypeNone, // 60 Khar
RTypeNone, // 61 Bali
RTypeNone, // 62 Xsux
RTypeNone, // 63 Phnx
RTypeNone, // 64 Phag
RTypeNone, // 65 Nkoo
RTypeNone, // 66 Sund
RTypeNone, // 67 Lepc
RTypeNone, // 68 Olck
RTypeNone, // 69 Vaii
RTypeNone, // 70 Saur
RTypeNone, // 71 Kali
RTypeNone, // 72 Rjng
RTypeNone, // 73 Lyci
RTypeNone, // 74 Cari
RTypeNone, // 75 Lydi
RTypeNone, // 76 Cham
RTypeNone, // 77 Lana
RTypeNone, // 78 Tavt
RTypeNone, // 79 Avst
RTypeNone, // 80 Egyp
RTypeNone, // 81 Samr
RTypeNone, // 82 Lisu
RTypeNone, // 83 Bamu
RTypeNone, // 84 Java
RTypeNone, // 85 Mtei
RTypeNone, // 86 Armi
RTypeNone, // 87 Sarb
RTypeNone, // 88 Prti
RTypeNone, // 89 Phli
RTypeNone, // 90 Orkh
RTypeNone, // 91 Kthi
RTypeNone, // 92 Batk
RTypeNone, // 93 Brah
RTypeNone, // 94 Mand
RTypeNone, // 95 Cakm
RTypeNone, // 96 Merc
RTypeNone, // 97 Mero
RTypeNone, // 98 Plrd
RTypeNone, // 99 Shrd
RTypeNone, // 100 Sora
RTypeNone, // 101 Takr
};
// Subscripted by enum ULScript
// NOTE(review): only the size constant is defined here; no
// kULScriptToDefaultLang table follows it in this section -- confirm whether
// the table lives elsewhere or was intentionally dropped.
extern const int kULScriptToDefaultLangSize = 102;
// Alphabetical order for binary search
// Maps a long script name to its ULScript value (the int in each pair); the
// trailing comment on every row is the four-letter code. Rows must stay
// sorted by name. "Han" is listed three times (Hant, Hans, Hani) and
// "Hangul", "Hani", "Hiragana" and "Katakana" are listed as well; every one
// of these maps to 24, so it does not matter which duplicate a search hits.
extern const int kNameToULScriptSize = 105;
extern const CharIntPair kNameToULScript[kNameToULScriptSize] = {
{"Arabic", 6}, // Arab
{"Armenian", 4}, // Armn
{"Avestan", 79}, // Avst
{"Balinese", 61}, // Bali
{"Bamum", 83}, // Bamu
{"Batak", 92}, // Batk
{"Bengali", 10}, // Beng
{"Bopomofo", 34}, // Bopo
{"Brahmi", 93}, // Brah
{"Braille", 52}, // Brai
{"Buginese", 53}, // Bugi
{"Buhid", 43}, // Buhd
{"Canadian_Aboriginal", 27}, // Cans
{"Carian", 74}, // Cari
{"Chakma", 95}, // Cakm
{"Cham", 76}, // Cham
{"Cherokee", 26}, // Cher
{"Common", 0}, // Zyyy
{"Coptic", 54}, // Copt
{"Cuneiform", 62}, // Xsux
{"Cypriot", 51}, // Cprt
{"Cyrillic", 3}, // Cyrl
{"Deseret", 39}, // Dsrt
{"Devanagari", 9}, // Deva
{"Egyptian_Hieroglyphs", 80}, // Egyp
{"Ethiopic", 25}, // Ethi
{"Georgian", 23}, // Geor
{"Glagolitic", 56}, // Glag
{"Gothic", 38}, // Goth
{"Greek", 2}, // Grek
{"Gujarati", 12}, // Gujr
{"Gurmukhi", 11}, // Guru
{"Han", 24}, // Hant
{"Han", 24}, // Hans
{"Han", 24}, // Hani
{"Hangul", 24}, // Hang
{"Hani", 24}, // Hani
{"Hanunoo", 42}, // Hano
{"Hebrew", 5}, // Hebr
{"Hiragana", 24}, // Hira
{"Imperial_Aramaic", 86}, // Armi
{"Inherited", 40}, // Zinh
{"Inscriptional_Pahlavi", 89}, // Phli
{"Inscriptional_Parthian", 88}, // Prti
{"Javanese", 84}, // Java
{"Kaithi", 91}, // Kthi
{"Kannada", 16}, // Knda
{"Katakana", 24}, // Kana
{"Kayah_Li", 71}, // Kali
{"Kharoshthi", 60}, // Khar
{"Khmer", 30}, // Khmr
{"Lao", 20}, // Laoo
{"Latin", 1}, // Latn
{"Lepcha", 67}, // Lepc
{"Limbu", 45}, // Limb
{"Linear_B", 47}, // Linb
{"Lisu", 82}, // Lisu
{"Lycian", 73}, // Lyci
{"Lydian", 75}, // Lydi
{"Malayalam", 17}, // Mlym
{"Mandaic", 94}, // Mand
{"Meetei_Mayek", 85}, // Mtei
{"Meroitic_Cursive", 96}, // Merc
{"Meroitic_Hieroglyphs", 97}, // Mero
{"Miao", 98}, // Plrd
{"Mongolian", 31}, // Mong
{"Myanmar", 22}, // Mymr
{"New_Tai_Lue", 55}, // Talu
{"Nko", 65}, // Nkoo
{"Ogham", 28}, // Ogam
{"Ol_Chiki", 68}, // Olck
{"Old_Italic", 37}, // Ital
{"Old_Persian", 59}, // Xpeo
{"Old_South_Arabian", 87}, // Sarb
{"Old_Turkic", 90}, // Orkh
{"Oriya", 13}, // Orya
{"Osmanya", 50}, // Osma
{"Phags_Pa", 64}, // Phag
{"Phoenician", 63}, // Phnx
{"Rejang", 72}, // Rjng
{"Runic", 29}, // Runr
{"Samaritan", 81}, // Samr
{"Saurashtra", 70}, // Saur
{"Sharada", 99}, // Shrd
{"Shavian", 49}, // Shaw
{"Sinhala", 18}, // Sinh
{"Sora_Sompeng", 100}, // Sora
{"Sundanese", 66}, // Sund
{"Syloti_Nagri", 58}, // Sylo
{"Syriac", 7}, // Syrc
{"Tagalog", 41}, // Tglg
{"Tagbanwa", 44}, // Tagb
{"Tai_Le", 46}, // Tale
{"Tai_Tham", 77}, // Lana
{"Tai_Viet", 78}, // Tavt
{"Takri", 101}, // Takr
{"Tamil", 14}, // Taml
{"Telugu", 15}, // Telu
{"Thaana", 8}, // Thaa
{"Thai", 19}, // Thai
{"Tibetan", 21}, // Tibt
{"Tifinagh", 57}, // Tfng
{"Ugaritic", 48}, // Ugar
{"Vai", 69}, // Vaii
{"Yi", 36}, // Yiii
};
// Alphabetical order for binary search
// Maps a four-letter script code to its ULScript value (the int in each
// pair). Rows must stay sorted by code. "Hani" is listed twice and "Hang",
// "Hans", "Hant", "Hira" and "Kana" are listed as well; every one of these
// maps to 24, so it does not matter which duplicate a search hits.
// The array is dimensioned with its own size constant (kCodeToULScriptSize)
// so that it cannot silently pick up the row count of kNameToULScript.
extern const int kCodeToULScriptSize = 105;
extern const CharIntPair kCodeToULScript[kCodeToULScriptSize] = {
{"Arab", 6}, // Arab
{"Armi", 86}, // Armi
{"Armn", 4}, // Armn
{"Avst", 79}, // Avst
{"Bali", 61}, // Bali
{"Bamu", 83}, // Bamu
{"Batk", 92}, // Batk
{"Beng", 10}, // Beng
{"Bopo", 34}, // Bopo
{"Brah", 93}, // Brah
{"Brai", 52}, // Brai
{"Bugi", 53}, // Bugi
{"Buhd", 43}, // Buhd
{"Cakm", 95}, // Cakm
{"Cans", 27}, // Cans
{"Cari", 74}, // Cari
{"Cham", 76}, // Cham
{"Cher", 26}, // Cher
{"Copt", 54}, // Copt
{"Cprt", 51}, // Cprt
{"Cyrl", 3}, // Cyrl
{"Deva", 9}, // Deva
{"Dsrt", 39}, // Dsrt
{"Egyp", 80}, // Egyp
{"Ethi", 25}, // Ethi
{"Geor", 23}, // Geor
{"Glag", 56}, // Glag
{"Goth", 38}, // Goth
{"Grek", 2}, // Grek
{"Gujr", 12}, // Gujr
{"Guru", 11}, // Guru
{"Hang", 24}, // Hang
{"Hani", 24}, // Hani
{"Hani", 24}, // Hani
{"Hano", 42}, // Hano
{"Hans", 24}, // Hans
{"Hant", 24}, // Hant
{"Hebr", 5}, // Hebr
{"Hira", 24}, // Hira
{"Ital", 37}, // Ital
{"Java", 84}, // Java
{"Kali", 71}, // Kali
{"Kana", 24}, // Kana
{"Khar", 60}, // Khar
{"Khmr", 30}, // Khmr
{"Knda", 16}, // Knda
{"Kthi", 91}, // Kthi
{"Lana", 77}, // Lana
{"Laoo", 20}, // Laoo
{"Latn", 1}, // Latn
{"Lepc", 67}, // Lepc
{"Limb", 45}, // Limb
{"Linb", 47}, // Linb
{"Lisu", 82}, // Lisu
{"Lyci", 73}, // Lyci
{"Lydi", 75}, // Lydi
{"Mand", 94}, // Mand
{"Merc", 96}, // Merc
{"Mero", 97}, // Mero
{"Mlym", 17}, // Mlym
{"Mong", 31}, // Mong
{"Mtei", 85}, // Mtei
{"Mymr", 22}, // Mymr
{"Nkoo", 65}, // Nkoo
{"Ogam", 28}, // Ogam
{"Olck", 68}, // Olck
{"Orkh", 90}, // Orkh
{"Orya", 13}, // Orya
{"Osma", 50}, // Osma
{"Phag", 64}, // Phag
{"Phli", 89}, // Phli
{"Phnx", 63}, // Phnx
{"Plrd", 98}, // Plrd
{"Prti", 88}, // Prti
{"Rjng", 72}, // Rjng
{"Runr", 29}, // Runr
{"Samr", 81}, // Samr
{"Sarb", 87}, // Sarb
{"Saur", 70}, // Saur
{"Shaw", 49}, // Shaw
{"Shrd", 99}, // Shrd
{"Sinh", 18}, // Sinh
{"Sora", 100}, // Sora
{"Sund", 66}, // Sund
{"Sylo", 58}, // Sylo
{"Syrc", 7}, // Syrc
{"Tagb", 44}, // Tagb
{"Takr", 101}, // Takr
{"Tale", 46}, // Tale
{"Talu", 55}, // Talu
{"Taml", 14}, // Taml
{"Tavt", 78}, // Tavt
{"Telu", 15}, // Telu
{"Tfng", 57}, // Tfng
{"Tglg", 41}, // Tglg
{"Thaa", 8}, // Thaa
{"Thai", 19}, // Thai
{"Tibt", 21}, // Tibt
{"Ugar", 48}, // Ugar
{"Vaii", 69}, // Vaii
{"Xpeo", 59}, // Xpeo
{"Xsux", 62}, // Xsux
{"Yiii", 36}, // Yiii
{"Zinh", 40}, // Zinh
{"Zyyy", 0}, // Zyyy
};
} // namespace CLD2
} // namespace chrome_lang_id

View File

@@ -0,0 +1,142 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_ulscript.h
// Machine generated. Do Not Edit.
//
// Declarations for scripts recognized by CLD2
//
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
namespace chrome_lang_id {
namespace CLD2 {
// Per-script recognition type; the four values are numbered 0 through 3.
typedef enum {
  RTypeNone = 0,
  RTypeOne = 1,
  RTypeMany = 2,
  RTypeCJK = 3
} ULScriptRType;
// One row of a lookup table: a C-string key paired with an integer value.
typedef struct {
  const char* s;  // key
  int i;          // value associated with the key
} CharIntPair;
// Script identifiers. The values are contiguous from 0 to 101 and each row's
// trailing comment is the script's four-letter code. ULScript_32,
// ULScript_33 and ULScript_35 have no code listed.
typedef enum {
ULScript_Common = 0, // Zyyy
ULScript_Latin = 1, // Latn
ULScript_Greek = 2, // Grek
ULScript_Cyrillic = 3, // Cyrl
ULScript_Armenian = 4, // Armn
ULScript_Hebrew = 5, // Hebr
ULScript_Arabic = 6, // Arab
ULScript_Syriac = 7, // Syrc
ULScript_Thaana = 8, // Thaa
ULScript_Devanagari = 9, // Deva
ULScript_Bengali = 10, // Beng
ULScript_Gurmukhi = 11, // Guru
ULScript_Gujarati = 12, // Gujr
ULScript_Oriya = 13, // Orya
ULScript_Tamil = 14, // Taml
ULScript_Telugu = 15, // Telu
ULScript_Kannada = 16, // Knda
ULScript_Malayalam = 17, // Mlym
ULScript_Sinhala = 18, // Sinh
ULScript_Thai = 19, // Thai
ULScript_Lao = 20, // Laoo
ULScript_Tibetan = 21, // Tibt
ULScript_Myanmar = 22, // Mymr
ULScript_Georgian = 23, // Geor
ULScript_Hani = 24, // Hani
ULScript_Ethiopic = 25, // Ethi
ULScript_Cherokee = 26, // Cher
ULScript_Canadian_Aboriginal = 27, // Cans
ULScript_Ogham = 28, // Ogam
ULScript_Runic = 29, // Runr
ULScript_Khmer = 30, // Khmr
ULScript_Mongolian = 31, // Mong
ULScript_32 = 32, //
ULScript_33 = 33, //
ULScript_Bopomofo = 34, // Bopo
ULScript_35 = 35, //
ULScript_Yi = 36, // Yiii
ULScript_Old_Italic = 37, // Ital
ULScript_Gothic = 38, // Goth
ULScript_Deseret = 39, // Dsrt
ULScript_Inherited = 40, // Zinh
ULScript_Tagalog = 41, // Tglg
ULScript_Hanunoo = 42, // Hano
ULScript_Buhid = 43, // Buhd
ULScript_Tagbanwa = 44, // Tagb
ULScript_Limbu = 45, // Limb
ULScript_Tai_Le = 46, // Tale
ULScript_Linear_B = 47, // Linb
ULScript_Ugaritic = 48, // Ugar
ULScript_Shavian = 49, // Shaw
ULScript_Osmanya = 50, // Osma
ULScript_Cypriot = 51, // Cprt
ULScript_Braille = 52, // Brai
ULScript_Buginese = 53, // Bugi
ULScript_Coptic = 54, // Copt
ULScript_New_Tai_Lue = 55, // Talu
ULScript_Glagolitic = 56, // Glag
ULScript_Tifinagh = 57, // Tfng
ULScript_Syloti_Nagri = 58, // Sylo
ULScript_Old_Persian = 59, // Xpeo
ULScript_Kharoshthi = 60, // Khar
ULScript_Balinese = 61, // Bali
ULScript_Cuneiform = 62, // Xsux
ULScript_Phoenician = 63, // Phnx
ULScript_Phags_Pa = 64, // Phag
ULScript_Nko = 65, // Nkoo
ULScript_Sundanese = 66, // Sund
ULScript_Lepcha = 67, // Lepc
ULScript_Ol_Chiki = 68, // Olck
ULScript_Vai = 69, // Vaii
ULScript_Saurashtra = 70, // Saur
ULScript_Kayah_Li = 71, // Kali
ULScript_Rejang = 72, // Rjng
ULScript_Lycian = 73, // Lyci
ULScript_Carian = 74, // Cari
ULScript_Lydian = 75, // Lydi
ULScript_Cham = 76, // Cham
ULScript_Tai_Tham = 77, // Lana
ULScript_Tai_Viet = 78, // Tavt
ULScript_Avestan = 79, // Avst
ULScript_Egyptian_Hieroglyphs = 80, // Egyp
ULScript_Samaritan = 81, // Samr
ULScript_Lisu = 82, // Lisu
ULScript_Bamum = 83, // Bamu
ULScript_Javanese = 84, // Java
ULScript_Meetei_Mayek = 85, // Mtei
ULScript_Imperial_Aramaic = 86, // Armi
ULScript_Old_South_Arabian = 87, // Sarb
ULScript_Inscriptional_Parthian = 88, // Prti
ULScript_Inscriptional_Pahlavi = 89, // Phli
ULScript_Old_Turkic = 90, // Orkh
ULScript_Kaithi = 91, // Kthi
ULScript_Batak = 92, // Batk
ULScript_Brahmi = 93, // Brah
ULScript_Mandaic = 94, // Mand
ULScript_Chakma = 95, // Cakm
ULScript_Meroitic_Cursive = 96, // Merc
ULScript_Meroitic_Hieroglyphs = 97, // Mero
ULScript_Miao = 98, // Plrd
ULScript_Sharada = 99, // Shrd
ULScript_Sora_Sompeng = 100, // Sora
ULScript_Takri = 101, // Takr
// Not a script: takes the next value after ULScript_Takri, i.e. 102, the
// number of enumerators above.
NUM_ULSCRIPTS
} ULScript;
// Value used when the script is not known; expands to ULScript_Common (0).
#define UNKNOWN_ULSCRIPT ULScript_Common
} // namespace CLD2
} // namespace chrome_lang_id
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,124 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
#define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
#include "generated_ulscript.h"
#include "integral_types.h"
#include "offsetmap.h"
namespace chrome_lang_id {
namespace CLD2 {
// Sizing constants for script-span buffers (all values in bytes).
static constexpr int kMaxScriptBuffer = 40960;
// One and a half times kMaxScriptBuffer.
static constexpr int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
// kMaxScriptBuffer less 32 bytes, to leave some room.
static constexpr int kMaxScriptBytes = kMaxScriptBuffer - 32;
// Stop at a word space within the last kWithinScriptTail bytes of the
// script buffer.
static constexpr int kWithinScriptTail = 32;
// Describes one span of text produced by the scanner. Every field has a
// default, so a default-constructed LangSpan is empty with unknown script.
struct LangSpan {
  // Pointer to the span, somewhere.
  char* text{nullptr};
  // Number of bytes of text in the span.
  int text_bytes{0};
  // Offset of the start of the span in the original input buffer.
  int offset{0};
  // Unicode Letters Script of this span.
  ULScript ulscript{UNKNOWN_ULSCRIPT};
  // true if the buffer filled up before a different script or EOF was found.
  bool truncated{false};
};
// Returns true when c lies in 0x80..0xBF, i.e. its top two bits are 10 (the
// UTF-8 continuation-byte pattern).
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
// Gets lscript number for letters; always returns
// 0 (common script) for non-letters
// NOTE(review): presumably src points at the first byte of a UTF-8
// character -- confirm against the implementation.
int GetUTF8LetterScriptNum(const char* src);
// Update src pointer to point to next quadgram, +2..+5
// Looks at src[0..4]
// Returns the advanced pointer; src itself is passed by value.
const char* AdvanceQuad(const char* src);
// Utility routine to search alphabetical tables
// cipair is a table of CharIntPair rows; key is the string looked up.
// NOTE(review): whether hi is inclusive or exclusive, and what is returned
// when key is absent, is not visible from this declaration -- confirm.
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8
// byte_length is the number of bytes of src to examine.
int SpanInterchangeValid(const char* src, int byte_length);
// Scans an input buffer and hands back spans of text one at a time through
// LangSpan (see the Get* methods below).
// NOTE(review): the class declares a destructor and holds raw char* buffers
// but declares no copy constructor or copy assignment; if the buffers are
// owned by the scanner, copying an instance would be unsafe -- confirm
// against the implementation.
class ScriptScanner {
public:
// buffer / buffer_length give the text to scan; is_plain_text is true for
// text and false for HTML.
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
// NOTE(review): any_text and any_script presumably select the "any mixture"
// modes described on letters_marks_only_ and one_script_only_ -- confirm.
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
bool any_text, bool any_script);
~ScriptScanner();
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
bool GetOneScriptSpan(LangSpan* span);
// Force Latin and Cyrillic scripts to be lowercase
void LowerScriptSpan(LangSpan* span);
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Force Latin and Cyrillic scripts to be lowercase
bool GetOneScriptSpanLower(LangSpan* span);
// Copy next run of non-tag characters to buffer [NUL terminated]
// This just removes tags and removes entities
// Buffer has leading space
bool GetOneTextSpan(LangSpan* span);
// Maps byte offset in most recent GetOneScriptSpan/Lower
// span->text [0..text_bytes] into an additional byte offset from
// span->offset, to get back to corresponding text in the original
// input buffer.
// text_offset must be the first byte
// of a UTF-8 character, or just beyond the last character. Normally this
// routine is called with the first byte of an interesting range and
// again with the first byte of the following range.
int MapBack(int text_offset);
// Returns the pointer stored in start_byte_ (start of the buffer to scan).
const char* GetBufferStart() {return start_byte_;}
private:
// Skip over tags and non-letters
int SkipToFrontOfSpan(const char* src, int len, int* script);
const char* start_byte_; // Starting byte of buffer to scan
const char* next_byte_; // First unscanned byte
int byte_length_; // Bytes left
bool is_plain_text_; // true for text, false for HTML
char* script_buffer_; // Holds text with expanded entities
char* script_buffer_lower_; // Holds lowercased text
bool letters_marks_only_; // To distinguish scriptspan of one
// letters/marks vs. any mixture of text
bool one_script_only_; // To distinguish scriptspan of one
// script vs. any mixture of scripts
int exit_state_; // For tag parser kTagParseTbl_0, based
// on letters_marks_only_
public :
// Expose for debugging
OffsetMap map2original_; // map from script_buffer_ to buffer
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
};
} // namespace CLD2
} // namespace chrome_lang_id
#endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_

View File

@@ -0,0 +1,135 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "getonescriptspan.h"
#include <iostream>
#include <string>
#include <vector>
namespace chrome_lang_id {
namespace CLD2 {
namespace getonescriptspan_test {
// Tests invalid and interchange-invalid input. Returns "true" if the test is
// successful and "false" otherwise.
//
// Each invalid byte sequence is placed between a valid prefix and a valid
// suffix; SpanInterchangeValid() is expected to report exactly the prefix as
// the interchange-valid span. All inputs are checked even after a failure.
bool TestInvalidUTF8Input() {
  std::cout << "Running " << __FUNCTION__ << std::endl;
  const std::vector<std::string> invalid_strings{"\xC0\xA9",
                                                 "\377\377\377\377"};
  const std::string gold_valid_prefix = "Some valid bytes followed by ";
  // Iterates over the invalid strings, inserts each of them in the middle of a
  // piece of text, and checks whether these strings are correctly identified.
  bool test_successful = true;
  for (size_t i = 0; i < invalid_strings.size(); ++i) {
    // The input is built from gold_valid_prefix itself so that the expected
    // prefix and the actual input can never drift apart.
    const std::string text = gold_valid_prefix + invalid_strings.at(i) +
                             " and then valid ones again.";
    const int num_valid_bytes =
        SpanInterchangeValid(text.c_str(), static_cast<int>(text.size()));
    std::cout << " Testing input string at position " << i << std::endl;

    // Guard against a bogus count before using it as a length: a negative or
    // oversized value must fail the test, not read out of bounds.
    const bool count_in_range =
        (num_valid_bytes >= 0) &&
        (static_cast<size_t>(num_valid_bytes) <= text.size());
    if (!count_in_range) {
      std::cout << " Failure" << std::endl;
      std::cout << " Gold: " << gold_valid_prefix << std::endl;
      std::cout << " Detected byte count out of range: " << num_valid_bytes
                << std::endl;
      test_successful = false;
      continue;
    }

    const std::string detected_valid_prefix(text.c_str(), num_valid_bytes);
    if (detected_valid_prefix == gold_valid_prefix) {
      std::cout << " Success!" << std::endl;
    } else {
      std::cout << " Failure" << std::endl;
      std::cout << " Gold: " << gold_valid_prefix << std::endl;
      std::cout << " Detected: " << detected_valid_prefix << std::endl;
      test_successful = false;
    }
  }
  return test_successful;
}
// Checks that ScriptScanner splits mixed-script text into the expected
// spans. Returns "true" when the detected spans match the gold spans in
// number and content, "false" otherwise.
bool TestScriptDetection() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // English, then Bulgarian, then English again.
  const std::string text =
      "Text in English. Текст на Български. Also text in English.";
  const std::vector<std::string> gold_script_spans{
      " Text in English ", " Текст на Български ", " Also text in English "};

  // Collect every span the scanner yields.
  ScriptScanner ss(text.c_str(), text.size(), /*is_plain_text=*/true);
  std::vector<std::string> detected_script_spans;
  for (LangSpan script_span; ss.GetOneScriptSpan(&script_span);) {
    detected_script_spans.push_back(
        std::string(script_span.text, script_span.text_bytes));
  }

  // The span counts must agree before the contents are compared.
  const size_t gold_count = gold_script_spans.size();
  const size_t detected_count = detected_script_spans.size();
  if (detected_count != gold_count) {
    std::cout << " Failure" << std::endl;
    std::cout << " Number of gold spans " << gold_count << std::endl;
    std::cout << " Number of detected spans " << detected_count << std::endl;
    return false;
  }

  // Report the first mismatching span, if any.
  for (size_t idx = 0; idx < detected_count; ++idx) {
    const std::string& detected = detected_script_spans.at(idx);
    const std::string& gold = gold_script_spans.at(idx);
    if (detected == gold) {
      continue;
    }
    std::cout << " Failure" << std::endl;
    std::cout << " Gold span: " << gold << std::endl;
    std::cout << " Detected span: " << detected << std::endl;
    return false;
  }

  std::cout << " Success!" << std::endl;
  return true;
}
// Checks SpanInterchangeValid() on input whose byte limit falls in the
// middle of a multi-byte character. Returns "true" if the reported valid
// length stops before the split character and "false" otherwise.
bool TestStringCut() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Text in Bulgarian (Cyrillic script).
  const std::string text = "Текст на Български";

  // "Текст на " occupies 16 bytes and "Текст на Б" occupies 18, so a limit of
  // 17 bytes cuts the first character of the third word in half.
  const int expected_valid_bytes = 16;
  const int byte_limit = 17;

  const int actual_valid_bytes =
      SpanInterchangeValid(text.c_str(), byte_limit);
  if (actual_valid_bytes != expected_valid_bytes) {
    std::cout << " Failure" << std::endl;
    std::cout << " Size of gold interchange-valid span: "
              << expected_valid_bytes << std::endl;
    std::cout << " Size of detected span: " << actual_valid_bytes << std::endl;
    return false;
  }

  std::cout << " Success!" << std::endl;
  return true;
}
} // namespace getonescriptspan_test
} // namespace CLD2
} // namespace chrome_lang_id
// Runs the functions above.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::CLD2::getonescriptspan_test::TestInvalidUTF8Input() &&
chrome_lang_id::CLD2::getonescriptspan_test::TestScriptDetection() &&
chrome_lang_id::CLD2::getonescriptspan_test::TestStringCut();
return tests_successful ? 0 : 1;
}

View File

@@ -0,0 +1,37 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
#define SCRIPT_SPAN_INTEGRAL_TYPES_H_
// Cheap version
namespace chrome_lang_id {
namespace CLD2 {
// Short names for the built-in integer types. Widths are whatever the
// platform gives the underlying built-in type.
using uint8 = unsigned char;
using uint16 = unsigned short;
using uint32 = unsigned int;
using uint64 = unsigned long long int;

using int8 = signed char;
using int16 = signed short;
using int32 = signed int;
using int64 = signed long long int;

// A 32-bit-int-based character value.
using char32 = int32;
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_

View File

@@ -0,0 +1,478 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
//
#include "offsetmap.h"
#include <string.h> // for strcmp
#include <algorithm> // for min
using namespace std;
namespace chrome_lang_id {
namespace CLD2 {
// Construction puts the map into its cleared state.
OffsetMap::OffsetMap() {
  Clear();
}

// Destruction performs no work beyond destroying the members.
OffsetMap::~OffsetMap() = default;
// Clear the map: drop all recorded diffs and return to the initial state.
// After:
//   next_diff_sub_ is 0
//   Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1]
//   which is a fake range of width 0 mapping 0=>0
void OffsetMap::Clear() {
  // Recorded operations and the not-yet-emitted one.
  diffs_.clear();
  pending_length_ = 0;
  pending_op_ = COPY_OP;

  // Read position and current windows in A and A'.
  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = 0;
  current_lo_aprimeoffset_ = 0;
  current_hi_aoffset_ = 0;
  current_hi_aprimeoffset_ = 0;

  // Largest offsets seen so far.
  max_aoffset_ = 0;
  max_aprimeoffset_ = 0;
}
// Returns the top two bits of c as a value in 0..3 (the operation field of
// an encoded diff byte).
static inline char OpPart(const char c) {
  const unsigned char bits = static_cast<unsigned char>(c);
  return static_cast<char>(bits >> 6);
}
// Returns the low six bits of c as a value in 0..63 (the length field of an
// encoded diff byte).
static inline char LenPart(const char c) {
  const unsigned char bits = static_cast<unsigned char>(c);
  return static_cast<char>(bits & 0x3f);
}
// Reset to offset 0: flush anything still pending (via MaybeFlushAll), then
// rewind the read position and both windows to the start. The recorded
// diffs and the max offsets are left as they are.
void OffsetMap::Reset() {
  MaybeFlushAll();

  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = 0;
  current_lo_aprimeoffset_ = 0;
  current_hi_aoffset_ = 0;
  current_hi_aprimeoffset_ = 0;
}
// Extend the A => A' mapping with `bytes` bytes that are identical in A and
// A'. A zero count is ignored.
void OffsetMap::Copy(int bytes) {
  if (bytes == 0) {
    return;
  }

  // Both sides advance; these are the largest offsets seen so far.
  max_aoffset_ += bytes;
  max_aprimeoffset_ += bytes;

  if (pending_op_ != COPY_OP) {
    // A different operation is pending: emit it and start a new copy run.
    Flush();
    pending_op_ = COPY_OP;
    pending_length_ = bytes;
    return;
  }

  // Already accumulating a copy run; just lengthen it.
  pending_length_ += bytes;
}
// Extend the A => A' mapping with `bytes` bytes that are inserted in A'
// while A does not advance at all. A zero count is ignored.
void OffsetMap::Insert(int bytes) {
  if (bytes == 0) {
    return;
  }

  // Only the A' side advances; largest offset seen so far.
  max_aprimeoffset_ += bytes;

  if (pending_op_ == INSERT_OP) {
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly delete(1) insert(1) => copy(1);
  // all others backmap inserts to after deletes.
  const bool follows_single_delete =
      (pending_op_ == DELETE_OP) && (pending_length_ == 1);
  if ((bytes == 1) && follows_single_delete) {
    pending_op_ = COPY_OP;
    return;
  }

  Flush();
  pending_op_ = INSERT_OP;
  pending_length_ = bytes;
}
// Extend the A => A' mapping with `bytes` bytes that are deleted from A
// while A' does not advance at all. A zero count is ignored.
void OffsetMap::Delete(int bytes) {
  if (bytes == 0) {
    return;
  }

  // Only the A side advances; largest offset seen so far.
  max_aoffset_ += bytes;

  if (pending_op_ == DELETE_OP) {
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly insert(1) delete(1) => copy(1);
  // all others backmap deletes to after inserts.
  const bool follows_single_insert =
      (pending_op_ == INSERT_OP) && (pending_length_ == 1);
  if ((bytes == 1) && follows_single_insert) {
    pending_op_ = COPY_OP;
    return;
  }

  Flush();
  pending_op_ = DELETE_OP;
  pending_length_ = bytes;
}
void OffsetMap::Flush() {
if (pending_length_ == 0) {
return;
}
// We may be emitting a copy op just after a copy op because +1 -1 cancelled
// inbetween. If the lengths don't need a prefix byte, combine them
if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
char c = diffs_[diffs_.size() - 1];
MapOp prior_op = static_cast<MapOp>(OpPart(c));
int prior_len = LenPart(c);
if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
diffs_[diffs_.size() - 1] += pending_length_;
pending_length_ = 0;
return;
}
}
if (pending_length_ > 0x3f) {
bool non_zero_emitted = false;
for (int shift = 30; shift > 0; shift -= 6) {
int prefix = (pending_length_ >> shift) & 0x3f;
if ((prefix > 0) || non_zero_emitted) {
Emit(PREFIX_OP, prefix);
non_zero_emitted = true;
}
}
}
Emit(pending_op_, pending_length_ & 0x3f);
pending_length_ = 0;
}
// Add one more entry to copy one byte off the end, then flush.
// Note that Copy(1) also advances max_aoffset_ and max_aprimeoffset_ by one,
// and flushes whatever non-copy operation was pending before it.
void OffsetMap::FlushAll() {
Copy(1);
Flush();
}
// Run FlushAll() when there is something still pending, or when nothing has
// been emitted to diffs_ yet.
void OffsetMap::MaybeFlushAll() {
  const bool has_pending = (pending_length_ > 0);
  if (has_pending || diffs_.empty()) {
    FlushAll();
  }
}
// Append one encoded byte to diffs_: operator in the top two bits, low six
// bits of len below it. len may be 0, e.g. as the low piece of length=64.
void OffsetMap::Emit(MapOp op, int len) {
  const int op_field = static_cast<char>(op) << 6;
  const int len_field = len & 0x3f;
  const char encoded = op_field | len_field;
  diffs_.push_back(encoded);
}
//----------------------------------------------------------------------------//
// The guts of the 2013 design //
// If there are three ranges a b c in diffs_, we can be in one of five //
// states: LEFT of a, in ranges a b c, or RIGHT of c //
// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
// position next_diff_sub_ //
// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
// next_diff_sub_=diffs_.size() //
// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
// correspond to each other. If range i is active, next_diff_sub_ is at //
// the first byte of range i+1. Because of the length-prefix operator, //
// an individual range item in diffs_ may be multiple bytes //
// In all cases aprimeoffset = aoffset + current_diff_ //
// i.e. current_diff_ = aprimeoffset - aoffset //
// //
// In the degenerate case of diffs_.empty(), there are only two states //
// LEFT and RIGHT and the mapping is the identity mapping. //
// The initial state is LEFT. //
// It is an error to move left into LEFT or right into RIGHT, but the code //
// below is robust in these cases. //
//----------------------------------------------------------------------------//
// Enter the LEFT state: empty windows at offset 0 in both A and A', no
// offset difference, and the parse position at the start of diffs_.
void OffsetMap::SetLeft() {
  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = 0;
}
// Enter the RIGHT state: empty windows pinned at the largest offsets seen in
// A and A', with current_diff_ set to the difference between those maxima.
// NOTE(review): the design comment earlier in this file describes RIGHT as
// having next_diff_sub_ == diffs_.size(), but this resets it to 0 -- confirm
// which of the two is intended before relying on it.
void OffsetMap::SetRight() {
  next_diff_sub_ = 0;
  current_diff_ = max_aprimeoffset_ - max_aoffset_;
  current_lo_aoffset_ = current_hi_aoffset_ = max_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = max_aprimeoffset_;
}
// Back up over the previous range, which occupies 1..5 bytes of diffs_
// (one op byte optionally preceded by PREFIX_OP bytes).
// Returns the subscript of the first byte of that range. Pins at 0.
int OffsetMap::Backup(int sub) {
  if (sub <= 0) {return 0;}
  // Step onto the op byte that ends the previous range.
  --sub;
  // Keep stepping left while the byte before us is a length prefix that
  // belongs to this same range. The cast applies to OpPart()'s result; the
  // comparison against PREFIX_OP is done on the MapOp value.
  while ((0 < sub) &&
         (static_cast<MapOp>(OpPart(diffs_[sub - 1])) == PREFIX_OP)) {
    --sub;
  }
  return sub;
}
// Decode the range that starts at diffs_[sub]: zero or more PREFIX_OP bytes
// followed by one op byte, 1..5 bytes in total. Each byte contributes six
// more bits to *length, most significant group first.
// Returns the subscript just past the decoded range.
// If sub is already at the end of diffs_, or the data ends in prefix bytes
// with no following op (mal-formed), *op is left as PREFIX_OP.
int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
  *op = PREFIX_OP;
  *length = 0;
  while ((*op == PREFIX_OP) && (sub < static_cast<int>(diffs_.size()))) {
    const char code = diffs_[sub];
    ++sub;
    *op = static_cast<MapOp>(OpPart(code));
    *length = (*length << 6) + LenPart(code);
  }
  return sub;
}
// Decode the range that ends just before diffs_[sub]: back up to its first
// byte, then parse forward over it.
// Returns the subscript just past that range, as reported by ParseNext().
int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
  const int range_start = Backup(sub);
  return ParseNext(range_start, op, length);
}
// Move the active window one range to the right.
// Returns true if the move was OK. Returns false, after switching to the
// RIGHT state, when there is no further range or the next one is mal-formed.
bool OffsetMap::MoveRight() {
  // Already at the last range (or in RIGHT): nothing more to parse.
  if (next_diff_sub_ >= static_cast<int>(diffs_.size())) {
    SetRight();
    return false;
  }
  MapOp op;
  int length;
  // If mal-formed, this comes back with op == PREFIX_OP.
  next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);
  // The new window starts where the old one ended.
  current_lo_aoffset_ = current_hi_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_;
  // How far each side advances depends on the operation.
  int a_advance = 0;
  int aprime_advance = 0;
  bool moved = true;
  switch (op) {
    case COPY_OP:
      a_advance = length;
      aprime_advance = length;
      break;
    case INSERT_OP:
      aprime_advance = length;
      break;
    case DELETE_OP:
      a_advance = length;
      break;
    default:
      moved = false;
      break;
  }
  if (moved) {
    current_hi_aoffset_ = current_lo_aoffset_ + a_advance;
    current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + aprime_advance;
  } else {
    SetRight();
  }
  current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
  return moved;
}
// Move the active window one range to the left.
// Returns false, after switching to the LEFT state, when the window is
// already at the first range or in LEFT; returns true otherwise.
bool OffsetMap::MoveLeft() {
  // Back up over the currently active range, unless already at the start.
  if (next_diff_sub_ > 0) {
    next_diff_sub_ = Backup(next_diff_sub_);
  }
  // Nothing lies to the left of the first range.
  if (next_diff_sub_ <= 0) {
    SetLeft();
    return false;
  }
  MapOp op;
  int length;
  // If mal-formed, this comes back with op == PREFIX_OP.
  next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);
  // The new window ends where the old one started.
  current_hi_aoffset_ = current_lo_aoffset_;
  current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
  switch (op) {
    case COPY_OP:
      current_lo_aoffset_ = current_hi_aoffset_ - length;
      current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
      break;
    case INSERT_OP:
      current_lo_aoffset_ = current_hi_aoffset_ - 0;
      current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
      break;
    case DELETE_OP:
      current_lo_aoffset_ = current_hi_aoffset_ - length;
      current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
      break;
    default:
      SetLeft();
      break;
  }
  current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
  // TODO(abakalov): unlike MoveRight(), the mal-formed case above still
  // reports success. Did the authors mean to return false there?
  return true;
}
// Map an offset in A' to the corresponding offset in A.
// Negative offsets map to 0; offsets at or beyond the end of the recorded
// map are treated as a trailing identity copy.
int OffsetMap::MapBack(int aprimeoffset) {
  MaybeFlushAll();
  if (aprimeoffset < 0) {
    return 0;
  }
  if (aprimeoffset >= max_aprimeoffset_) {
    return (aprimeoffset - max_aprimeoffset_) + max_aoffset_;
  }
  // Slide the window until
  //   current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_
  // or until a move reports failure.
  bool moved = true;
  while (moved && (aprimeoffset < current_lo_aprimeoffset_)) {
    moved = MoveLeft();
  }
  while (moved && (aprimeoffset >= current_hi_aprimeoffset_)) {
    moved = MoveRight();
  }
  const int mapped = aprimeoffset - current_diff_;
  // Inside an insert region every A' byte back-maps to A = hi_aoffset_.
  return (mapped >= current_hi_aoffset_) ? current_hi_aoffset_ : mapped;
}
// Map an offset in A to the corresponding offset in A'.
// Negative offsets map to 0; offsets at or beyond the end of the recorded
// map are treated as a trailing identity copy.
int OffsetMap::MapForward(int aoffset) {
  MaybeFlushAll();
  if (aoffset < 0) {
    return 0;
  }
  if (aoffset >= max_aoffset_) {
    return (aoffset - max_aoffset_) + max_aprimeoffset_;
  }
  // Slide the window until
  //   current_lo_aoffset_ <= aoffset < current_hi_aoffset_
  // or until a move reports failure.
  bool moved = true;
  while (moved && (aoffset < current_lo_aoffset_)) {
    moved = MoveLeft();
  }
  while (moved && (aoffset >= current_hi_aoffset_)) {
    moved = MoveRight();
  }
  const int mapped = aoffset + current_diff_;
  // Inside a delete region every A byte maps to A' = hi_aprimeoffset_.
  return (mapped >= current_hi_aprimeoffset_) ? current_hi_aprimeoffset_
                                              : mapped;
}
// static
// Step source to the right one range at a time, replaying each range into
// dest as an Insert of its A' width. Stops and returns false as soon as a
// range that advances A (a non-insert range) becomes active; returns true
// if the loop ends without meeting one.
bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
  for (bool ok = true;
       ok && (source->next_diff_sub_ !=
              static_cast<int>(source->diffs_.size()));) {
    ok = source->MoveRight();
    const bool advances_a =
        (source->current_lo_aoffset_ != source->current_hi_aoffset_);
    if (advances_a) {
      return false;
    }
    const int inserted =
        source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_;
    dest->Insert(inserted);
  }
  return true;
}
// static
// Step source to the right one range at a time, replaying each range into
// dest as a Delete of its A width. Stops and returns false as soon as a
// range that advances A' (a non-delete range) becomes active; returns true
// if the loop ends without meeting one.
bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
  for (bool ok = true;
       ok && (source->next_diff_sub_ !=
              static_cast<int>(source->diffs_.size()));) {
    ok = source->MoveRight();
    const bool advances_aprime =
        (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_);
    if (advances_aprime) {
      return false;
    }
    const int deleted =
        source->current_hi_aoffset_ - source->current_lo_aoffset_;
    dest->Delete(deleted);
  }
  return true;
}
// static
// Build h = g composed with f, where f maps A to A', g maps A' to A'' and
// h maps A to A''. h is cleared first; f and g are reset (which also
// finishes building them) and are walked left to right, so their active
// windows are modified by this call.
// `lo` is the A' offset up to which both inputs have been composed so far.
void OffsetMap::ComposeOffsetMap(
OffsetMap* g, OffsetMap* f, OffsetMap* h) {
h->Clear();
f->Reset();
g->Reset();
int lo = 0;
for (;;) {
// Consume insert operations in g. This moves A'' without moving A
// and A'. CopyInserts() returning true means no non-insert range was
// met in the rest of g, so the composition is finished.
if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
// Drain any trailing delete operations of f as well.
if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
// fprintf(stderr,
// "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
}
// FlushAll(), called by Reset(), MapForward() or MapBack(), has
// added an extra COPY_OP to f and g, so this function has
// composed an extra COPY_OP in h from those. To keep a later
// FlushAll() from adding yet another extra COPY_OP to h, call
// Flush() right now.
h->Flush();
return;
}
// Consume delete operations in f. This moves A without moving
// A' and A''.
if (lo >= f->current_hi_aprimeoffset_) {
if (!CopyDeletes(f, h)) {
// fprintf(stderr,
// "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
}
}
// Compose one operation which moves A' from lo to hi.
int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
// f advances A and g advances A'': bytes survive both maps => copy.
if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
h->Copy(hi - lo);
// Only f advances A: the bytes are dropped by g => delete.
} else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
h->Delete(hi - lo);
// Only g advances A'': the bytes did not come from A => insert.
} else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
h->Insert(hi - lo);
}
lo = hi;
}
}
// For testing only -- replace the whole mapping with a caller-supplied
// encoded diff string and the matching end offsets for A and A'.
void OffsetMap::StuffIt(const std::string& diffs,
                        int max_aoffset, int max_aprimeoffset) {
  // Start from a cleared map, then overwrite its contents directly.
  Clear();
  max_aprimeoffset_ = max_aprimeoffset;
  max_aoffset_ = max_aoffset;
  diffs_ = diffs;
}
} // namespace CLD2
} // namespace chrome_lang_id

View File

@@ -0,0 +1,168 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef SCRIPT_SPAN_OFFSETMAP_H_
#define SCRIPT_SPAN_OFFSETMAP_H_
#include <string> // for string
#include "integral_types.h" // for uint32
// ***************************** OffsetMap **************************
//
// An OffsetMap object is a container for a mapping from offsets in one text
// buffer A' to offsets in another text buffer A. It is most useful when A' is
// built from A via substitutions that occasionally do not preserve byte length.
//
// A series of operators are used to build the correspondence map, then
// calls can be made to map an offset in A' to an offset in A, or vice versa.
// The map starts with offset 0 in A corresponding to offset 0 in A'.
// The mapping is then built sequentially, adding on byte ranges that are
// identical in A and A', byte ranges that are inserted in A', and byte ranges
// that are deleted from A. All bytes beyond those specified when building the
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
// end of the map.
//
// The internal data structure records positions at which bytes are added or
// deleted. Using the map is O(1) when increasing the A' or A offset
// monotonically, and O(n) when accessing random offsets, where n is the
// number of differences.
//
namespace chrome_lang_id {
namespace CLD2 {
// Records, as a sequence of copy/insert/delete runs, how buffer A' was
// derived from buffer A, and answers offset queries in both directions.
class OffsetMap {
public:
// Constructor, destructor
OffsetMap();
~OffsetMap();
// Clear the map
void Clear();
// Add to mapping from A to A', specifying how many next bytes correspond
// in A and A'
void Copy(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// inserted in A' while not advancing in A at all
void Insert(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// deleted from A while not advancing in A' at all
void Delete(int bytes);
// [Finish building map,] Re-position to offset 0
// This call is optional; MapForward and MapBack finish building the map
// if necessary
void Reset();
// Map an offset in A' to the corresponding offset in A
int MapBack(int aprimeoffset);
// Map an offset in A to the corresponding offset in A'
int MapForward(int aoffset);
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
// from A' to A'' and h is from A to A''.
//
// Note that g->MapForward(f->MapForward(aoffset)) always equals
// h->MapForward(aoffset), while
// f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
// h->MapBack(aprimeprimeoffset). This happens when a deletion in
// f and an insertion in g are at the same place. For example,
//
//   A     1   2   3   4
//         ^   |   ^   ^
//         |   |  /    |  f
//         v   vv      v
//   A'    1'  2'      3'
//         ^   ^^      ^
//         |   | \     |  g
//         v   |  v    v
//   A''   1'' 2'' 3'' 4''
//
// results in:
//
//   A     1   2   3   4
//         ^   ^\  ^   ^
//         |   | \ |   |  h
//         v   |  vv   v
//   A''   1'' 2'' 3'' 4''
//
// 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
// the latter figure.
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
// For testing only -- force a mapping
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
private:
// Operation kinds; the numeric value is what gets stored in the top two
// bits of each byte of diffs_.
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
// Write the pending operation, if any, to diffs_
void Flush();
// Append a final one-byte copy, then flush
void FlushAll();
// FlushAll() if an operation is pending or diffs_ is still empty
void MaybeFlushAll();
// Append one encoded byte (op plus low six bits of len) to diffs_
void Emit(MapOp op, int len);
// Put the active window in the LEFT / RIGHT state
void SetLeft();
void SetRight();
// Back up over previous range, 1..5 bytes
// Return subscript at the beginning of that. Pins at 0
int Backup(int sub);
// Parse next range, 1..5 bytes
// Return subscript just off the end of that
int ParseNext(int sub, MapOp* op, int* length);
// Parse previous range, 1..5 bytes
// Return current subscript
int ParsePrevious(int sub, MapOp* op, int* length);
bool MoveRight(); // Returns true if OK
bool MoveLeft(); // Returns true if OK
// Copies insert operations from source to dest. Returns true if no
// other operations are found.
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
// Copies delete operations from source to dest. Returns true if no other
// operations are found.
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
// Encoded operation stream, one or more bytes per range
std::string diffs_;
// Operation accumulated by Copy/Insert/Delete but not yet written to diffs_
MapOp pending_op_;
uint32 pending_length_;
// Offsets in the ranges below correspond to each other, with A' = A + diff
// next_diff_sub_ is the subscript in diffs_ of the range after the active one
int next_diff_sub_;
int current_lo_aoffset_;
int current_hi_aoffset_;
int current_lo_aprimeoffset_;
int current_hi_aprimeoffset_;
int current_diff_;
// Largest offsets in A and A' added to the map so far
int max_aoffset_;
int max_aprimeoffset_;
};
} // namespace CLD2
} // namespace chrome_lang_id
#endif // SCRIPT_SPAN_OFFSETMAP_H_

View File

@@ -0,0 +1,143 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// These are weird things we need to do to get this compiling on
// random systems [subset].
#ifndef SCRIPT_SPAN_PORT_H_
#define SCRIPT_SPAN_PORT_H_
#include <string.h> // for memcpy()
#include "integral_types.h"
namespace chrome_lang_id {
namespace CLD2 {
// Portable handling of unaligned loads, stores, and copies.
// On some platforms, like ARM, the copy functions can be more efficient
// than a load and a store.
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
// x86 and x86-64 can perform unaligned loads/stores directly;
// modern PowerPC hardware can also do unaligned integer loads and stores;
// but note: the FPU still sends unaligned loads and stores to a trap handler!
// Each macro reinterprets the pointer _p as a pointer to the sized integer
// type and dereferences it in place; the STORE forms assign _val through it.
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
#elif defined(__arm__) && \
!defined(__ARM_ARCH_5__) && \
!defined(__ARM_ARCH_5T__) && \
!defined(__ARM_ARCH_5TE__) && \
!defined(__ARM_ARCH_5TEJ__) && \
!defined(__ARM_ARCH_6__) && \
!defined(__ARM_ARCH_6J__) && \
!defined(__ARM_ARCH_6K__) && \
!defined(__ARM_ARCH_6Z__) && \
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__) && \
!defined(__ARM_ARCH_7__) && \
!defined(__ARM_ARCH_7A__) && \
!defined(__ARM_ARCH_7M__) && \
!defined(__ARM_ARCH_7R__) && \
!defined(__ARM_ARCH_8__) && \
!defined(__ARM_ARCH_8A__)
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
// do an unaligned read and rotate the words around a bit, or do the reads very
// slowly (trip through kernel mode). There's no simple #define that says just
// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED #define,
// so in time, maybe we can move on to that.
//
// Note that even if a chipset supports unaligned access, it might not be
// enabled in any given system, e.g.:
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
// Therefore, it's generally just not safe to allow unaligned access on any ARM
// variant.
//
// This is a mess, but there's not much we can do about it.
// 16- and 32-bit accesses go straight through a reinterpreted pointer in
// this branch; the 64-bit forms are the memcpy-based functions that follow.
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
// at least for copies.
// Reads a uint64 from p by copying its bytes, so p need not be aligned.
inline uint64 UNALIGNED_LOAD64(const void *p) {
  uint64 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
}
#else
#define NEED_ALIGNED_LOADS
// These functions are provided for architectures that don't support
// unaligned loads and stores.
// Reads a uint16 from p by copying its bytes, so p need not be aligned.
inline uint16 UNALIGNED_LOAD16(const void *p) {
  uint16 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
// Reads a uint32 from p by copying its bytes, so p need not be aligned.
inline uint32 UNALIGNED_LOAD32(const void *p) {
  uint32 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
// Reads a uint64 from p by copying its bytes, so p need not be aligned.
inline uint64 UNALIGNED_LOAD64(const void *p) {
  uint64 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
inline void UNALIGNED_STORE16(void *p, uint16 v) {
memcpy(p, &v, sizeof v);
}
inline void UNALIGNED_STORE32(void *p, uint32 v) {
memcpy(p, &v, sizeof v);
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
}
#endif
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_PORT_H_

View File

@@ -0,0 +1,81 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A StringPiece points to part or all of a string, double-quoted string
// literal, or other string-like object. A StringPiece does *not* own the
// string to which it points. A StringPiece is not null-terminated. [subset]
//
#ifndef SCRIPT_SPAN_STRINGPIECE_H_
#define SCRIPT_SPAN_STRINGPIECE_H_
#include <string.h>
#include <string>
namespace chrome_lang_id {
typedef int stringpiece_ssize_type;

// Non-owning view of a run of characters: a pointer plus a length. The
// viewed bytes are not copied and are not required to be NUL-terminated.
class StringPiece {
 public:
  // The single-argument constructors are deliberately non-explicit so that
  // a "const char*" or a "string" can be passed wherever a "StringPiece"
  // is expected.

  // Empty view: NULL pointer, zero length.
  StringPiece() : ptr_(NULL), length_(0) {}

  // View of a C string; a NULL pointer yields length 0.
  StringPiece(const char* str)  // NOLINT(runtime/explicit)
      : ptr_(str),
        length_(str == NULL
                    ? 0
                    : static_cast<stringpiece_ssize_type>(strlen(str))) {}

  // View of the whole of a std::string.
  StringPiece(const std::string& str)  // NOLINT(runtime/explicit)
      : ptr_(str.data()),
        length_(static_cast<stringpiece_ssize_type>(str.size())) {}

  // View of len bytes starting at offset.
  StringPiece(const char* offset, stringpiece_ssize_type len)
      : ptr_(offset), length_(len) {}

  // Drops the first n bytes from the view. n is not range-checked.
  void remove_prefix(stringpiece_ssize_type n) {
    length_ -= n;
    ptr_ += n;
  }

  // Drops the last n bytes from the view. n is not range-checked.
  void remove_suffix(stringpiece_ssize_type n) { length_ -= n; }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated. Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }

  stringpiece_ssize_type size() const { return length_; }
  stringpiece_ssize_type length() const { return length_; }
  bool empty() const { return length_ == 0; }

 private:
  const char* ptr_;                // First byte of the view (may be NULL)
  stringpiece_ssize_type length_;  // Number of bytes in the view
};
class StringPiece;
} // namespace chrome_lang_id
#endif // SCRIPT_SPAN_STRINGPIECE_H_

View File

@@ -0,0 +1,245 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "text_processing.h"
#include <stdio.h>
#include <string.h>
namespace chrome_lang_id {
namespace CLD2 {
namespace {
static const int kMaxSpaceScan = 32;  // Bytes

// Returns the smaller of a and b (b when they are equal).
int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
// Counts the ' ' bytes in src, working through it four bytes at a time.
// Only whole groups of four are examined: the last (src_len % 4) bytes are
// never counted.
int CountSpaces4(const char *src, int src_len) {
  const int limit = src_len & ~3;  // Round down to a multiple of four
  int spaces = 0;
  int idx = 0;
  while (idx < limit) {
    const char *quad = src + idx;
    for (int k = 0; k < 4; ++k) {
      if (quad[k] == ' ') {
        ++spaces;
      }
    }
    idx += 4;
  }
  return spaces;
}
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compresses
// highly all the time when done with a byte-based count.
//
// To allow running prediction across multiple chunks, the caller passes in
// the current 12-bit hash value (*hash, updated on return) and the
// prediction table tbl. The table is indexed by the 12-bit hash, so it needs
// 4096 entries. The caller initialises both to 0.
//
// Returns the number of *bytes* correctly predicted; the count goes up by
// 1..4 for each correctly-predicted character.
//
// NOTE: Overruns src_len by up to three bytes when the last lead byte
// announces more bytes than remain. Not a problem with valid UTF-8 text.
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) {
  typedef unsigned char uint8;
  int p_count = 0;
  const uint8 *src = reinterpret_cast<const uint8 *>(isrc);
  const uint8 *srclimit = src + src_len;
  int local_hash = *hash;
  while (src < srclimit) {
    // The character is assembled in an unsigned value: a four-byte lead
    // (>= 0xf0) shifted left by 24 does not fit in a signed int.
    unsigned int code = src[0];
    int incr = 1;
    // Pick up one char and length
    if (code < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((code & 0xe0) == 0xc0) {
      // Two-byte
      code = (code << 8) | src[1];
      incr = 2;
    } else if ((code & 0xf0) == 0xe0) {
      // Three-byte
      code = (code << 16) | (static_cast<unsigned int>(src[1]) << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      code = (code << 24) | (static_cast<unsigned int>(src[1]) << 16) |
             (static_cast<unsigned int>(src[2]) << 8) | src[3];
      incr = 4;
    }
    src += incr;
    // Same bit pattern the table has always stored, now as an int.
    const int c = static_cast<int>(code);
    int p = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;      // Update prediction
    if (c == p) {
      p_count += incr;        // Count bytes of good predictions
    }
    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
// Backscan to a word boundary. Looks at most min(limit, kMaxSpaceScan)
// bytes back and returns the smallest n for which src[-n - 1] is a space,
// i.e. how far to go back so that src - n sits just after a space.
// If no space is found in that span, returns instead the smallest n for
// which src[-n] is not a UTF-8 continuation byte (a clean character start),
// or 0 if there is none in the span either.
int BackscanToSpace(const char *src, int limit) {
  const int span = minint(limit, kMaxSpaceScan);
  for (int back = 0; back < span; ++back) {
    if (src[-back - 1] == ' ') {
      return back;  // We are at _X
    }
  }
  for (int back = 0; back < span; ++back) {
    if ((src[-back] & 0xc0) != 0x80) {
      return back;  // We are at char begin
    }
  }
  return 0;
}
// Forwardscan to a word boundary. Looks at most min(limit, kMaxSpaceScan)
// bytes ahead and returns n + 1 for the smallest n with src[n] a space,
// i.e. how far to go forward so that src + n sits just after a space.
// If no space is found in that span, returns instead the smallest n for
// which src[n] is not a UTF-8 continuation byte (a clean character start),
// or 0 if there is none in the span either.
int ForwardscanToSpace(const char *src, int limit) {
  const int span = minint(limit, kMaxSpaceScan);
  for (int ahead = 0; ahead < span; ++ahead) {
    if (src[ahead] == ' ') {
      return ahead + 1;  // We are at _X
    }
  }
  for (int ahead = 0; ahead < span; ++ahead) {
    if ((src[ahead] & 0xc0) != 0x80) {
      return ahead;  // We are at char begin
    }
  }
  return 0;
}
} // namespace
// Must be exactly 4096 for cheap compressor (its hash is 12 bits wide).
static const int kPredictionTableSize = 4096;
static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 30;   // Squeeze if >=30% spaces
static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted

// Remove portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in-place to the front of
// the input buffer.
//
// Squeezing looks at the density of space/predicted chars in fixed-size
// chunks, specified by ichunksize. A chunksize <= 0 uses the default size of
// 48 bytes.
//
// Returns the new, possibly-shorter length.
//
// Result buffer ALWAYS has leading space and trailing space space space NUL,
// if input does. The chunk-boundary scan below relies on that trailing
// space to stop.
//
int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) {
  char *src = isrc;
  char *dst = src;
  char *srclimit = src + src_len;
  bool skipping = false;
  int hash = 0;
  // Allocate local prediction table.
  int *predict_tbl = new int[kPredictionTableSize];
  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
  // A non-positive chunk size selects the default. A negative value must not
  // reach the loop below: it would make len negative and move src backwards.
  int chunksize = ichunksize;
  if (chunksize <= 0) {
    chunksize = kChunksizeDefault;
  }
  int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
  int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
  while (src < srclimit) {
    int remaining_bytes = static_cast<int>(srclimit - src);
    int len = minint(chunksize, remaining_bytes);
    // Make len land us on a UTF-8 character boundary.
    // Ah. Also fixes mispredict because we could get out of phase
    // Loop always terminates at trailing space in buffer
    while ((src[len] & 0xc0) == 0x80) {
      ++len;
    }  // Move past continuation bytes
    int space_n = CountSpaces4(src, len);
    int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
      // Skip the text
      if (!skipping) {
        // Keeping-to-skipping transition; do it at a space
        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
        dst -= n;
        if (dst == isrc) {
          // Force a leading space if the first chunk is deleted
          *dst++ = ' ';
        }
        skipping = true;
      }
    } else {
      // Keep the text
      if (skipping) {
        // Skipping-to-keeping transition; do it at a space
        int n = ForwardscanToSpace(src, len);
        src += n;
        len -= n;
        skipping = false;
      }
      // "len" can be negative in some cases
      if (len > 0) {
        memmove(dst, src, len);
        dst += len;
      }
    }
    src += len;
  }
  if ((dst - isrc) < (src_len - 3)) {
    // Pad and make last char clean UTF-8 by putting following spaces
    dst[0] = ' ';
    dst[1] = ' ';
    dst[2] = ' ';
    dst[3] = '\0';
  } else if ((dst - isrc) < src_len) {
    // Make last char clean UTF-8 by putting following space off the end
    dst[0] = ' ';
  }
  // Deallocate local prediction table
  delete[] predict_tbl;
  return static_cast<int>(dst - isrc);
}
} // namespace CLD2
} // namespace chrome_lang_id

View File

@@ -0,0 +1,30 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SCRIPT_SPAN_TEXT_PROCESSING_H_
#define SCRIPT_SPAN_TEXT_PROCESSING_H_
namespace chrome_lang_id {
namespace CLD2 {
// Remove portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in-place to the front
// of the input buffer.
// isrc/srclen describe the buffer, which is modified. ichunksize is the
// size in bytes of the chunks whose density is measured; 0 selects the
// implementation's default.
// Return the new, possibly-shorter length
int CheapSqueezeInplace(char *isrc, int srclen, int ichunksize);
} // namespace CLD2
} // namespace chrome_lang_id
#endif // SCRIPT_SPAN_TEXT_PROCESSING_H_

View File

@@ -0,0 +1,486 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by utf8tablebuilder version 2.9
//
// Rejects all codes that are not interchange-valid
// Accepts all other UTF-8 codes 0000..10FFFF
// Exit optimized -- exits after four times in state 0
// All bytes are checked for structurally valid UTF-8
// Table entries are absolute statetable subscripts
#ifndef SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_
#define SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_
#include "integral_types.h"
#include "utf8statetable.h"
namespace chrome_lang_id {
namespace CLD2 {
// Three-character aliases for the kExit* codes, used as entries in the
// state table below so that its rows stay column-aligned. The kExit*
// names are not defined in this file (presumably in utf8statetable.h,
// which is included above -- confirm there).
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
// Entire table has 17 state blocks of 256 entries each
// (17 * 256 = 4352 = TOTAL_SIZE; STATE0_SIZE = 1024 covers 4 such blocks).
// NOTE(review): the remaining parameters are presumably consumed by the
// state-table driver declared in utf8statetable.h -- confirm their meaning
// there before changing any of them; this file is generated.
static const unsigned int utf8acceptinterchange_STATE0 = 0; // state[0]
static const unsigned int utf8acceptinterchange_STATE0_SIZE = 1024; // =[4]
static const unsigned int utf8acceptinterchange_TOTAL_SIZE = 4352;
static const unsigned int utf8acceptinterchange_MAX_EXPAND_X4 = 0;
static const unsigned int utf8acceptinterchange_SHIFT = 8;
static const unsigned int utf8acceptinterchange_BYTES = 1;
static const unsigned int utf8acceptinterchange_LOSUB = 0x20202020;
static const unsigned int utf8acceptinterchange_HIADD = 0x01010101;
// State table for the "accept interchange-valid UTF-8" scanner.
//
// Layout: 17 consecutive blocks of 256 cells, one block per state; within a
// block the cell index is the value of the next input byte (16 cells per
// source row, so row k of a block covers bytes 0x{k}0..0x{k}F). A numeric
// cell names the state block to continue in; X__, RJ_ and D__ are the exit
// shorthands defined above (illegal structure, reject, do-again).
//
// States 0..3 are copies of the first-byte state that differ only in where
// an accepted single byte leads: 0 -> 1 -> 2 -> 3 -> D__. This is the
// "exits after four times in state 0" optimisation named in the file header.
// Accepted single bytes are 0x09, 0x0A, 0x0C, 0x0D and 0x20..0x7E; every
// other byte below 0x80 is RJ_.
//
// Lead-byte dispatch (identical in states 0..3):
//   0x80..0xC1, 0xF5..0xFF -> X__
//   0xC2 -> 7    0xC3..0xDF -> 4
//   0xE0 -> 5    0xE1..0xEC, 0xEE -> 6    0xED -> 8    0xEF -> 10
//   0xF0 -> 13   0xF1..0xF3 -> 15         0xF4 -> 16
static const uint8 utf8acceptinterchange[] = {
// state[0] 0x000000 Byte 1
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 1, 1,RJ_, 1, 1,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[1] 0x000000 Byte 1
// (copy of state[0]; accepted single bytes continue in state 2)
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 2, 2,RJ_, 2, 2,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[2] 0x000000 Byte 1
// (copy of state[0]; accepted single bytes continue in state 3)
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 3, 3,RJ_, 3, 3,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[3] 0x000000 Byte 1
// (copy of state[0]; accepted single bytes exit with D__ instead of
// continuing)
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,D__,D__,RJ_,D__,D__,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[4] 0x0000c0 Byte 2 of 2
// (generic final continuation byte: 0x80..0xBF returns to state 0)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[5] 0x000000 Byte 2 of 3
// (after lead byte 0xE0: only 0xA0..0xBF is allowed, which excludes
// overlong three-byte forms)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[6] 0x001000 Byte 2 of 3
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[7] 0x000080 Byte 2 of 2
// (after lead byte 0xC2: 0x80..0x9F, i.e. U+0080..U+009F, is rejected;
// 0xA0..0xBF is accepted)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[8] 0x00d000 Byte 2 of 3
// (after lead byte 0xED: 0xA0..0xBF leads to state 9, the surrogate range)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[9] 0x00d800 Byte 3 of 3
// (U+D800..U+DFFF: every continuation byte is rejected)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[10] 0x00f000 Byte 2 of 3
// (after lead byte 0xEF: 0xB7 leads to state 11 and 0xBF to state 12,
// the two blocks that contain rejected code points)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 11, 4, 4, 4, 4, 4, 4, 4, 12,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[11] 0x00fdc0 Byte 3 of 3
// (U+FDC0..U+FDFF: 0x90..0xAF, i.e. U+FDD0..U+FDEF, is rejected)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[12] 0x00ffc0 Byte 3 of 3
// (last continuation byte of a ..FFC0-..FFFF group: 0xBE and 0xBF, i.e. the
// code points ending in FFFE/FFFF, are rejected; also reached from state 14)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,RJ_,RJ_,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[13] 0x000000 Byte 2 of 4
// (after lead byte 0xF0: only 0x90..0xBF is allowed, which excludes
// overlong four-byte forms; a byte ending in 0xF leads to state 14)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[14] 0x01f000 Byte 3 of 4
// (third byte of a four-byte sequence whose second byte ended in 0xF:
// 0xBF leads to state 12, everything else to the generic state 4)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[15] 0x040000 Byte 2 of 4
// (after lead bytes 0xF1..0xF3: any continuation byte 0x80..0xBF)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[16] 0x100000 Byte 2 of 4
// (after lead byte 0xF4: only 0x80..0x8F is allowed, which caps the range
// at U+10FFFF)
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
};
// Remap base table; each entry is a (del, add, string_offset) triple.
// The state table above contains no replace exits, so the only entry here
// is all zeros.
static const RemapEntry utf8acceptinterchange_remap_base[] = {{0, 0, 0}};
// Remap string pool: a single zero byte, since this table replaces nothing.
static const unsigned char utf8acceptinterchange_remap_string[] = {0};
// One flag per byte value (index == byte). The flag is 0 for exactly those
// bytes that state[0] of the table above accepts as a complete one-byte
// character (0x09, 0x0A, 0x0C, 0x0D and 0x20..0x7E) and 1 for every other
// byte. NOTE(review): presumably 0 marks bytes the scanner may skip without
// consulting the state table -- confirm against the scanner implementation.
static const unsigned char utf8acceptinterchange_fast[256] = {
    1, 1, 1, 1, 1, 1, 1, 1,  1, 0, 0, 1, 0, 0, 1, 1,  // 0x00..0x0F
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0x10..0x1F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  // 0x20..0x2F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  // 0x30..0x3F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  // 0x40..0x4F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  // 0x50..0x5F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  // 0x60..0x6F
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 1,  // 0x70..0x7F
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0x80..0x8F
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0x90..0x9F
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0..0xAF
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0..0xBF
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xC0..0xCF
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xD0..0xDF
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xE0..0xEF
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  // 0xF0..0xFF
};
// Scanner descriptor that bundles the constants and tables defined above.
// This is a positional aggregate initialiser: the twelve values must stay in
// the member order of UTF8ScanObj. That type is declared outside this file
// (presumably in utf8statetable.h) -- verify its layout before reordering or
// inserting anything here.
static const UTF8ScanObj utf8acceptinterchange_obj = {
utf8acceptinterchange_STATE0,
utf8acceptinterchange_STATE0_SIZE,
utf8acceptinterchange_TOTAL_SIZE,
utf8acceptinterchange_MAX_EXPAND_X4,
utf8acceptinterchange_SHIFT,
utf8acceptinterchange_BYTES,
utf8acceptinterchange_LOSUB,
utf8acceptinterchange_HIADD,
utf8acceptinterchange,
utf8acceptinterchange_remap_base,
utf8acceptinterchange_remap_string,
utf8acceptinterchange_fast
};
// Undefine every table shorthand defined at the top of this header, so the
// short macro names are no longer defined for code that follows this header
// in a translation unit.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
// Table has 4608 bytes, Hash = 505C-3D29
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,758 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by utf8tablebuilder version 2.9
//
// Replaces all codes from file:
// lettermarklower_6.2.0.txt
// Accepts all other UTF-8 codes 0000..10FFFF
// Space optimized
//
// ** ASSUMES INPUT IS STRUCTURALLY VALID UTF-8 **
//
// Table entries are absolute statetable subscripts
#ifndef SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_
#define SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_
#include "integral_types.h"
#include "utf8statetable.h"
namespace chrome_lang_id {
namespace CLD2 {
// Three-character shorthands for the kExit* exit codes, so that every row of
// the state table below can be written as short, uniform cells. The kExit*
// constants are not declared in this file (presumably they come from
// utf8statetable.h -- verify).
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
// Entire table has 111 state blocks of 64 entries each
// (111 * 64 = 7104 = TOTAL_SIZE; 64 = 1 << SHIFT).
// Block number of the initial state.
static const unsigned int utf8repl_lettermarklower_STATE0 = 0; // state[0]
// 5 blocks * 64 entries.
static const unsigned int utf8repl_lettermarklower_STATE0_SIZE = 320; // =[5]
// Number of entries in the whole state table (111 blocks * 64).
static const unsigned int utf8repl_lettermarklower_TOTAL_SIZE = 7104;
// NOTE(review): exact unit of this field is defined by the consumer of the
// table -- confirm before changing.
static const unsigned int utf8repl_lettermarklower_MAX_EXPAND_X4 = 12;
// log2 of the entries per state block (1 << 6 == 64).
static const unsigned int utf8repl_lettermarklower_SHIFT = 6;
// NOTE(review): presumably bytes per table entry (the table is uint8) -- confirm.
static const unsigned int utf8repl_lettermarklower_BYTES = 1;
// NOTE(review): the same byte repeated four times. Presumably used by a
// word-at-a-time fast path: per byte, subtracting 0x5b underflows for values
// below 0x5b ('['), the range that contains the ASCII capitals replaced in
// state[0]; HIADD contributes nothing here. Confirm against the scanner
// implementation.
static const unsigned int utf8repl_lettermarklower_LOSUB = 0x5b5b5b5b;
static const unsigned int utf8repl_lettermarklower_HIADD = 0x00000000;
static const uint8 utf8repl_lettermarklower[] = {
// state[0] 0x000000 Byte 1
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11,S11,S11,S11,S11,S11,
S11,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 6, 11, 13, 16, 19, 22, 25, 28, 6, 6, 6, 31, 33, 36,
39, 42, 44, 46, 48, 51, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 54, 74, 8, 8, 8, 8, 8, 8, 8, 88, 8, 8, 8, 8,100,
104, 9, 9, 9, 10,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[6 + 2] 0x000080 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// state[7 + 2] 0x000000 Byte 2 of 3
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[8 + 2] 0x003000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[9 + 2] 0x040000 Byte 2 of 4
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
// state[10 + 2] 0x100000 Byte 2 of 4
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[11 + 2] 0x0000c0 Byte 2 of 2
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0x00, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[13 + 2] 0x000100 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S21, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,S2_,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x69,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0x00,0xba,0x00,0xbc,0x00,0xbe,0x00,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xc5,
// state[16 + 2] 0x000140 Byte 2 of 2
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S2_,S1_, 0,S1_, 0,S1_, 0, 0,
0x00,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xbf,0xba,0x00,0xbc,0x00,0xbe,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[19 + 2] 0x000180 Byte 2 of 2
0,S2_,S1_, 0,S1_, 0,S2_,S1_, 0,S2_,S2_,S1_, 0, 0,S2_,S2_,
S2_,S1_, 0,S2_,S2_, 0,S2_,S2_, S1_, 0, 0, 0,S2_,S2_, 0,S2_,
S1_, 0,S1_, 0,S1_, 0,S2_,S1_, 0,S2_, 0, 0,S1_, 0,S2_,S1_,
0,S2_,S2_,S1_, 0,S1_, 0,S2_, S1_, 0, 0, 0,S1_, 0, 0, 0,
0x00,0x93,0x83,0x00,0x85,0x00,0x94,0x88, 0x00,0x96,0x97,0x8c,0x00,0x00,0x9d,0x99,
0x9b,0x92,0x00,0xa0,0xa3,0x00,0xa9,0xa8, 0x99,0x00,0x00,0x00,0xaf,0xb2,0x00,0xb5,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0x80,0xa8, 0x00,0x83,0x00,0x00,0xad,0x00,0x88,0xb0,
0x00,0x8a,0x8b,0xb4,0x00,0xb6,0x00,0x92, 0xb9,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
0x00,0xc9,0x00,0x00,0x00,0x00,0xc9,0x00, 0x00,0xc9,0xc9,0x00,0x00,0x00,0xc7,0xc9,
0xc9,0x00,0x00,0xc9,0xc9,0x00,0xc9,0xc9, 0x00,0x00,0x00,0x00,0xc9,0xc9,0x00,0xc9,
0x00,0x00,0x00,0x00,0x00,0x00,0xca,0x00, 0x00,0xca,0x00,0x00,0x00,0x00,0xca,0x00,
0x00,0xca,0xca,0x00,0x00,0x00,0x00,0xca, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[22 + 2] 0x0001c0 Byte 2 of 2
0, 0, 0, 0,S1_,S1_, 0,S1_, S1_, 0,S1_,S1_, 0,S1_, 0,S1_,
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0,S1_,S1_, 0,S1_, 0,S2_,S2_, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x86,0x86,0x00,0x89, 0x89,0x00,0x8c,0x8c,0x00,0x8e,0x00,0x90,
0x00,0x92,0x00,0x94,0x00,0x96,0x00,0x98, 0x00,0x9a,0x00,0x9c,0x00,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0xb3,0xb3,0x00,0xb5,0x00,0x95,0xbf, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xc6,0xc6, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[25 + 2] 0x000200 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S2_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0,T1_,S1_, 0,S2_,T1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0x9e,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xbc,0x00,0x9a,0x01,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xc6,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc6,0x00,0x00,
// state[28 + 2] 0x000240 Byte 2 of 2
0,S1_, 0,S2_,S2_,S2_,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x82,0x00,0x80,0x89,0x8c,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0xc6,0xca,0xca,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[31 + 2] 0x000340 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xb1,0x00,0xb3,0x00,0x00,0x00,0xb7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[33 + 2] 0x000380 Byte 2 of 2
0, 0, 0, 0, 0, 0,S1_, 0, S1_,S1_,S1_, 0,S2_, 0,S2_,S2_,
0,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_, 0,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0xac,0x00, 0xad,0xae,0xaf,0x00,0x8c,0x00,0x8d,0x8e,
0x00,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x00,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0xcf,0x00,0xcf,0xcf,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xcf,0xcf,0x00,0xcf,0xcf,0xcf,0xcf,0xcf, 0xcf,0xcf,0xcf,0xcf,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[36 + 2] 0x0003c0 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0,S2_, 0, 0,S1_, 0,S1_,S1_, 0, 0,S2_,S2_,S2_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0x00,0x00,0xb8,0x00,0x00,0xb8, 0x00,0xb2,0xbb,0x00,0x00,0xbb,0xbc,0xbd,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xce,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xcd,0xcd,0xcd,
// state[39 + 2] 0x000400 Byte 2 of 2
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[42 + 2] 0x000440 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[44 + 2] 0x000480 Byte 2 of 2
S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[46 + 2] 0x0004c0 Byte 2 of 2
S1_,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x8f,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x8a,0x00,0x8c,0x00,0x8e,0x00,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[48 + 2] 0x000500 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5, 0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,
// state[51 + 2] 0x000540 Byte 2 of 2
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[54 + 2] 0x001000 Byte 2 of 3
6, 6, 55, 57, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 59, 59, 61, 59, 64, 66, 68, 71,
// state[55 + 2] 0x001080 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09, 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,
0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19, 0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,
// state[57 + 2] 0x0010c0 Byte 3 of 3
T1_,T1_,T1_,T1_,T1_,T1_, 0,T1_, 0, 0, 0, 0, 0,T1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x22,0x23,0x24,0x25,0x26,0x27,0x00,0x28, 0x00,0x00,0x00,0x00,0x00,0x29,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[59 + 2] 0x001e00 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[61 + 2] 0x001e80 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S32, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc3,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[64 + 2] 0x001f00 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
// state[66 + 2] 0x001f40 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x91,0x00,0x93,0x00,0x95,0x00,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[68 + 2] 0x001f80 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb0,0xb1,0xb3,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
// state[71 + 2] 0x001fc0 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb2,0xb3,0xb4,0xb5,0x83,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0xb6,0xb7,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xba,0xbb,0xa5,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb8,0xb9,0xbc,0xbd,0xb3,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
// state[74 + 2] 0x002000 Byte 2 of 3
6, 6, 6, 6, 75, 6, 78, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
80, 83, 59, 86, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[75 + 2] 0x002100 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,S32, 0, 0, 0,S31,S32, 0, 0, 0, 0,
0, 0,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x89,0x00, 0x00,0x00,0x6b,0xa5,0x00,0x00,0x00,0x00,
0x00,0x00,0x8e,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x00, 0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x00,
0x00,0x00,0x85,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[78 + 2] 0x002180 Byte 3 of 3
0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[80 + 2] 0x002c00 Byte 3 of 3
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[83 + 2] 0x002c40 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S32,T1_,S32, 0, 0,S1_, 0,S1_, 0,S1_, 0,S32,S32,S32,
S32, 0,S1_, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S32,S32,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xab,0x2a,0xbd,0x00,0x00,0xa8, 0x00,0xaa,0x00,0xac,0x00,0x91,0xb1,0x90,
0x92,0x00,0xb3,0x00,0x00,0xb6,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xbf,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0xc9,0x00,0xc9,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0xc9,0xc9,
0xc9,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc8,0xc9,
// state[86 + 2] 0x002cc0 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0, 0,
0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xac,0x00,0xae,0x00,0x00,
0x00,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[88 + 2] 0x00a000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 89, 91, 6, 93, 95, 97, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[89 + 2] 0x00a640 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[91 + 2] 0x00a680 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[93 + 2] 0x00a700 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[95 + 2] 0x00a740 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,T1_,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0xba,0x00,0xbc,0x00,0x2b,0xbf,0x00,
// state[97 + 2] 0x00a780 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0,S1_, 0,S32, 0, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x00,0x00,0x00,0x8c,0x00,0xa5,0x00,0x00,
0x91,0x00,0x93,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xa6,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xc9,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[100 + 2] 0x00f000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,101, 6, 6, 6,
// state[101 + 2] 0x00ff00 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,
0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,0x00,
// state[104 + 2] 0x000000 Byte 2 of 4
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
105, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
// state[105 + 2] 0x010000 Byte 3 of 4
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
106, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[106 + 2] 0x010400 Byte 4 of 4
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91,
0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};
// Remap base[44] = (del, add, string_offset)
// Each element is a RemapEntry {delete_bytes, add_bytes, bytes_offset}: the
// number of input bytes removed, the number of replacement bytes written, and
// the offset of those replacement bytes inside
// utf8repl_lettermarklower_remap_string.
//   - Entries 0..1 swap a 2-byte input sequence for a 3-byte replacement.
//   - Entries 2..43 swap a 3-byte input sequence for a 3-byte replacement.
// Offsets step by 3 because every replacement string is exactly 3 bytes long.
// The final {0,0,0} follows the 44 real entries; presumably an end marker --
// TODO confirm against the code that indexes this array.
static const RemapEntry utf8repl_lettermarklower_remap_base[] = {
{2,3, 0}, {2,3, 3}, {3,3, 6}, {3,3, 9},
{3,3, 12}, {3,3, 15}, {3,3, 18}, {3,3, 21},
{3,3, 24}, {3,3, 27}, {3,3, 30}, {3,3, 33},
{3,3, 36}, {3,3, 39}, {3,3, 42}, {3,3, 45},
{3,3, 48}, {3,3, 51}, {3,3, 54}, {3,3, 57},
{3,3, 60}, {3,3, 63}, {3,3, 66}, {3,3, 69},
{3,3, 72}, {3,3, 75}, {3,3, 78}, {3,3, 81},
{3,3, 84}, {3,3, 87}, {3,3, 90}, {3,3, 93},
{3,3, 96}, {3,3, 99}, {3,3, 102}, {3,3, 105},
{3,3, 108}, {3,3, 111}, {3,3, 114}, {3,3, 117},
{3,3, 120}, {3,3, 123}, {3,3, 126}, {3,3, 129},
{0,0,0} };
// Remap string[132]
// Back-to-back replacement byte sequences addressed by RemapEntry::bytes_offset
// in utf8repl_lettermarklower_remap_base. The 132 payload bytes are 44
// three-byte UTF-8 sequences, in order:
//   e2 b1 a5, e2 b1 a6            (U+2C65, U+2C66)
//   e2 b4 80 .. e2 b4 a5          (U+2D00 .. U+2D25, 38 sequences)
//   e2 b4 a7, e2 b4 ad            (U+2D27, U+2D2D)
//   e1 b5 bd, e1 b5 b9            (U+1D7D, U+1D79)
// A single 0 byte follows the payload.
static const unsigned char utf8repl_lettermarklower_remap_string[] = {
0xe2,0xb1,0xa5,0xe2,0xb1,0xa6,0xe2,0xb4, 0x80,0xe2,0xb4,0x81,0xe2,0xb4,0x82,0xe2,
0xb4,0x83,0xe2,0xb4,0x84,0xe2,0xb4,0x85, 0xe2,0xb4,0x86,0xe2,0xb4,0x87,0xe2,0xb4,
0x88,0xe2,0xb4,0x89,0xe2,0xb4,0x8a,0xe2, 0xb4,0x8b,0xe2,0xb4,0x8c,0xe2,0xb4,0x8d,
0xe2,0xb4,0x8e,0xe2,0xb4,0x8f,0xe2,0xb4, 0x90,0xe2,0xb4,0x91,0xe2,0xb4,0x92,0xe2,
0xb4,0x93,0xe2,0xb4,0x94,0xe2,0xb4,0x95, 0xe2,0xb4,0x96,0xe2,0xb4,0x97,0xe2,0xb4,
0x98,0xe2,0xb4,0x99,0xe2,0xb4,0x9a,0xe2, 0xb4,0x9b,0xe2,0xb4,0x9c,0xe2,0xb4,0x9d,
0xe2,0xb4,0x9e,0xe2,0xb4,0x9f,0xe2,0xb4, 0xa0,0xe2,0xb4,0xa1,0xe2,0xb4,0xa2,0xe2,
0xb4,0xa3,0xe2,0xb4,0xa4,0xe2,0xb4,0xa5, 0xe2,0xb4,0xa7,0xe2,0xb4,0xad,0xe1,0xb5,
0xbd,0xe1,0xb5,0xb9,0 };
// Per-byte flag table with one entry for each possible byte value 0x00..0xff
// (16 entries per row). An entry is 1 for 0x41..0x5a (ASCII 'A'..'Z') and for
// every byte 0x80..0xff; all other entries are 0.
// NOTE(review): presumably a 0 marks a byte the replacer can pass over without
// consulting the state table -- confirm in the code that reads fast_state.
static const unsigned char utf8repl_lettermarklower_fast[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
};
// Bundles the lettermarklower tables into one UTF8ReplaceObj (an alias of
// UTF8StateMachineObj). The initializer is positional, so the order below must
// match the field order of that struct; the trailing comments name the field
// each value lands in.
static const UTF8ReplaceObj utf8repl_lettermarklower_obj = {
utf8repl_lettermarklower_STATE0,  // state0
utf8repl_lettermarklower_STATE0_SIZE,  // state0_size
utf8repl_lettermarklower_TOTAL_SIZE,  // total_size
utf8repl_lettermarklower_MAX_EXPAND_X4,  // max_expand
utf8repl_lettermarklower_SHIFT,  // entry_shift
utf8repl_lettermarklower_BYTES,  // bytes_per_entry
utf8repl_lettermarklower_LOSUB,  // losub
utf8repl_lettermarklower_HIADD,  // hiadd
utf8repl_lettermarklower,  // state_table
utf8repl_lettermarklower_remap_base,  // remap_base
utf8repl_lettermarklower_remap_string,  // remap_string
utf8repl_lettermarklower_fast  // fast_state
};
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
// Table has 7668 bytes, Hash = 07A2-C4E3
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,285 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// State Table follower for scanning UTF-8 strings without converting to
// 32- or 16-bit Unicode values.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
#define SCRIPT_SPAN_UTF8STATETABLE_H_
#include <string>
#include "integral_types.h" // for uint8, uint32, uint16
#include "stringpiece.h"
namespace chrome_lang_id {
namespace CLD2 {
class OffsetMap;
// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
// Generated tables initialize this struct positionally ({del, add, offset}),
// so the member order must not change.
struct RemapEntry {
uint8 delete_bytes;  // Number of input bytes removed by the replacement.
uint8 add_bytes;  // Number of replacement bytes added.
uint16 bytes_offset;  // Start of the replacement bytes within remap_string.
};
// Exit type codes for state tables. All but the first get stuffed into
// signed one-byte entries. The first is only generated by executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
// NOTE(review): UTF8StateMachineObj declares state_table as uint8, so the word
// "signed" above looks historical -- confirm against integral_types.h.
// The enumerators are consecutive, running from 239 through 255.
typedef enum {
kExitDstSpaceFull = 239,
kExitIllegalStructure, // 240
kExitOK, // 241
kExitReject, // 242
kExitReplace1, // 243
kExitReplace2, // 244
kExitReplace3, // 245
kExitReplace21, // 246
kExitReplace31, // 247
kExitReplace32, // 248
kExitReplaceOffset1, // 249
kExitReplaceOffset2, // 250
kExitReplace1S0, // 251
kExitSpecial, // 252
kExitDoAgain, // 253
kExitRejectAlt, // 254
kExitNone // 255
} ExitReason;
// Exit codes with the same names and order as ExitReason, suffixed _2 and
// numbered consecutively from 32767 (0x7fff) through 32783 (0x800f).
// Presumably the counterpart used with the two-byte-entry tables
// (UTF8StateMachineObj_2) -- confirm in the implementation.
typedef enum {
kExitDstSpaceFull_2 = 32767, // 0x7fff
kExitIllegalStructure_2, // 32768 0x8000
kExitOK_2, // 32769 0x8001
kExitReject_2, // 32770 0x8002
kExitReplace1_2, // 32771 0x8003
kExitReplace2_2, // 32772 0x8004
kExitReplace3_2, // 32773 0x8005
kExitReplace21_2, // 32774 0x8006
kExitReplace31_2, // 32775 0x8007
kExitReplace32_2, // 32776 0x8008
kExitReplaceOffset1_2, // 32777 0x8009
kExitReplaceOffset2_2, // 32778 0x800a
kExitReplace1S0_2, // 32779 0x800b
kExitSpecial_2, // 32780 0x800c
kExitDoAgain_2, // 32781 0x800d
kExitRejectAlt_2, // 32782 0x800e
kExitNone_2 // 32783 0x800f
} ExitReason_2;
// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
// Generated table headers fill this struct with a positional initializer, so
// the member order below must stay exactly as it is.
typedef struct {
const uint32 state0;  // Byte offset of the initial state within state_table.
const uint32 state0_size;  // Length in bytes of the initial state.
const uint32 total_size;  // NOTE(review): presumably size of state_table -- confirm.
const int max_expand;  // NOTE(review): meaning not documented in this header.
const int entry_shift;  // 8 (full-byte subscript) or 6 (six-bit subscript).
const int bytes_per_entry;  // NOTE(review): presumably 1 for this struct -- confirm.
const uint32 losub;  // NOTE(review): meaning not documented in this header.
const uint32 hiadd;  // NOTE(review): meaning not documented in this header.
const uint8* state_table;  // State table with one-byte entries.
const RemapEntry* remap_base;  // Replacement descriptors (see RemapEntry).
const uint8* remap_string;  // Replacement bytes addressed by remap_base.
const uint8* fast_state;  // Per-byte table; usage not documented in this header.
} UTF8StateMachineObj;
// Near-duplicate declaration for tables with two-byte entries
// The members match UTF8StateMachineObj one for one; the only difference is
// that state_table points at unsigned short entries instead of uint8 entries.
// The member order is part of the contract for positional initializers.
typedef struct {
const uint32 state0;
const uint32 state0_size;
const uint32 total_size;
const int max_expand;
const int entry_shift;
const int bytes_per_entry;
const uint32 losub;
const uint32 hiadd;
const unsigned short* state_table;  // Two-byte entries (the one difference).
const RemapEntry* remap_base;
const uint8* remap_string;
const uint8* fast_state;
} UTF8StateMachineObj_2;
// Role-specific aliases. All three one-byte aliases name the same struct, as do
// both two-byte aliases; the distinct names only document which API a table is
// meant to be passed to.
typedef UTF8StateMachineObj UTF8PropObj;
typedef UTF8StateMachineObj UTF8ScanObj;
typedef UTF8StateMachineObj UTF8ReplaceObj;
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
//   st:     property table to consult.
//   src:    in/out pointer to the current input position; moved past the
//           character that was looked up.
//   srclen: in/out count of remaining input bytes (presumably decremented by
//           the same amount src advances -- confirm in the implementation).
uint8 UTF8GenericProperty(const UTF8PropObj* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
// Takes no length and does not advance: the caller must guarantee that src
// points at a complete, well-formed character.
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
// NOTE(review): presumably the assumed-valid counterpart of
// UTF8GenericPropertyBigOneByte specifically -- confirm in the implementation.
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
// TwoByte versions are needed for tables > 240 states that don't fit onto
// BigOneByte -- rare ultimate fallback
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
// NOTE(review): presumably the assumed-valid counterpart of
// UTF8GenericPropertyTwoByte specifically -- confirm in the implementation.
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
// Scan a UTF-8 stringpiece based on a state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
//   st:             scan table to follow.
//   str:            input text.
//   bytes_consumed: out; receives the number of bytes scanned.
// The int result is the exit reason (presumably an ExitReason value -- confirm
// in the implementation).
int UTF8GenericScan(const UTF8ScanObj* st,
const StringPiece& str,
int* bytes_consumed);
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
// and doing text replacements.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
//   st:             replacement table to follow.
//   istr:           input text.
//   ostr:           output area that receives the copied/replaced text.
//   is_plain_text:  not described in this header; see the implementation.
//   bytes_consumed: out; number of input bytes consumed.
//   bytes_filled:   out; number of output bytes written.
//   chars_changed:  out; presumably the count of replaced characters -- confirm.
//   offsetmap:      optional out; may be NULL.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed,
OffsetMap* offsetmap);
// Older version without offsetmap
// Same parameters as the overload above, minus offsetmap.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// Older version without is_plain_text or offsetmap
// NOTE(review): the is_plain_text value this overload implies is not visible
// in this header.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// TwoByte version is needed for tables > about 256 states, such
// as the table for full Unicode 4.1 canonical + compatibility mapping
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
// copying to output stringpiece
// and doing text replacements.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
// The parameter list mirrors UTF8GenericReplace; only the table type differs
// (UTF8ReplaceObj_2, whose state_table holds two-byte entries).
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed,
OffsetMap* offsetmap);
// Older version without offsetmap
// Same parameters as the overload above, minus offsetmap.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// Older version without is_plain_text or offsetmap
// NOTE(review): the is_plain_text value this overload implies is not visible
// in this header.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
static const unsigned char kUTF8LenTbl[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};
inline int UTF8OneCharLen(const char* in) {
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
}
// Adjusts the stringpiece |*istr| so that it encompasses only complete UTF-8
// characters.
// The data pointer is advanced by 0..3 bytes to reach a character boundary,
// and the length is then reduced by 0..3 bytes so that the piece ends with
// the last complete character.
// This is especially useful when a UTF-8 string must be put cleanly into a
// buffer with a fixed maximum size, such as a MySQL buffer.
void UTF8TrimToChars(StringPiece* istr);
} // End namespace CLD2
} // End namespace chrome_lang_id
#endif // SCRIPT_SPAN_UTF8STATETABLE_H_

View File

@@ -0,0 +1,77 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Protocol buffer specification for sentence analysis.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package chrome_lang_id;
// A Sentence contains the raw text contents of a sentence, as well as its
// analysis (currently the tokenization).
message Sentence {
// Identifier for the sentence.
optional string id = 1;
// Raw text contents of the sentence.
optional string text = 2;
// Tokenization of the sentence.
repeated Token token = 3;
// Field numbers from 1000 upwards are reserved for extensions.
extensions 1000 to max;
}
// A sentence token marks a span of bytes in the sentence text as a token
// or word.
message Token {
// Token word form.
required string word = 1;
// Start position of the token in the text.
required int32 start = 2;
// End position of the token in the text. This is the index of the last byte,
// not one past the last byte. If the token came from the lexer, any trailing
// HTML tags are excluded.
required int32 end = 3;
// Head of this token in the dependency tree: the id of the token which has an
// arc going to this one. If this is the root token of a sentence, the head is
// set to -1 (which is also the default).
optional int32 head = 4 [default = -1];
// Part-of-speech tag for the token.
optional string tag = 5;
// Coarse-grained word category for the token.
optional string category = 6;
// Label for the dependency relation between this token and its head.
optional string label = 7;
// Break level: indicates how a token was separated from the previous token
// in the text.
enum BreakLevel {
NO_BREAK = 0; // No separation between tokens.
SPACE_BREAK = 1; // Tokens separated by space.
LINE_BREAK = 2; // Tokens separated by line break.
SENTENCE_BREAK = 3; // Tokens separated by sentence break.
}
// Defaults to SPACE_BREAK when not set.
optional BreakLevel break_level = 8 [default = SPACE_BREAK];
// Field numbers from 1000 upwards are reserved for extensions.
extensions 1000 to max;
}

View File

@@ -0,0 +1,29 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "sentence_features.h"
#include "registry.h"
namespace chrome_lang_id {
// Defines the registry pointer for the whole-Sentence feature functions.
// NOTE: it starts out as nullptr, i.e. it does not point at a usable registry
// yet. Per the original authors, it is set to a meaningful value in the
// NNetLanguageIdentifier constructor, *before* any feature is used.
// The "= nullptr" initializer is what makes this explicit specialization a
// definition rather than a declaration; do not drop it.
template <>
WholeSentenceFeature::Registry*
RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
} // namespace chrome_lang_id

View File

@@ -0,0 +1,45 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Features that operate on Sentence objects. Most features are defined
// in this header so they may be re-used via composition into other more
// advanced feature classes.
#ifndef SENTENCE_FEATURES_H_
#define SENTENCE_FEATURES_H_
#include "feature_extractor.h"
#include "cld_3/protos/sentence.pb.h"
namespace chrome_lang_id {
// Feature function type that extracts features from a full Sentence, and the
// matching feature extractor type.
using WholeSentenceFeature = FeatureFunction<Sentence>;
using WholeSentenceExtractor = FeatureExtractor<Sentence>;
// Declares (without defining) the registry for the whole-Sentence feature
// functions. The declaration is required for clang's
// -Wundefined-var-template. However, MSVC has a bug which treats this
// declaration as a definition, leading to multiple-definition errors, so it is
// omitted when COMPILER_MSVC is defined.
#if !defined(COMPILER_MSVC)
template <>
WholeSentenceFeature::Registry
*RegisterableClass<WholeSentenceFeature>::registry_;
#endif
} // namespace chrome_lang_id
#endif // SENTENCE_FEATURES_H_

View File

@@ -0,0 +1,72 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SIMPLE_ADDER_H_
#define SIMPLE_ADDER_H_
#include "base.h"
namespace chrome_lang_id {
// Accumulates float arrays, optionally scaled, into a caller-provided
// destination buffer. This is the plain scalar implementation: it processes
// one float per "batch".
class SimpleAdder {
 public:
  static constexpr const int kNumFloatsPerBatch = 1;

  // Binds the adder to |dest|; every add touches the first |num_floats|
  // elements of that buffer.
  CLD3_ATTRIBUTE_ALWAYS_INLINE SimpleAdder(float *dest, int num_floats)
      : dest_(dest), num_floats_(num_floats) {}

  CLD3_ATTRIBUTE_ALWAYS_INLINE ~SimpleAdder() {
    // Finalize() must have been called before the adder is destroyed.
    CLD3_DCHECK(dest_ == nullptr);
  }

  // The caller must invoke this before the object is destroyed. It only
  // forgets the destination pointer; the additions have already been applied.
  CLD3_ATTRIBUTE_ALWAYS_INLINE void Finalize() { dest_ = nullptr; }

  // dest += source. Despite the name, the addition is applied immediately.
  CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyAdd(const float *source) const {
    AddImpl(source, num_floats_, dest_);
  }

  // dest += scale * source. Applied immediately as well.
  CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyScaleAdd(const float *source,
                                                 const float scale) const {
    ScaleAddImpl(source, num_floats_, scale, dest_);
  }

  // Element-wise dest[i] += source[i] for the first |size| elements.
  CLD3_ATTRIBUTE_ALWAYS_INLINE static void AddImpl(
      const float *__restrict source, uint32 size, float *__restrict dest) {
    const float *const source_end = source + size;
    for (; source != source_end; ++source, ++dest) {
      *dest += *source;
    }
  }

  // Element-wise dest[i] += source[i] * scale for the first |size| elements.
  CLD3_ATTRIBUTE_ALWAYS_INLINE static void ScaleAddImpl(
      const float *__restrict source, uint32 size, const float scale,
      float *__restrict dest) {
    const float *const source_end = source + size;
    for (; source != source_end; ++source, ++dest) {
      *dest += *source * scale;
    }
  }

 private:
  // Destination buffer (not owned); reset to nullptr by Finalize().
  float *dest_;
  // Number of floats processed by each LazyAdd / LazyScaleAdd call.
  int num_floats_;
};
} // namespace chrome_lang_id
#endif // SIMPLE_ADDER_H_

View File

@@ -0,0 +1,161 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "task_context.h"
#include "utils.h"
namespace chrome_lang_id {
// Nothing to set up or tear down explicitly: the members take care of
// themselves.
TaskContext::TaskContext() = default;
TaskContext::~TaskContext() = default;
// Returns the input descriptor called |name|, creating (and naming) a new one
// at the end of the spec's input list if none exists yet.
TaskInput *TaskContext::GetInput(const string &name) {
  const int num_inputs = spec_.input_size();
  for (int index = 0; index < num_inputs; ++index) {
    if (spec_.input(index).name() != name) continue;
    // First descriptor with a matching name wins.
    return spec_.mutable_input(index);
  }

  // No match: append a fresh descriptor carrying the requested name.
  TaskInput *created = spec_.add_input();
  created->set_name(name);
  return created;
}
// Returns the input descriptor called |name| (created on demand) and makes
// sure that |file_format| and |record_format| are listed on it. Empty format
// strings are ignored; formats that are already listed are not added twice.
TaskInput *TaskContext::GetInput(const string &name, const string &file_format,
                                 const string &record_format) {
  TaskInput *descriptor = GetInput(name);

  if (!file_format.empty()) {
    bool already_listed = false;
    const int num_file_formats = descriptor->file_format_size();
    for (int k = 0; k < num_file_formats && !already_listed; ++k) {
      already_listed = (descriptor->file_format(k) == file_format);
    }
    if (!already_listed) descriptor->add_file_format(file_format);
  }

  if (!record_format.empty()) {
    bool already_listed = false;
    const int num_record_formats = descriptor->record_format_size();
    for (int k = 0; k < num_record_formats && !already_listed; ++k) {
      already_listed = (descriptor->record_format(k) == record_format);
    }
    if (!already_listed) descriptor->add_record_format(record_format);
  }

  return descriptor;
}
void TaskContext::SetParameter(const string &name, const string &value) {
// If the parameter already exists update the value.
for (int i = 0; i < spec_.parameter_size(); ++i) {
if (spec_.parameter(i).name() == name) {
spec_.mutable_parameter(i)->set_value(value);
return;
}
}
// Add new parameter.
TaskSpec::Parameter *param = spec_.add_parameter();
param->set_name(name);
param->set_value(value);
}
// Returns the value of the first parameter called |name| in the task
// specification, or an empty string when there is no such parameter.
string TaskContext::GetParameter(const string &name) const {
  int k = 0;
  while (k < spec_.parameter_size()) {
    if (spec_.parameter(k).name() == name) {
      return spec_.parameter(k).value();
    }
    ++k;
  }
  return "";
}
int TaskContext::GetIntParameter(const string &name) const {
string value = GetParameter(name);
return utils::ParseUsing<int>(value, 0, utils::ParseInt32);
}
bool TaskContext::GetBoolParameter(const string &name) const {
string value = GetParameter(name);
return value == "true";
}
double TaskContext::GetFloatParameter(const string &name) const {
string value = GetParameter(name);
return utils::ParseUsing<double>(value, .0, utils::ParseDouble);
}
string TaskContext::Get(const string &name, const char *defval) const {
// First try to find parameter in task specification.
for (int i = 0; i < spec_.parameter_size(); ++i) {
if (spec_.parameter(i).name() == name) return spec_.parameter(i).value();
}
// Parameter not found, return default value.
return defval;
}
string TaskContext::Get(const string &name, const string &defval) const {
return Get(name, defval.c_str());
}
int TaskContext::Get(const string &name, int defval) const {
string value = Get(name, "");
return utils::ParseUsing<int>(value, defval, utils::ParseInt32);
}
double TaskContext::Get(const string &name, double defval) const {
string value = Get(name, "");
return utils::ParseUsing<double>(value, defval, utils::ParseDouble);
}
bool TaskContext::Get(const string &name, bool defval) const {
string value = Get(name, "");
return value.empty() ? defval : value == "true";
}
// Returns the file pattern of a single-file task input. The input must have
// exactly one part; this is enforced with CLD3_CHECK.
string TaskContext::InputFile(const TaskInput &input) {
  CLD3_CHECK(input.part_size() == 1);
  const auto &only_part = input.part(0);
  return only_part.file_pattern();
}
// Returns true if |input| accepts both |file_format| and |record_format|.
// A format list that is empty places no restriction; a non-empty list must
// contain the requested format.
bool TaskContext::Supports(const TaskInput &input, const string &file_format,
                           const string &record_format) {
  // File format.
  const int num_file_formats = input.file_format_size();
  bool file_ok = (num_file_formats == 0);
  for (int k = 0; k < num_file_formats && !file_ok; ++k) {
    file_ok = (input.file_format(k) == file_format);
  }
  if (!file_ok) return false;

  // Record format.
  const int num_record_formats = input.record_format_size();
  bool record_ok = (num_record_formats == 0);
  for (int k = 0; k < num_record_formats && !record_ok; ++k) {
    record_ok = (input.record_format(k) == record_format);
  }
  return record_ok;
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,81 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TASK_CONTEXT_H_
#define TASK_CONTEXT_H_
#include <string>
#include <vector>
#include "base.h"
#include "cld_3/protos/task_spec.pb.h"
namespace chrome_lang_id {
// A task context holds configuration information for a task. It is basically a
// wrapper around a TaskSpec protocol buffer.
class TaskContext {
public:
TaskContext();
~TaskContext();
// Returns the underlying task specification protocol buffer for the context
// (read-only and mutable access respectively).
const TaskSpec &spec() const { return spec_; }
TaskSpec *mutable_spec() { return &spec_; }
// Returns a named input descriptor for the task. A new input is created if
// the task context does not already have an input with that name. The
// three-argument form additionally registers the given file and record
// formats on the input (empty strings are ignored).
TaskInput *GetInput(const string &name);
TaskInput *GetInput(const string &name, const string &file_format,
const string &record_format);
// Sets task parameter, overwriting an existing value with the same name.
void SetParameter(const string &name, const string &value);
// Returns task parameter. If the parameter is not in the task configuration,
// GetParameter returns an empty string and GetBoolParameter returns false
// (only the exact text "true" yields true). GetIntParameter and
// GetFloatParameter pass a fallback of 0 / 0.0 to utils::ParseUsing
// (presumably returned for an empty value; confirm in utils.h).
// NOTE(review): no command-line flag lookup takes place in these methods.
string GetParameter(const string &name) const;
int GetIntParameter(const string &name) const;
bool GetBoolParameter(const string &name) const;
double GetFloatParameter(const string &name) const;
// Returns task parameter. If the parameter is not in the task configuration
// the default value is returned. Parameters retrieved using these methods
// don't need to be defined with a DEFINE_*() macro.
string Get(const string &name, const string &defval) const;
string Get(const string &name, const char *defval) const;
int Get(const string &name, int defval) const;
double Get(const string &name, double defval) const;
bool Get(const string &name, bool defval) const;
// Returns input file name for a single-file task input (the input must have
// exactly one part).
static string InputFile(const TaskInput &input);
// Returns true if task input supports the file and record format.
static bool Supports(const TaskInput &input, const string &file_format,
const string &record_format);
private:
// Underlying task specification protocol buffer.
TaskSpec spec_;
// Vector of parameters required by this task. These must be specified in the
// task rather than relying on default values.
// NOTE(review): nothing in this class declaration reads or writes this
// member; confirm whether it is still needed.
std::vector<string> required_parameters_;
};
} // namespace chrome_lang_id
#endif // TASK_CONTEXT_H_

View File

@@ -0,0 +1,74 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This file contains the hard-coded parameters from the training workflow. If
// you update the binary model, you may need to update the variables below as
// well.
#include "task_context_params.h"
#include "task_context.h"
namespace chrome_lang_id {
// Copies the hard-coded model parameters (feature spec, embedding names and
// embedding dimensions) into |context|.
void TaskContextParams::ToTaskContext(TaskContext *context) {
  struct NamedParameter {
    const char *key;
    const char *value;
  };
  const NamedParameter kModelParameters[] = {
      {"language_identifier_features", kLanguageIdentifierFeatures},
      {"language_identifier_embedding_names",
       kLanguageIdentifierEmbeddingNames},
      {"language_identifier_embedding_dims", kLanguageIdentifierEmbeddingDims},
  };
  for (const NamedParameter &parameter : kModelParameters) {
    context->SetParameter(parameter.key, parameter.value);
  }
}
// Counts the entries of kLanguageNames that precede its terminating nullptr.
int TaskContextParams::GetNumLanguages() {
  int count = 0;
  for (const char *const *entry = kLanguageNames; *entry != nullptr; ++entry) {
    ++count;
  }
  return count;
}
// Language codes known to the model, terminated by a nullptr sentinel that
// GetNumLanguages() relies on.
// NOTE(review): the position of each name presumably corresponds to an output
// index of the trained model, so entries must not be reordered; confirm
// against the model before editing.
const char *const TaskContextParams::kLanguageNames[] = {
"eo", "co", "eu", "ta", "de", "mt", "ps", "te", "su", "uz", "zh-Latn", "ne",
"nl", "sw", "sq", "hmn", "ja", "no", "mn", "so", "ko", "kk", "sl", "ig",
"mr", "th", "zu", "ml", "hr", "bs", "lo", "sd", "cy", "hy", "uk", "pt",
"lv", "iw", "cs", "vi", "jv", "be", "km", "mk", "tr", "fy", "am", "zh",
"da", "sv", "fi", "ht", "af", "la", "id", "fil", "sm", "ca", "el", "ka",
"sr", "it", "sk", "ru", "ru-Latn", "bg", "ny", "fa", "haw", "gl", "et",
"ms", "gd", "bg-Latn", "ha", "is", "ur", "mi", "hi", "bn", "hi-Latn", "fr",
"yi", "hu", "xh", "my", "tg", "ro", "ar", "lb", "el-Latn", "st", "ceb",
"kn", "az", "si", "ky", "mg", "en", "gu", "es", "pl", "ja-Latn", "ga", "lt",
"sn", "yo", "pa", "ku",
// last element must be nullptr
nullptr,
};
// Feature specification: six ';'-separated feature functions (n-gram bags of
// size 2, 4, 3 and 1 plus two script features). Adjacent string literals are
// concatenated, so the line breaks below carry no meaning.
const char TaskContextParams::kLanguageIdentifierFeatures[] =
"continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
"use_equal_weight=false,id_dim=1000,size=2);continuous-bag-of-ngrams("
"include_terminators=true,include_spaces=false,use_equal_weight=false,id_"
"dim=5000,size=4);continuous-bag-of-relevant-scripts;script;continuous-bag-"
"of-ngrams(include_terminators=true,include_spaces=false,use_equal_weight="
"false,id_dim=5000,size=3);continuous-bag-of-ngrams(include_terminators="
"true,include_spaces=false,use_equal_weight=false,id_dim=100,size=1)";
// Six ';'-separated embedding space names; they appear to line up
// positionally with the six features above (TODO confirm).
const char TaskContextParams::kLanguageIdentifierEmbeddingNames[] =
"bigrams;quadgrams;relevant-scripts;text-script;trigrams;unigrams";
// Six ';'-separated embedding dimensions, in the same order as the names.
const char TaskContextParams::kLanguageIdentifierEmbeddingDims[] =
"16;16;8;8;16;16";
} // namespace chrome_lang_id

View File

@@ -0,0 +1,54 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TASK_CONTEXT_PARAMS_H_
#define TASK_CONTEXT_PARAMS_H_
#include <string>
#include "base.h"
#include "task_context.h"
namespace chrome_lang_id {
// Encapsulates the TaskContext specifying only the parameters for the model.
// The model weights are loaded statically.
class TaskContextParams {
public:
// Gets the name of the i'th language. The index is used as-is, without any
// bounds check.
static const char *language_names(int i) { return kLanguageNames[i]; }
// Saves the parameters to the given TaskContext.
static void ToTaskContext(TaskContext *context);
// Gets the number of languages.
static int GetNumLanguages();
private:
// Names of all the languages.
static const char *const kLanguageNames[];
// Features in FML format.
static const char kLanguageIdentifierFeatures[];
// Names of the embedding spaces.
static const char kLanguageIdentifierEmbeddingNames[];
// Dimensions of the embedding spaces.
static const char kLanguageIdentifierEmbeddingDims[];
};
} // namespace chrome_lang_id
#endif // TASK_CONTEXT_PARAMS_H_

View File

@@ -0,0 +1,98 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// LINT: ALLOW_GROUPS
// Protocol buffer specifications for task configuration.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package chrome_lang_id;
// Task input descriptor.
message TaskInput {
// Name of input resource.
required string name = 1;
// Name of the stage responsible for creating this resource.
optional string creator = 2;
// File formats for the resource.
repeated string file_format = 3;
// Record formats for the resource.
repeated string record_format = 4;
// Is this resource multi-file? Defaults to false.
optional bool multi_file = 5 [default = false];
// An input can consist of multiple file sets; each Part describes one.
repeated group Part = 6 {
// File pattern for file set.
optional string file_pattern = 7;
// File format for file set.
optional string file_format = 8;
// Record format for file set.
optional string record_format = 9;
}
}
// Task output descriptor.
message TaskOutput {
// Name of output resource.
required string name = 1;
// File format for output resource.
optional string file_format = 2;
// Record format for output resource.
optional string record_format = 3;
// Number of shards in the output. If it is different from zero, this output
// is sharded. If the number of shards is set to -1, the output is sharded but
// the number of shards is unknown. The files are then named 'base-*-of-*'.
optional int32 shards = 4 [default = 0];
// Base file name for output resource. If this is not set by the task
// component it is set to a default value by the workflow engine.
optional string file_base = 5;
// Optional extension added to the file name.
optional string file_extension = 6;
}
// A task specification describes the parameters for executing a task.
message TaskSpec {
// Name of task.
optional string task_name = 1;
// Workflow task type.
optional string task_type = 2;
// Task parameters, stored as name/value pairs.
repeated group Parameter = 3 {
required string name = 4;
optional string value = 5;
}
// Task inputs.
repeated TaskInput input = 6;
// Task outputs.
repeated TaskOutput output = 7;
}

View File

@@ -0,0 +1,96 @@
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: Jim Meehan
#include "unicodetext.h"
#include "base.h"
#include "utils.h"
namespace chrome_lang_id {
// *************** Data representation **********
// Note: the copy constructor is undefined.
// Makes this Repr an alias of the caller's buffer: the bytes are neither
// copied nor owned. A buffer that was previously owned is released first.
void UnicodeText::Repr::PointTo(const char *data, int size) {
  if (ours_) {
    delete[] data_;  // delete[] on a null pointer is a no-op.
  }
  ours_ = false;
  data_ = const_cast<char *>(data);
  size_ = size;
  capacity_ = size;
}
// *************** UnicodeText ******************

// An empty text; repr_ initializes itself.
UnicodeText::UnicodeText() = default;

// Turns this text into an alias of |buffer| (|byte_length| bytes) and returns
// *this so that calls can be chained.
UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
  repr_.PointTo(buffer, byte_length);
  return *this;
}

// repr_'s destructor releases the buffer if it is owned.
UnicodeText::~UnicodeText() = default;
// ******************* UnicodeText::const_iterator *********************

// The implementation of const_iterator would be nicer if it
// inherited from boost::iterator_facade
// (http://boost.org/libs/iterator/doc/iterator_facade.html).

// A default-constructed iterator holds a null position.
UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}

// Copies the position of |other|; self-assignment leaves the iterator as is.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
    const const_iterator &other) {
  if (this != &other) {
    it_ = other.it_;
  }
  return *this;
}
// Iterator positioned at the first byte of the text.
UnicodeText::const_iterator UnicodeText::begin() const {
  const char *first = repr_.data_;
  return const_iterator(first);
}

// Iterator positioned one past the last byte of the text.
UnicodeText::const_iterator UnicodeText::end() const {
  const char *past_last = repr_.data_ + repr_.size_;
  return const_iterator(past_last);
}
// Decodes the codepoint at the current position.
// The bytes are decoded by hand instead of through a checking decoder: the
// data is trusted to be valid UTF-8 and this routine is expected to be called
// very often. Continuation bytes are only read when the lead byte calls for
// them, so nothing beyond the current character is touched.
char32 UnicodeText::const_iterator::operator*() const {
  const unsigned char lead = static_cast<unsigned char>(it_[0]);
  if (lead < 0x80) {
    return lead;  // One byte.
  }

  const int cont1 = static_cast<unsigned char>(it_[1]) & 0x3F;
  if (lead < 0xE0) {
    return ((lead & 0x1F) << 6) | cont1;  // Two bytes.
  }

  const int cont2 = static_cast<unsigned char>(it_[2]) & 0x3F;
  if (lead < 0xF0) {
    return ((lead & 0x0F) << 12) | (cont1 << 6) | cont2;  // Three bytes.
  }

  const int cont3 = static_cast<unsigned char>(it_[3]) & 0x3F;
  return ((lead & 0x07) << 18) | (cont1 << 12) | (cont2 << 6) | cont3;
}
// Pre-increment: advances by the length that utils::OneCharLen reports for
// the character at the current position.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
  const auto step = chrome_lang_id::utils::OneCharLen(it_);
  it_ += step;
  return *this;
}
} // namespace chrome_lang_id

View File

@@ -0,0 +1,144 @@
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: Jim Meehan
#ifndef UNICODETEXT_H_
#define UNICODETEXT_H_
#include <iterator>
#include <utility>
#include "base.h"
namespace chrome_lang_id {
// ***************************** UnicodeText **************************
//
// A UnicodeText object is a wrapper around a sequence of Unicode
// codepoint values that allows iteration over these values.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// read-only iterator. It becomes invalid if the text is changed.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab. It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.)
//
// MEMORY MANAGEMENT:
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.
// TODO(abakalov): Consider merging this class with the script detection
// code in the directory script_span.
// Read-only view of a UTF-8 buffer that can be iterated codepoint by
// codepoint.
class UnicodeText {
public:
class const_iterator;
UnicodeText(); // Create an empty text.
~UnicodeText();
// Forward-only, read-only iterator over the codepoints of a UnicodeText. It
// wraps a raw pointer into the underlying UTF-8 bytes.
class const_iterator {
typedef const_iterator CI;
public:
// Iterators are default-constructible.
const_iterator();
// It's safe to make multiple passes over a UnicodeText.
const_iterator(const const_iterator &other);
const_iterator &operator=(const const_iterator &other);
char32 operator*() const; // Dereference
const_iterator &operator++(); // Advance (++iter)
// Two iterators are equal when they point at the same byte.
friend bool operator==(const CI &lhs, const CI &rhs) {
return lhs.it_ == rhs.it_;
}
friend bool operator!=(const CI &lhs, const CI &rhs) {
return !(lhs == rhs);
}
private:
friend class UnicodeText;
// Only UnicodeText can create an iterator from a raw byte position.
explicit const_iterator(const char *it) : it_(it) {}
// Current position within the UTF-8 bytes.
const char *it_;
};
// Iterators to the first codepoint and to one past the last byte.
const_iterator begin() const;
const_iterator end() const;
// x.PointToUTF8(buf,len) changes x so that it points to buf
// ("becomes an alias"). It does not take ownership or copy buf.
// This function assumes that the input is interchange valid UTF8.
UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);
private:
friend class const_iterator;
class Repr { // A byte-string.
public:
char *data_;
int size_;
int capacity_;
bool ours_; // Do we own data_?
// Starts out empty; ours_ is true, but data_ is NULL so there is nothing to
// free yet.
Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
// Frees the buffer only when it is owned.
~Repr() {
if (ours_) delete[] data_;
}
// NOTE(review): these mutators are only declared here; confirm that each
// one has a definition before calling it.
void clear();
void reserve(int capacity);
void resize(int size);
void append(const char *bytes, int byte_length);
void Copy(const char *data, int size);
void TakeOwnershipOf(char *data, int size, int capacity);
void PointTo(const char *data, int size);
private:
// Copying is disabled: declared private.
Repr &operator=(const Repr &);
Repr(const Repr &other);
};
// The underlying bytes (owned or aliased).
Repr repr_;
};
} // namespace chrome_lang_id
#endif // UNICODETEXT_H_

241
Telegram/ThirdParty/cld3/src/utils.cc vendored Normal file
View File

@@ -0,0 +1,241 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "utils.h"
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include "script_span/stringpiece.h"
namespace chrome_lang_id {
namespace utils {
// Parses |c_str| as an integer (base auto-detected by strtol: decimal, 0x
// hex, or leading-0 octal) and stores the result in |*value|.
// Returns true only if the whole string was consumed and the number fits in
// an int. |*value| is always written, so callers that ignore the return value
// observe the same stored result as before; for out-of-range input that
// result is the truncated/clamped strtol value and the function returns false.
bool ParseInt32(const char *c_str, int *value) {
  char *end = nullptr;
  errno = 0;  // strtol reports overflow only through errno.
  const long parsed = strtol(c_str, &end, 0);  // NOLINT
  *value = static_cast<int>(parsed);
  // Where long is wider than int, a value can be a valid long yet not fit
  // in an int; where they have the same width, only ERANGE reveals overflow.
  const bool in_range =
      (errno != ERANGE) && (parsed >= INT_MIN) && (parsed <= INT_MAX);
  return (*end == '\0') && in_range;
}
// Parses |c_str| with strtod and stores the result in |*value| (always
// written). Returns true if the whole string was consumed.
bool ParseDouble(const char *c_str, double *value) {
  char *stop = nullptr;
  const double parsed = strtod(c_str, &stop);
  *value = parsed;
  const bool consumed_everything = (*stop == '\0');
  return consumed_everything;
}
// Returns a copy of |src| in which characters that are awkward inside a C
// string literal are replaced by escape sequences:
//   * newline, carriage return, tab, double quote, single quote and backslash
//     become the familiar two-character escapes (\n, \r, \t, \", \', \\);
//   * every byte that is >= 0x80 or not printable becomes a backslash followed
//     by exactly three octal digits;
//   * everything else is copied unchanged.
// Because the numeric escape always has three octal digits, a digit that
// follows it in the output cannot be mistaken for part of the escape.
string CEscape(const string &src) {
  static const char kOctalDigits[] = "01234567";
  string escaped;
  for (const unsigned char byte : src) {
    if (byte == '\n') {
      escaped += "\\n";
    } else if (byte == '\r') {
      escaped += "\\r";
    } else if (byte == '\t') {
      escaped += "\\t";
    } else if (byte == '\"') {
      escaped += "\\\"";
    } else if (byte == '\'') {
      escaped += "\\'";
    } else if (byte == '\\') {
      escaped += "\\\\";
    } else if (byte >= 0x80 || !isprint(byte)) {
      // Three octal digits: bits 7-6, bits 5-3, bits 2-0.
      escaped += '\\';
      escaped += kOctalDigits[byte >> 6];
      escaped += kOctalDigits[(byte >> 3) & 7];
      escaped += kOctalDigits[byte & 7];
    } else {
      escaped += static_cast<char>(byte);
    }
  }
  return escaped;
}
// Splits |text| at every occurrence of |delim| and returns the pieces in
// order. Empty pieces are kept, so "a,,b" yields {"a", "", "b"} and a trailing
// delimiter yields a trailing empty piece. An empty |text| yields an empty
// vector (not a vector holding one empty string).
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;

  size_t piece_begin = 0;
  for (;;) {
    const size_t delim_pos = text.find(delim, piece_begin);
    if (delim_pos == string::npos) {
      // Last piece: everything from piece_begin to the end of the text.
      pieces.push_back(text.substr(piece_begin));
      break;
    }
    pieces.push_back(text.substr(piece_begin, delim_pos - piece_begin));
    piece_begin = delim_pos + 1;
  }
  return pieces;
}
// Removes the leading characters of |*text| for which isspace() is true and
// returns how many were removed. Only the StringPiece is adjusted; the bytes
// it refers to are not modified.
int RemoveLeadingWhitespace(StringPiece *text) {
  const char *bytes = text->data();
  int count = 0;
  // isspace() must be given a value representable as unsigned char (or EOF).
  // Passing a plain char is undefined for bytes >= 0x80 when char is signed,
  // which is exactly what UTF-8 text contains, hence the cast.
  while (count < text->size() &&
         isspace(static_cast<unsigned char>(bytes[count]))) {
    ++count;
  }
  text->remove_prefix(count);
  return count;
}
// Removes the trailing characters of |*text| for which isspace() is true and
// returns how many were removed. Only the StringPiece is adjusted; the bytes
// it refers to are not modified.
int RemoveTrailingWhitespace(StringPiece *text) {
  const char *bytes = text->data();
  const auto length = text->size();
  int count = 0;
  // Index from the back rather than pre-computing data() + size() - 1, so no
  // pointer in front of the buffer is ever formed for an empty piece.
  // isspace() must be given a value representable as unsigned char (or EOF);
  // a plain, signed char is undefined for bytes >= 0x80, hence the cast.
  while (count < length &&
         isspace(static_cast<unsigned char>(bytes[length - 1 - count]))) {
    ++count;
  }
  text->remove_suffix(count);
  return count;
}
// Strips whitespace from both ends of |*text| and returns the total number of
// characters removed. The two helpers are called in separate statements so the
// order (leading, then trailing) is explicit; the total is the same either way.
int RemoveWhitespaceContext(StringPiece *text) {
  const int leading = RemoveLeadingWhitespace(text);
  const int trailing = RemoveTrailingWhitespace(text);
  return leading + trailing;
}
namespace {

// Reads the four bytes at |ptr| as a little-endian 32-bit value (ptr[0] is
// the least significant byte), independent of the host byte order. No bounds
// checking is performed.
inline uint32 DecodeFixed32(const char *ptr) {
  uint32 word = 0;
  for (int i = 3; i >= 0; --i) {
    word = (word << 8) | static_cast<unsigned char>(ptr[i]);
  }
  return word;
}

// Widens |c| to 32 bits as a value in [0, 255], whether or not char is signed.
static inline uint32 ByteAs32(char c) {
  return static_cast<uint32>(static_cast<unsigned char>(c));
}

}  // namespace

// Returns a 32-bit hash of the |n| bytes starting at |data|, mixed with
// |seed|. The multiplier and shift are mixing constants generated offline
// (they match the constants used by MurmurHash2).
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  const uint32 kMul = 0x5bd1e995;
  const int kShift = 24;

  // Start from a value that depends on both the seed and the input length.
  uint32 hash = static_cast<uint32>(seed ^ n);

  // Consume the input four bytes at a time.
  for (; n >= 4; data += 4, n -= 4) {
    uint32 block = DecodeFixed32(data) * kMul;
    block ^= block >> kShift;
    block *= kMul;
    hash = (hash * kMul) ^ block;
  }

  // Fold in the remaining 1-3 bytes, byte i shifted into bit position 8 * i.
  if (n > 0) {
    for (size_t i = 0; i < n; ++i) {
      hash ^= ByteAs32(data[i]) << (8 * i);
    }
    hash *= kMul;
  }

  // Final avalanche so the last bytes influence every output bit.
  hash ^= hash >> 13;
  hash *= kMul;
  hash ^= hash >> 15;
  return hash;
}
// Convenience wrapper: hashes all bytes of |input| with Hash32() using the
// fixed seed 0xBEEF.
uint32 Hash32WithDefaultSeed(const string &input) {
  const uint32 kDefaultSeed = 0xBEEF;
  const char *bytes = input.data();
  const size_t length = input.size();
  return Hash32(bytes, length, kDefaultSeed);
}
// Inclusive {first, last} ranges of values that count as punctuation.
// PunctuationUtil::IsPunctuation() scans this table from the front, gives up
// as soon as the queried value is below a range's first element, and stops at
// the first entry whose first element is not positive. The ranges are
// therefore listed in ascending order and the table must end with the
// {-1, -1} sentinel.
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
{33, 35}, {37, 42}, {44, 47}, {58, 59},
{63, 64}, {91, 93}, {95, 95}, {123, 123},
{125, 125}, {161, 161}, {171, 171}, {183, 183},
{187, 187}, {191, 191}, {894, 894}, {903, 903},
{1370, 1375}, {1417, 1418}, {1470, 1470}, {1472, 1472},
{1475, 1475}, {1478, 1478}, {1523, 1524}, {1548, 1549},
{1563, 1563}, {1566, 1567}, {1642, 1645}, {1748, 1748},
{1792, 1805}, {2404, 2405}, {2416, 2416}, {3572, 3572},
{3663, 3663}, {3674, 3675}, {3844, 3858}, {3898, 3901},
{3973, 3973}, {4048, 4049}, {4170, 4175}, {4347, 4347},
{4961, 4968}, {5741, 5742}, {5787, 5788}, {5867, 5869},
{5941, 5942}, {6100, 6102}, {6104, 6106}, {6144, 6154},
{6468, 6469}, {6622, 6623}, {6686, 6687}, {8208, 8231},
{8240, 8259}, {8261, 8273}, {8275, 8286}, {8317, 8318},
{8333, 8334}, {9001, 9002}, {9140, 9142}, {10088, 10101},
{10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
{10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
{11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
{12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
{64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
{65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
{65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
{65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
{65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
{-1, -1}};
// Replaces every ASCII digit ('0' through '9') in |*form| with '9', in place.
// All other bytes are left untouched.
void NormalizeDigits(string *form) {
  for (char &ch : *form) {
    const bool is_ascii_digit = (ch >= '0') && (ch <= '9');
    if (is_ascii_digit) ch = '9';
  }
}
// Splits |text| into its UTF-8 characters and appends each one, as its own
// string, to |*chars| (existing elements of |*chars| are kept).
//
// The length of each character is taken from its first byte. Two kinds of
// irregular input are handled so the loop always terminates and never reads
// outside |text|:
//   * an embedded NUL byte, for which UTF8FirstLetterNumBytes() reports a
//     length of 0, is emitted as a one-byte character;
//   * a final character whose lead byte announces more bytes than remain is
//     truncated to the bytes that are actually present.
void GetUTF8Chars(const string &text, std::vector<string> *chars) {
  const char *cursor = text.data();
  const char *const end = cursor + text.size();
  while (cursor < end) {
    int char_length = UTF8FirstLetterNumBytes(cursor);
    // Always make progress, even on a NUL byte.
    if (char_length < 1) char_length = 1;
    // Never copy past the end of the string.
    const int remaining = static_cast<int>(end - cursor);
    if (char_length > remaining) char_length = remaining;
    chars->emplace_back(cursor, char_length);
    cursor += char_length;
  }
}
// Returns the byte length of the UTF-8 character that starts at |utf8_str|,
// as determined by OneCharLen(), or 0 when the first byte is the NUL
// terminator.
int UTF8FirstLetterNumBytes(const char *utf8_str) {
  const bool at_terminator = (*utf8_str == '\0');
  return at_terminator ? 0 : OneCharLen(utf8_str);
}
// Returns the number of bytes in the UTF-8 character whose first byte is
// |*src|, looking only at that byte. The length is selected by the byte's top
// four bits: 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
int OneCharLen(const char *src) {
  static const char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                               1, 1, 1, 1, 2, 2, 3, 4};
  // char may be signed, so read the byte as unsigned before shifting.
  const unsigned char lead = static_cast<unsigned char>(*src);
  return kLengthByHighNibble[lead >> 4];
}
} // namespace utils
} // namespace chrome_lang_id

144
Telegram/ThirdParty/cld3/src/utils.h vendored Normal file
View File

@@ -0,0 +1,144 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef UTILS_H_
#define UTILS_H_
#include <stddef.h>
#include <functional>
#include <initializer_list>
#include <string>
#include <vector>
#include "base.h"
#include "script_span/stringpiece.h"
namespace chrome_lang_id {
namespace utils {
// Parses |c_str| as an integer (strtol() with base 0, so "0x" and leading-"0"
// prefixes are recognised) and stores the result in |*value|. A true return
// implies that no characters were left over after the number.
bool ParseInt32(const char *c_str, int *value);
// Parses |c_str| as a floating-point number (strtod()) and stores the result
// in |*value|. A true return implies that no characters were left over after
// the number.
bool ParseDouble(const char *c_str, double *value);
// Parses |str| by handing its C string to |func| and returns whatever |func|
// stored through its output pointer. The boolean result of |func| is ignored.
// The output is value-initialised first, so a parser that bails out without
// writing yields T() rather than an indeterminate value.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T value = T();
  func(str.c_str(), &value);
  return value;
}

// Same as above, except that an empty |str| short-circuits to |defval|
// without calling |func| at all.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) return defval;
  return ParseUsing<T>(str, func);
}
// Returns a copy of |src| with newline, carriage return, tab, both quote
// characters and backslash written as backslash escapes, and with every byte
// that is >= 0x80 or not printable written as a backslash followed by three
// octal digits.
string CEscape(const string &src);
// Splits |text| at every occurrence of |delim|. Empty pieces are kept; an
// empty |text| yields an empty vector.
std::vector<string> Split(const string &text, char delim);
// Each of the next three functions strips characters for which isspace() is
// true from |*text| (leading, trailing, or both ends respectively) and
// returns the number of characters removed.
int RemoveLeadingWhitespace(StringPiece *text);
int RemoveTrailingWhitespace(StringPiece *text);
int RemoveWhitespaceContext(StringPiece *text);
// Returns a 32-bit hash of the |n| bytes starting at |data|, mixed with
// |seed|.
uint32 Hash32(const char *data, size_t n, uint32 seed);
// Hash32() over all bytes of |input| with the fixed seed 0xBEEF.
uint32 Hash32WithDefaultSeed(const string &input);
// Calls delete on every element of |*container| and then empties it. Works
// with any container of pointers that provides begin(), end() and clear()
// (vector, set, hash_set, ...). A NULL |container| is accepted and ignored.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  for (auto cursor = container->begin(); cursor != container->end();) {
    auto doomed = cursor;
    ++cursor;  // Step off the element before its pointee is destroyed.
    delete *doomed;
  }
  container->clear();
}
// Static helpers for recognising punctuation characters and punctuation-only
// tags.
class PunctuationUtil {
 public:
  // One inclusive range [first, last] of Unicode characters. The ranges in
  // kPunctuation are the punctuation characters according to CoNLL.
  struct CharacterRange {
    int first;
    int last;
  };

  // Table of punctuation ranges. IsPunctuation() relies on the ranges being
  // in ascending order and on a final entry whose |first| is not positive.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode character |u| lies inside one of the
  // kPunctuation ranges.
  static bool IsPunctuation(int u) {
    for (const CharacterRange *range = kPunctuation; range->first > 0;
         ++range) {
      // Ranges ascend, so once |u| is below a range it cannot match later.
      if (u < range->first) return false;
      if (u <= range->last) return true;
    }
    return false;
  }

  // Returns true if every character of |tag| is one of , : . ' `
  // An empty tag also yields true.
  static bool IsPunctuationTag(const string &tag) {
    for (const char symbol : tag) {
      switch (symbol) {
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          continue;
        default:
          return false;
      }
    }
    return true;
  }

  // Returns true if |tag| is non-empty and every character of it is one of
  // ( ) , : . ' `
  static bool IsPunctuationTagOrParens(const string &tag) {
    if (tag.empty()) return false;
    for (const char symbol : tag) {
      switch (symbol) {
        case '(':
        case ')':
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          continue;
        default:
          return false;
      }
    }
    return true;
  }
};
// Replaces every ASCII digit ('0' through '9') in |*form| with '9', in place.
void NormalizeDigits(string *form);
// Splits |text| into its UTF-8 characters and appends each one, as its own
// string, to |*chars|.
void GetUTF8Chars(const string &text, std::vector<string> *chars);
// Returns the number of bytes in the first UTF-8 char at the beginning
// of the string. It is assumed that the string is valid UTF-8. If
// the first byte of the string is null, return 0 (for backwards
// compatibility only; this use is discouraged).
int UTF8FirstLetterNumBytes(const char *in_buf);
// Returns the length (number of bytes) of the Unicode code point starting at
// src, based on inspecting just that one byte. Preconditions: src != NULL,
// *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
// string.
int OneCharLen(const char *src);
} // namespace utils
} // namespace chrome_lang_id
#endif // UTILS_H_

View File

@@ -0,0 +1,64 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "workspace.h"
#include "base.h"
namespace chrome_lang_id {
// Constructs a set that holds no workspaces.
WorkspaceSet::WorkspaceSet() = default;

// Releases every workspace the set holds by resetting it against an empty
// registry (Reset() deletes the current workspaces before re-allocating).
WorkspaceSet::~WorkspaceSet() {
  WorkspaceRegistry empty_registry;
  Reset(empty_registry);
}
// Both maps start out empty and clean up after themselves, so the special
// members need no hand-written logic. They stay out-of-line so that they are
// defined in this translation unit.
WorkspaceRegistry::WorkspaceRegistry() = default;
WorkspaceRegistry::~WorkspaceRegistry() = default;
// Builds a listing of all registered workspaces: one "\n <type> :: <name>"
// entry per workspace name, grouped by workspace type. The type name is
// looked up with at(), so a type that has names but no registered type name
// raises std::out_of_range.
string WorkspaceRegistry::DebugString() const {
  string description;
  for (const auto &entry : workspace_names_) {
    const string &type_name = workspace_types_.at(entry.first);
    for (const string &workspace_name : entry.second) {
      description.append("\n ");
      description.append(type_name);
      description.append(" :: ");
      description.append(workspace_name);
    }
  }
  return description;
}
// |size| elements, each value-initialised (i.e. 0).
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}

// |size| elements, each set to |value|.
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
    : elements_(size, value) {}

// A copy of |elements|.
VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
    : elements_(elements) {}

// Nothing beyond destroying the vector; defined here, out of line.
VectorIntWorkspace::~VectorIntWorkspace() = default;

// Human-readable name of this workspace type.
string VectorIntWorkspace::TypeName() { return string("Vector"); }
// |size| empty inner vectors.
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
    : elements_(size) {}

// Nothing beyond destroying the vectors; defined here, out of line.
VectorVectorIntWorkspace::~VectorVectorIntWorkspace() = default;

// Human-readable name of this workspace type.
string VectorVectorIntWorkspace::TypeName() { return string("VectorVector"); }
} // namespace chrome_lang_id

177
Telegram/ThirdParty/cld3/src/workspace.h vendored Normal file
View File

@@ -0,0 +1,177 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Notes on thread-safety: All of the classes here are thread-compatible. More
// specifically, the registry machinery is thread-safe, as long as each thread
// performs feature extraction on a different Sentence object.
#ifndef WORKSPACE_H_
#define WORKSPACE_H_
#include <stddef.h>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "base.h"
namespace chrome_lang_id {
// Common base class of all shared workspaces. Each derived class provides a
// static TypeName() member function returning a human-readable name for the
// class.
class Workspace {
 public:
  // Virtual so that a workspace can be deleted through a Workspace pointer.
  virtual ~Workspace() = default;

 protected:
  // Only derived classes may construct a workspace.
  Workspace() = default;

 private:
  CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);
};
// Registry describing which workspaces exist, grouped by workspace type.
class WorkspaceRegistry {
 public:
  // Creates an empty registry.
  WorkspaceRegistry();
  ~WorkspaceRegistry();

  // Read-only access to the registered workspace names, keyed by type.
  const std::unordered_map<std::type_index, std::vector<string>> &
  WorkspaceNames() const {
    return workspace_names_;
  }

  // Returns a string describing the registered workspaces.
  string DebugString() const;

 private:
  // Workspace type names, indexed as workspace_types_[typeid].
  std::unordered_map<std::type_index, string> workspace_types_;

  // Workspace names, indexed as workspace_names_[typeid][workspace].
  std::unordered_map<std::type_index, std::vector<string>> workspace_names_;

  CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
};
// A typed collected of workspaces. The workspaces are indexed according to an
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
// also immutable.
class WorkspaceSet {
public:
WorkspaceSet();
~WorkspaceSet();
void Reset(const WorkspaceRegistry &registry) {
// Deallocate current workspaces.
for (auto &it : workspaces_) {
for (size_t index = 0; index < it.second.size(); ++index) {
delete it.second[index];
}
}
workspaces_.clear();
// Allocate space for new workspaces.
for (auto &it : registry.WorkspaceNames()) {
workspaces_[it.first].resize(it.second.size());
}
}
private:
// The set of workspaces, indexed as workspaces_[typeid][index].
std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
};
// A workspace that holds exactly one int.
class SingletonIntWorkspace : public Workspace {
 public:
  // Leaves the value at its in-class default of 0.
  SingletonIntWorkspace() = default;

  // Starts out holding |value|.
  explicit SingletonIntWorkspace(int value) : value_(value) {}

  // Returns the name of this type of workspace.
  static string TypeName() { return string("SingletonInt"); }

  // Accessors for the stored int.
  int get() const { return value_; }
  void set(int value) { value_ = value; }

 private:
  // The enclosed int.
  int value_ = 0;
};
// A workspace that wraps around a vector of int.
class VectorIntWorkspace : public Workspace {
public:
// Creates a vector of the given size.
explicit VectorIntWorkspace(int size);
// Creates a vector initialized with a copy of the given elements.
explicit VectorIntWorkspace(const std::vector<int> &elements);
// Creates a vector of the given size, with each element initialized to the
// given value.
VectorIntWorkspace(int size, int value);
~VectorIntWorkspace() override;
// Returns the name of this type of workspace.
static string TypeName();
// Returns the i'th element. No bounds check is performed: |i| is passed
// straight to the vector's operator[].
int element(int i) const { return elements_[i]; }
// Sets the i'th element. No bounds check is performed: |i| is passed
// straight to the vector's operator[].
void set_element(int i, int value) { elements_[i] = value; }
private:
// The enclosed vector.
std::vector<int> elements_;
};
// A workspace that wraps around a vector of vector of int.
class VectorVectorIntWorkspace : public Workspace {
public:
// Creates a vector of empty vectors of the given size.
explicit VectorVectorIntWorkspace(int size);
~VectorVectorIntWorkspace() override;
// Returns the name of this type of workspace.
static string TypeName();
// Returns the i'th vector of elements. No bounds check is performed: |i| is
// passed straight to the outer vector's operator[].
const std::vector<int> &elements(int i) const { return elements_[i]; }
// Mutable access to the i'th vector of elements. The returned pointer refers
// to storage inside this workspace; no bounds check is performed on |i|.
std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
private:
// The enclosed vector of vector of elements.
std::vector<std::vector<int>> elements_;
};
} // namespace chrome_lang_id
#endif // WORKSPACE_H_