init
Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s
Close stale issues and PRs / stale (push) Has been cancelled
Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s
Close stale issues and PRs / stale (push) Has been cancelled
This commit is contained in:
149
Telegram/ThirdParty/cld3/.github/workflows/main.yml
vendored
Normal file
149
Telegram/ThirdParty/cld3/.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
name: gcld3
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
|
||||
test:
|
||||
name: ${{ matrix.os }}-${{matrix.python-version}}-test
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
python-version: [3.6, 3.7, 3.8, pypy3]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Linux Dependencies
|
||||
if: runner.os == 'Linux'
|
||||
run: sudo apt-get install libprotobuf-dev protobuf-compiler python3-dev
|
||||
|
||||
- name: MacOS Dependencies
|
||||
if: runner.os == 'macOS'
|
||||
run: brew install protobuf
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
|
||||
- name: Build package
|
||||
run: |
|
||||
pip install setuptools
|
||||
python setup.py install
|
||||
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pip install pytest pytest-cov
|
||||
pytest gcld3/tests/gcld3_test.py
|
||||
|
||||
|
||||
sdist:
|
||||
name: Build source distribution
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
name: Install Python
|
||||
with:
|
||||
python-version: "3.8"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools wheel
|
||||
- name: Build sdist
|
||||
run: python setup.py sdist
|
||||
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
path: dist/*.tar.gz
|
||||
|
||||
wheel:
|
||||
name: ${{ matrix.os }},${{ matrix.arch }}-wheel
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
arch: [auto]
|
||||
include:
|
||||
- os: ubuntu-latest
|
||||
arch: aarch64
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ matrix.arch == 'aarch64' }}
|
||||
uses: docker/setup-qemu-action@v1
|
||||
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.8
|
||||
|
||||
- name: Install cibuildwheel
|
||||
run: |
|
||||
# Quote the version specifier: an unquoted ">" is a shell redirection, which
# would install an unconstrained cibuildwheel and write output to "=1.5.5".
python -m pip install "cibuildwheel>=1.5.5" auditwheel delocate
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
CIBW_BUILD: "cp36-* cp38-* pp36-*"
|
||||
CIBW_SKIP: "*-win32 *-manylinux_i686 pp27-* cp27-* cp35-* *-musllinux_aarch64"
|
||||
CIBW_ARCHS: ${{matrix.arch}}
|
||||
CIBW_BEFORE_BUILD_LINUX: yum -y install protobuf-devel protobuf-compiler python3-devel
|
||||
CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --lib-sdir . -w {dest_dir} {wheel}"
|
||||
CIBW_BEFORE_BUILD_MACOS: brew install protobuf
|
||||
CIBW_REPAIR_WHEEL_COMMAND_MACOS: "delocate-listdeps {wheel} && delocate-wheel -w {dest_dir} -v {wheel}"
|
||||
run: |
|
||||
python -m cibuildwheel --output-dir wheelhouse
|
||||
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
path: ./wheelhouse/*.whl
|
||||
|
||||
|
||||
pypi:
|
||||
needs: [wheel, sdist]
|
||||
runs-on: ubuntu-latest
|
||||
# upload to PyPI on every tag starting with 'v'
|
||||
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
|
||||
# alternatively, to publish when a GitHub Release is created, use the following rule:
|
||||
# if: github.event_name == 'release' && github.event.action == 'published'
|
||||
steps:
|
||||
- uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: artifact
|
||||
path: dist
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.8'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install twine
|
||||
|
||||
- name: Upload to test pypi
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
|
||||
run: |
|
||||
twine upload --repository-url https://test.pypi.org/legacy/ dist/*
|
||||
|
||||
- name: Upload to pypi
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: |
|
||||
twine upload dist/*
|
||||
1
Telegram/ThirdParty/cld3/.gitignore
vendored
Normal file
1
Telegram/ThirdParty/cld3/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
build
|
||||
69
Telegram/ThirdParty/cld3/CMakeLists.txt
vendored
Normal file
69
Telegram/ThirdParty/cld3/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
# This CMake script only builds a static cld3 lib and the unit tests.
|
||||
|
||||
project(cld3)
|
||||
|
||||
# Old versions of cmake don't search for/find protobuf lite
|
||||
cmake_minimum_required(VERSION 3.9)
|
||||
|
||||
find_package(Protobuf REQUIRED)
|
||||
message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}")
|
||||
message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}")
|
||||
message(WARNING "Protobuf 2.5 and CLD3 seems happy together. This script does NOT check if your verison of protobuf is compatible.")
|
||||
message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}")
|
||||
message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so
|
||||
|
||||
# By default, protobuf_generate_cpp generates pb.* files directly in the cmake build dir.
|
||||
# But CLD3 sources have been written using hard-coded paths to cld_3/protos/*.pb.h.
|
||||
# So *.pb.h must be output to cld_3/protos.
|
||||
# For that, let's use a custom my_protobuf_generate_cpp:
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake)
|
||||
my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto)
|
||||
message(STATUS "PROTO_HDRS= ${PROTO_HDRS}")
|
||||
|
||||
add_definitions(-fPIC) # Position Independent Code
|
||||
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
|
||||
add_definitions(-std=c++11) # Needed for std::to_string(), ...
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR} ${Protobuf_INCLUDE_DIRS}) # needed to include generated pb headers
|
||||
|
||||
add_library(${PROJECT_NAME}
|
||||
${PROTO_SRCS} ${PROTO_HDRS}
|
||||
src/base.cc
|
||||
src/embedding_feature_extractor.cc
|
||||
src/embedding_network.cc
|
||||
src/feature_extractor.cc
|
||||
src/feature_extractor.h
|
||||
src/feature_types.cc
|
||||
src/fml_parser.cc
|
||||
src/language_identifier_features.cc
|
||||
src/lang_id_nn_params.cc
|
||||
src/nnet_language_identifier.cc
|
||||
src/registry.cc
|
||||
src/relevant_script_feature.cc
|
||||
src/sentence_features.cc
|
||||
src/task_context.cc
|
||||
src/task_context_params.cc
|
||||
src/unicodetext.cc
|
||||
src/utils.cc
|
||||
src/workspace.cc
|
||||
|
||||
src/script_span/generated_entities.cc
|
||||
src/script_span/getonescriptspan.cc
|
||||
src/script_span/getonescriptspan.h
|
||||
src/script_span/getonescriptspan_test.cc
|
||||
src/script_span/utf8statetable.cc
|
||||
src/script_span/offsetmap.cc
|
||||
src/script_span/text_processing.cc
|
||||
src/script_span/text_processing.h
|
||||
src/script_span/fixunicodevalue.cc
|
||||
)
|
||||
|
||||
# unit tests exec:
|
||||
add_executable(language_identifier_main src/language_identifier_main.cc)
|
||||
target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES})
|
||||
|
||||
add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc)
|
||||
target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES})
|
||||
|
||||
add_executable(language_identifier_features_test src/language_identifier_features_test.cc)
|
||||
target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES})
|
||||
26
Telegram/ThirdParty/cld3/CONTRIBUTING.md
vendored
Normal file
26
Telegram/ThirdParty/cld3/CONTRIBUTING.md
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
Want to contribute? Great! First, read this page (including the small print at
|
||||
the end).
|
||||
|
||||
### Before you contribute
|
||||
Before we can use your code, you must sign the
|
||||
[Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual)
|
||||
(CLA), which you can do online. The CLA is necessary mainly because you own the
|
||||
copyright to your changes, even after your contribution becomes part of our
|
||||
codebase, so we need your permission to use and distribute your code. We also
|
||||
need to be sure of various other things—for instance that you'll tell us if you
|
||||
know that your code infringes on other people's patents. You don't have to sign
|
||||
the CLA until after you've submitted your code for review and a member has
|
||||
approved it, but you must do it before we can put your code into our codebase.
|
||||
Before you start working on a larger contribution, you should get in touch with
|
||||
us first through the issue tracker with your idea so that we can help out and
|
||||
possibly guide you. Coordinating up front makes it much easier to avoid
|
||||
frustration later on.
|
||||
|
||||
### Code reviews
|
||||
All submissions, including submissions by project members, require review. We
|
||||
use GitHub pull requests for this purpose.
|
||||
|
||||
### The small print
|
||||
Contributions made by corporations are covered by a different agreement than
|
||||
the one above, the
|
||||
[Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate).
|
||||
203
Telegram/ThirdParty/cld3/LICENSE
vendored
Normal file
203
Telegram/ThirdParty/cld3/LICENSE
vendored
Normal file
@@ -0,0 +1,203 @@
|
||||
Copyright 2016 Google Inc. All rights reserved.
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2016, Google Inc.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
13
Telegram/ThirdParty/cld3/MANIFEST.in
vendored
Normal file
13
Telegram/ThirdParty/cld3/MANIFEST.in
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
include LICENSE
|
||||
include README.md
|
||||
include requirements.txt
|
||||
global-include *h
|
||||
global-include *cc
|
||||
global-include *proto
|
||||
prune .github/
|
||||
prune .eggs/
|
||||
global-exclude *.pyc
|
||||
global-exclude *.cache
|
||||
global-exclude *.so
|
||||
exclude src/cld_3/protos/*h
|
||||
exclude src/cld_3/protos/*cc
|
||||
191
Telegram/ThirdParty/cld3/README.md
vendored
Normal file
191
Telegram/ThirdParty/cld3/README.md
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
# Compact Language Detector v3 (CLD3)
|
||||
|
||||
* [Model](#model)
|
||||
* [Supported Languages](#supported-languages)
|
||||
* [Installation](#installation)
|
||||
* [Bugs and Feature Requests](#bugs-and-feature-requests)
|
||||
* [Credits](#credits)
|
||||
|
||||
### Model
|
||||
|
||||
CLD3 is a neural network model for language identification. This package
|
||||
contains the inference code and a trained model. The inference code
|
||||
extracts character ngrams from the input text and computes the fraction
|
||||
of times each of them appears. For example, as shown in the figure below,
|
||||
if the input text is "banana", then one of the extracted trigrams is "ana"
|
||||
and the corresponding fraction is 2/4. The ngrams are hashed down to an id
|
||||
within a small range, and each id is represented by a dense embedding vector
|
||||
estimated during training.
|
||||
|
||||
The model averages the embeddings corresponding to each ngram type according
|
||||
to the fractions, and the averaged embeddings are concatenated to produce
|
||||
the embedding layer. The remaining components of the network are a hidden
|
||||
(Rectified linear) layer and a softmax layer.
|
||||
|
||||
To get a language prediction for the input text, we simply perform a forward
|
||||
pass through the network.
|
||||
|
||||

|
||||
|
||||
### Supported Languages
|
||||
|
||||
The model outputs BCP-47-style language codes, shown in the table below. For
|
||||
some languages, output is differentiated by script. Language and script names
|
||||
from
|
||||
[Unicode CLDR](https://github.com/unicode-cldr/cldr-localenames-modern/blob/master/main/en).
|
||||
|
||||
Output Code | Language Name | Script Name
|
||||
----------- | --------------- | ------------------------------------------
|
||||
af | Afrikaans | Latin
|
||||
am | Amharic | Ethiopic
|
||||
ar | Arabic | Arabic
|
||||
bg | Bulgarian | Cyrillic
|
||||
bg-Latn | Bulgarian | Latin
|
||||
bn | Bangla | Bangla
|
||||
bs | Bosnian | Latin
|
||||
ca | Catalan | Latin
|
||||
ceb | Cebuano | Latin
|
||||
co | Corsican | Latin
|
||||
cs | Czech | Latin
|
||||
cy | Welsh | Latin
|
||||
da | Danish | Latin
|
||||
de | German | Latin
|
||||
el | Greek | Greek
|
||||
el-Latn | Greek | Latin
|
||||
en | English | Latin
|
||||
eo | Esperanto | Latin
|
||||
es | Spanish | Latin
|
||||
et | Estonian | Latin
|
||||
eu | Basque | Latin
|
||||
fa | Persian | Arabic
|
||||
fi | Finnish | Latin
|
||||
fil | Filipino | Latin
|
||||
fr | French | Latin
|
||||
fy | Western Frisian | Latin
|
||||
ga | Irish | Latin
|
||||
gd | Scottish Gaelic | Latin
|
||||
gl | Galician | Latin
|
||||
gu | Gujarati | Gujarati
|
||||
ha | Hausa | Latin
|
||||
haw | Hawaiian | Latin
|
||||
hi | Hindi | Devanagari
|
||||
hi-Latn | Hindi | Latin
|
||||
hmn | Hmong | Latin
|
||||
hr | Croatian | Latin
|
||||
ht | Haitian Creole | Latin
|
||||
hu | Hungarian | Latin
|
||||
hy | Armenian | Armenian
|
||||
id | Indonesian | Latin
|
||||
ig | Igbo | Latin
|
||||
is | Icelandic | Latin
|
||||
it | Italian | Latin
|
||||
iw | Hebrew | Hebrew
|
||||
ja | Japanese | Japanese
|
||||
ja-Latn | Japanese | Latin
|
||||
jv | Javanese | Latin
|
||||
ka | Georgian | Georgian
|
||||
kk | Kazakh | Cyrillic
|
||||
km | Khmer | Khmer
|
||||
kn | Kannada | Kannada
|
||||
ko | Korean | Korean
|
||||
ku | Kurdish | Latin
|
||||
ky | Kyrgyz | Cyrillic
|
||||
la | Latin | Latin
|
||||
lb | Luxembourgish | Latin
|
||||
lo | Lao | Lao
|
||||
lt | Lithuanian | Latin
|
||||
lv | Latvian | Latin
|
||||
mg | Malagasy | Latin
|
||||
mi | Maori | Latin
|
||||
mk | Macedonian | Cyrillic
|
||||
ml | Malayalam | Malayalam
|
||||
mn | Mongolian | Cyrillic
|
||||
mr | Marathi | Devanagari
|
||||
ms | Malay | Latin
|
||||
mt | Maltese | Latin
|
||||
my | Burmese | Myanmar
|
||||
ne | Nepali | Devanagari
|
||||
nl | Dutch | Latin
|
||||
no | Norwegian | Latin
|
||||
ny | Nyanja | Latin
|
||||
pa | Punjabi | Gurmukhi
|
||||
pl | Polish | Latin
|
||||
ps | Pashto | Arabic
|
||||
pt | Portuguese | Latin
|
||||
ro | Romanian | Latin
|
||||
ru | Russian | Cyrillic
|
||||
ru-Latn | Russian | Latin
|
||||
sd | Sindhi | Arabic
|
||||
si | Sinhala | Sinhala
|
||||
sk | Slovak | Latin
|
||||
sl | Slovenian | Latin
|
||||
sm | Samoan | Latin
|
||||
sn | Shona | Latin
|
||||
so | Somali | Latin
|
||||
sq | Albanian | Latin
|
||||
sr | Serbian | Cyrillic
|
||||
st | Southern Sotho | Latin
|
||||
su | Sundanese | Latin
|
||||
sv | Swedish | Latin
|
||||
sw | Swahili | Latin
|
||||
ta | Tamil | Tamil
|
||||
te | Telugu | Telugu
|
||||
tg | Tajik | Cyrillic
|
||||
th | Thai | Thai
|
||||
tr | Turkish | Latin
|
||||
uk | Ukrainian | Cyrillic
|
||||
ur | Urdu | Arabic
|
||||
uz | Uzbek | Latin
|
||||
vi | Vietnamese | Latin
|
||||
xh | Xhosa | Latin
|
||||
yi | Yiddish | Hebrew
|
||||
yo | Yoruba | Latin
|
||||
zh | Chinese | Han (including Simplified and Traditional)
|
||||
zh-Latn | Chinese | Latin
|
||||
zu | Zulu | Latin
|
||||
|
||||
### Installation
|
||||
CLD3 is designed to run in the Chrome browser, so it relies on code in
|
||||
[Chromium](http://www.chromium.org/).
|
||||
The steps for building and running the demo of the language detection model are:
|
||||
|
||||
- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the
|
||||
Chromium repository.
|
||||
- copy the code to `//third_party/cld_3`
|
||||
- Uncomment `language_identifier_main` executable in `src/BUILD.gn`.
|
||||
- build and run the model using the commands:
|
||||
|
||||
```shell
|
||||
gn gen out/Default
|
||||
ninja -C out/Default third_party/cld_3/src/src:language_identifier_main
|
||||
out/Default/language_identifier_main
|
||||
```
|
||||
### Bugs and Feature Requests
|
||||
|
||||
Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests.
|
||||
|
||||
### Announcements and Discussion
|
||||
|
||||
For announcements regarding major updates, as well as general discussion, please subscribe to:
|
||||
[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users)
|
||||
|
||||
### Credits
|
||||
|
||||
Original authors of the code in this package include (in alphabetical order):
|
||||
|
||||
* Alex Salcianu
|
||||
* Andy Golding
|
||||
* Anton Bakalov
|
||||
* Chris Alberti
|
||||
* Daniel Andor
|
||||
* David Weiss
|
||||
* Emily Pitler
|
||||
* Greg Coppola
|
||||
* Jason Riesa
|
||||
* Kuzman Ganchev
|
||||
* Michael Ringgaard
|
||||
* Nan Hua
|
||||
* Ryan McDonald
|
||||
* Slav Petrov
|
||||
* Stefan Istrate
|
||||
* Terry Koo
|
||||
1
Telegram/ThirdParty/cld3/gcld3/__init__.py
vendored
Normal file
1
Telegram/ThirdParty/cld3/gcld3/__init__.py
vendored
Normal file
@@ -0,0 +1 @@
|
||||
from .pybind_ext import *
|
||||
43
Telegram/ThirdParty/cld3/gcld3/pybind_ext.cc
vendored
Normal file
43
Telegram/ThirdParty/cld3/gcld3/pybind_ext.cc
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/pytypes.h>
|
||||
#include <pybind11/stl.h>
|
||||
|
||||
#include "../src/nnet_language_identifier.h"
|
||||
|
||||
namespace pybind11 {
|
||||
|
||||
using chrome_lang_id::NNetLanguageIdentifier;
|
||||
|
||||
// This is conventional.
|
||||
namespace py = pybind11;
|
||||
|
||||
// Defines the Python extension module `pybind_ext`, which binds
// NNetLanguageIdentifier and its nested Result type for use from Python.
PYBIND11_MODULE(pybind_ext, py_module) {
|
||||
// Python class "NNetLanguageIdentifier": constructible from two ints passed
// as keyword arguments `min_num_bytes` and `max_num_bytes`; exposes
// FindLanguage(text) and FindTopNMostFreqLangs(text, num_langs).
py::class_<NNetLanguageIdentifier>(py_module, "NNetLanguageIdentifier")
|
||||
.def(py::init<const int, const int>(), py::arg("min_num_bytes"),
|
||||
py::arg("max_num_bytes"))
|
||||
.def("FindLanguage", &NNetLanguageIdentifier::FindLanguage,
|
||||
py::arg("text"))
|
||||
.def("FindTopNMostFreqLangs",
|
||||
&NNetLanguageIdentifier::FindTopNMostFreqLangs, py::arg("text"),
|
||||
py::arg("num_langs"))
|
||||
// Static members of the C++ class, exposed as read-only class attributes.
.def_readonly_static("kUnknown", &NNetLanguageIdentifier::kUnknown)
|
||||
.def_readonly_static("kMinNumBytesToConsider",
|
||||
&NNetLanguageIdentifier::kMinNumBytesToConsider)
|
||||
.def_readonly_static("kMaxNumBytesToConsider",
|
||||
&NNetLanguageIdentifier::kMaxNumBytesToConsider)
|
||||
.def_readonly_static("kMaxNumInputBytesToConsider",
|
||||
&NNetLanguageIdentifier::kMaxNumInputBytesToConsider)
|
||||
.def_readonly_static("kReliabilityThreshold",
|
||||
&NNetLanguageIdentifier::kReliabilityThreshold)
|
||||
.def_readonly_static("kReliabilityHrBsThreshold",
|
||||
&NNetLanguageIdentifier::kReliabilityHrBsThreshold);
|
||||
|
||||
// Python class "Result": exposes the fields language, probability,
// is_reliable and proportion as read/write attributes.
// NOTE(review): Result is registered after the methods above are bound;
// pybind11 recommends registering a type before functions that use it so
// generated signatures show the Python name — confirm whether to reorder.
py::class_<NNetLanguageIdentifier::Result>(py_module, "Result")
|
||||
.def_readwrite("language", &NNetLanguageIdentifier::Result::language)
|
||||
.def_readwrite("probability",
|
||||
&NNetLanguageIdentifier::Result::probability)
|
||||
.def_readwrite("is_reliable",
|
||||
&NNetLanguageIdentifier::Result::is_reliable)
|
||||
.def_readwrite("proportion", &NNetLanguageIdentifier::Result::proportion);
|
||||
}
|
||||
} // namespace pybind11
|
||||
43
Telegram/ThirdParty/cld3/gcld3/tests/gcld3_test.py
vendored
Normal file
43
Telegram/ThirdParty/cld3/gcld3/tests/gcld3_test.py
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Tests for gcld3."""
|
||||
|
||||
import gcld3
|
||||
import unittest
|
||||
|
||||
|
||||
class NnetLanguageIdentifierTest(unittest.TestCase):
  """Checks the gcld3 bindings on English, empty and mixed-language input."""

  def testLangIdentification(self):
    """A plain English sentence is reported as reliable "en"."""
    identifier = gcld3.NNetLanguageIdentifier(
        min_num_bytes=0, max_num_bytes=1000)
    english_text = "This text is written in English."
    prediction = identifier.FindLanguage(text=english_text)
    self.assertEqual(prediction.language, "en")
    self.assertTrue(prediction.is_reliable)
    self.assertGreater(prediction.proportion, 0.99)
    self.assertGreater(prediction.probability, 0.90)

  def testEmptyString(self):
    """Empty input yields the unknown language "und" with zeroed scores."""
    identifier = gcld3.NNetLanguageIdentifier(
        min_num_bytes=10, max_num_bytes=1000)
    empty_text = ""
    prediction = identifier.FindLanguage(text=empty_text)
    self.assertEqual(prediction.language, "und")
    self.assertFalse(prediction.is_reliable)
    self.assertEqual(prediction.proportion, 0.0)
    self.assertEqual(prediction.probability, 0.00)

  def testLangsIdentification(self):
    """Mixed English/Bulgarian text returns "bg" first, then "en"."""
    identifier = gcld3.NNetLanguageIdentifier(
        min_num_bytes=0, max_num_bytes=1000)
    mixed_text = (
        "This piece of text is in English. Този текст е на "
        "Български.")
    predictions = identifier.FindTopNMostFreqLangs(text=mixed_text, num_langs=2)

    top = predictions[0]
    self.assertEqual(top.language, "bg")
    self.assertTrue(top.is_reliable)
    self.assertLess(top.proportion, 0.75)
    self.assertGreater(top.probability, 0.90)

    runner_up = predictions[1]
    self.assertEqual(runner_up.language, "en")
    self.assertTrue(runner_up.is_reliable)
    self.assertLess(runner_up.proportion, 0.75)
    self.assertGreater(runner_up.probability, 0.90)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
58
Telegram/ThirdParty/cld3/misc/myprotobuf.cmake
vendored
Normal file
58
Telegram/ThirdParty/cld3/misc/myprotobuf.cmake
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
# Variant of PROTOBUF_GENERATE_CPP that lets the caller choose the output folder:
|
||||
# From https://stackoverflow.com/users/1600278/akira-okumura
|
||||
|
||||
# MY_PROTOBUF_GENERATE_CPP(<PATH> <SRCS> <HDRS> <proto files...>)
#
#   PATH - output sub-directory, relative to CMAKE_CURRENT_BINARY_DIR, that
#          receives the generated sources.
#   SRCS - name of the variable that is set (in the caller's scope) to the list
#          of generated .pb.cc files.
#   HDRS - name of the variable that is set (in the caller's scope) to the list
#          of generated .pb.h files.
#   ARGN - the .proto files to compile.
function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS)
  if(NOT ARGN)
    message(SEND_ERROR "Error: MY_PROTOBUF_GENERATE_CPP() called without any proto files")
    return()
  endif()

  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
    # Create an include path for each file specified
    foreach(FIL ${ARGN})
      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  else()
    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

  # Extra import directories requested by the caller.
  if(DEFINED PROTOBUF_IMPORT_DIRS)
    foreach(DIR ${PROTOBUF_IMPORT_DIRS})
      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  endif()

  # The output directory is the same for every proto file, so it is created
  # once (at configure time) rather than once per file.
  set(_out_dir "${CMAKE_CURRENT_BINARY_DIR}/${PATH}")
  file(MAKE_DIRECTORY "${_out_dir}")

  set(${SRCS})
  set(${HDRS})
  foreach(FIL ${ARGN})
    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
    get_filename_component(FIL_WE ${FIL} NAME_WE)

    list(APPEND ${SRCS} "${_out_dir}/${FIL_WE}.pb.cc")
    list(APPEND ${HDRS} "${_out_dir}/${FIL_WE}.pb.h")

    # Build rule that regenerates the C++ sources whenever the .proto changes.
    add_custom_command(
      OUTPUT "${_out_dir}/${FIL_WE}.pb.cc"
             "${_out_dir}/${FIL_WE}.pb.h"
      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
      ARGS --cpp_out ${_out_dir} ${_protobuf_include_path} ${ABS_FIL}
      DEPENDS ${ABS_FIL}
      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
      VERBATIM )
  endforeach()

  # Mark the outputs as generated and hand both lists back to the caller.
  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()
|
||||
BIN
Telegram/ThirdParty/cld3/model.png
vendored
Normal file
BIN
Telegram/ThirdParty/cld3/model.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
3
Telegram/ThirdParty/cld3/requirements.txt
vendored
Normal file
3
Telegram/ThirdParty/cld3/requirements.txt
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
protobuf >=3.0.0
|
||||
pybind11 >=2.5.0
|
||||
wheel >= 0.34.2
|
||||
120
Telegram/ThirdParty/cld3/setup.py
vendored
Normal file
120
Telegram/ThirdParty/cld3/setup.py
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Setup utility for gcld3."""
|
||||
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import setuptools
|
||||
from setuptools.command import build_ext
|
||||
|
||||
__version__ = '3.0.13'
|
||||
_NAME = 'gcld3'
|
||||
|
||||
REQUIREMENTS = ['pybind11 >= 2.5.0', 'wheel >= 0.34.2']
|
||||
|
||||
PROTO_FILES = [
|
||||
'src/feature_extractor.proto',
|
||||
'src/sentence.proto',
|
||||
'src/task_spec.proto',
|
||||
]
|
||||
|
||||
SRCS = [
|
||||
'src/base.cc',
|
||||
'src/embedding_feature_extractor.cc',
|
||||
'src/embedding_network.cc',
|
||||
'src/feature_extractor.cc',
|
||||
'src/feature_types.cc',
|
||||
'src/fml_parser.cc',
|
||||
'src/lang_id_nn_params.cc',
|
||||
'src/language_identifier_features.cc',
|
||||
'src/language_identifier_main.cc',
|
||||
'src/nnet_language_identifier.cc',
|
||||
'src/registry.cc',
|
||||
'src/relevant_script_feature.cc',
|
||||
'src/sentence_features.cc',
|
||||
'src/task_context.cc',
|
||||
'src/task_context_params.cc',
|
||||
'src/unicodetext.cc',
|
||||
'src/utils.cc',
|
||||
'src/workspace.cc',
|
||||
'src/script_span/fixunicodevalue.cc',
|
||||
'src/script_span/generated_entities.cc',
|
||||
'src/script_span/generated_ulscript.cc',
|
||||
'src/script_span/getonescriptspan.cc',
|
||||
'src/script_span/offsetmap.cc',
|
||||
'src/script_span/text_processing.cc',
|
||||
'src/script_span/utf8statetable.cc',
|
||||
# These CC files have to be generated by the proto buffer compiler 'protoc'
|
||||
'src/cld_3/protos/feature_extractor.pb.cc',
|
||||
'src/cld_3/protos/sentence.pb.cc',
|
||||
'src/cld_3/protos/task_spec.pb.cc',
|
||||
# pybind11 bindings
|
||||
'gcld3/pybind_ext.cc',
|
||||
]
|
||||
|
||||
|
||||
class CompileProtos(build_ext.build_ext):
  """build_ext command that generates the protobuf C++ sources first."""

  def run(self):
    """Runs `protoc` over PROTO_FILES, then the regular extension build.

    Raises:
      RuntimeError: if no `protoc` executable can be found.
      subprocess.CalledProcessError: if `protoc` exits with a non-zero status.
    """
    protoc_path = shutil.which('protoc')
    if protoc_path is None:
      raise RuntimeError('Please install the proto buffer compiler.')

    # The C++ code expects the compiled protos under this directory, so make
    # sure it exists before invoking the compiler.
    compiled_protos_dir = 'src/cld_3/protos/'
    os.makedirs(compiled_protos_dir, exist_ok=True)

    command = [
        'protoc',
        f'--cpp_out={compiled_protos_dir}',
        '--proto_path=src',
        *PROTO_FILES,
    ]
    subprocess.run(command, check=True, cwd='./')

    build_ext.build_ext.run(self)
|
||||
|
||||
|
||||
class PyBindIncludes(object):
  """Lazy placeholder for the pybind11 include directory.

  The include path is only looked up when the object is converted to a string.
  Deferring "pybind11.get_include()" this way lets pybind11 be installed first
  and imported afterwards, once it is available in the environment.
  """

  def __str__(self):
    # Deliberately imported here rather than at module scope: pybind11 may not
    # be installed yet when this file is first loaded.
    import pybind11  # pylint: disable=g-import-not-at-top
    include_dir = pybind11.get_include()
    return include_dir
|
||||
|
||||
|
||||
MACOS = platform.system() == 'Darwin'

# Extra toolchain flags are only passed on macOS; every other platform builds
# with the compiler defaults.
if MACOS:
  _extra_compile_args = ['-std=c++11', '-stdlib=libc++']
  _extra_link_args = ['-stdlib=libc++']
else:
  _extra_compile_args = []
  _extra_link_args = []

# The single native extension: all cld3 sources plus the pybind11 bindings,
# linked against protobuf.
_pybind_extension = setuptools.Extension(
    'gcld3.pybind_ext',
    sorted(SRCS),
    include_dirs=[PyBindIncludes()],
    libraries=['protobuf'],
    extra_compile_args=_extra_compile_args,
    extra_link_args=_extra_link_args,
    language='c++')

ext_modules = [_pybind_extension]
|
||||
|
||||
DESCRIPTION = """CLD3 is a neural network model for language identification.
|
||||
This package contains the inference code and a trained model. See
|
||||
https://github.com/google/cld3 for more details.
|
||||
"""
|
||||
|
||||
setuptools.setup(
|
||||
author='Rami Al-Rfou',
|
||||
author_email='rmyeid@google.com',
|
||||
cmdclass={
|
||||
'build_ext': CompileProtos,
|
||||
},
|
||||
ext_modules=ext_modules,
|
||||
packages=setuptools.find_packages(),
|
||||
description='CLD3 is a neural network model for language identification.',
|
||||
long_description=DESCRIPTION,
|
||||
name=_NAME,
|
||||
setup_requires=REQUIREMENTS,
|
||||
url='https://github.com/google/cld3',
|
||||
version=__version__,
|
||||
zip_safe=False,
|
||||
)
|
||||
133
Telegram/ThirdParty/cld3/src/BUILD.gn
vendored
Normal file
133
Telegram/ThirdParty/cld3/src/BUILD.gn
vendored
Normal file
@@ -0,0 +1,133 @@
|
||||
# Copyright 2016 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#==============================================================================
|
||||
|
||||
import("//third_party/protobuf/proto_library.gni")
|
||||
|
||||
proto_library("protos") {
|
||||
sources = [
|
||||
"feature_extractor.proto",
|
||||
"sentence.proto",
|
||||
"task_spec.proto",
|
||||
]
|
||||
proto_out_dir = "cld_3/protos"
|
||||
}
|
||||
|
||||
static_library("cld_3") {
|
||||
sources = [
|
||||
"base.cc",
|
||||
"base.h",
|
||||
"casts.h",
|
||||
"embedding_feature_extractor.cc",
|
||||
"embedding_feature_extractor.h",
|
||||
"embedding_network.cc",
|
||||
"embedding_network.h",
|
||||
"embedding_network_params.h",
|
||||
"feature_extractor.cc",
|
||||
"feature_extractor.h",
|
||||
"feature_types.cc",
|
||||
"feature_types.h",
|
||||
"float16.h",
|
||||
"fml_parser.cc",
|
||||
"fml_parser.h",
|
||||
"language_identifier_features.cc",
|
||||
"language_identifier_features.h",
|
||||
"lang_id_nn_params.cc",
|
||||
"lang_id_nn_params.h",
|
||||
"nnet_language_identifier.cc",
|
||||
"nnet_language_identifier.h",
|
||||
"registry.cc",
|
||||
"registry.h",
|
||||
"relevant_script_feature.cc",
|
||||
"relevant_script_feature.h",
|
||||
"script_detector.h",
|
||||
"sentence_features.cc",
|
||||
"sentence_features.h",
|
||||
"simple_adder.h",
|
||||
"script_span/fixunicodevalue.cc",
|
||||
"script_span/fixunicodevalue.h",
|
||||
"script_span/generated_entities.cc",
|
||||
"script_span/generated_ulscript.cc",
|
||||
"script_span/generated_ulscript.h",
|
||||
"script_span/getonescriptspan.cc",
|
||||
"script_span/getonescriptspan.h",
|
||||
"script_span/integral_types.h",
|
||||
"script_span/offsetmap.cc",
|
||||
"script_span/offsetmap.h",
|
||||
"script_span/port.h",
|
||||
"script_span/stringpiece.h",
|
||||
"script_span/text_processing.cc",
|
||||
"script_span/text_processing.h",
|
||||
"script_span/utf8acceptinterchange.h",
|
||||
"script_span/utf8prop_lettermarkscriptnum.h",
|
||||
"script_span/utf8repl_lettermarklower.h",
|
||||
"script_span/utf8scannot_lettermarkspecial.h",
|
||||
"script_span/utf8statetable.cc",
|
||||
"script_span/utf8statetable.h",
|
||||
"task_context.cc",
|
||||
"task_context.h",
|
||||
"task_context_params.cc",
|
||||
"task_context_params.h",
|
||||
"unicodetext.cc",
|
||||
"unicodetext.h",
|
||||
"utils.cc",
|
||||
"utils.h",
|
||||
"workspace.cc",
|
||||
"workspace.h",
|
||||
]
|
||||
public_deps = [
|
||||
"//third_party/protobuf:protobuf_lite",
|
||||
":protos",
|
||||
]
|
||||
}
|
||||
|
||||
# The executables below are functional. Uncomment to use.
|
||||
|
||||
#executable("language_identifier_main") {
|
||||
# sources = [
|
||||
# "language_identifier_main.cc",
|
||||
# ]
|
||||
# deps = [
|
||||
# ":cld_3",
|
||||
# ]
|
||||
#}
|
||||
|
||||
#executable("getonescriptspan_test") {
|
||||
# sources = [
|
||||
# "script_span/getonescriptspan_test.cc",
|
||||
# ]
|
||||
# deps = [
|
||||
# ":cld_3",
|
||||
# ]
|
||||
#}
|
||||
|
||||
#executable("language_identifier_features_test") {
|
||||
# sources = [
|
||||
# "language_identifier_features_test.cc",
|
||||
# ]
|
||||
# deps = [
|
||||
# ":cld_3",
|
||||
# ]
|
||||
#}
|
||||
|
||||
#executable("nnet_lang_id_test") {
|
||||
# sources = [
|
||||
# "nnet_lang_id_test.cc",
|
||||
# "nnet_lang_id_test_data.cc",
|
||||
# "nnet_lang_id_test_data.h",
|
||||
# ]
|
||||
# deps = [
|
||||
# ":cld_3",
|
||||
# ]
|
||||
#}
|
||||
4
Telegram/ThirdParty/cld3/src/DEPS
vendored
Normal file
4
Telegram/ThirdParty/cld3/src/DEPS
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
include_rules = [
|
||||
'+cld_3',
|
||||
'+script_span',
|
||||
]
|
||||
36
Telegram/ThirdParty/cld3/src/base.cc
vendored
Normal file
36
Telegram/ThirdParty/cld3/src/base.cc
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "base.h"
|
||||
|
||||
#include <string>
|
||||
#if defined(COMPILER_MSVC) || defined(_WIN32)
|
||||
#include <sstream>
|
||||
#endif // defined(COMPILER_MSVC) || defined(_WIN32)
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// TODO(abakalov): Pick the most efficient approach.
|
||||
#if defined(COMPILER_MSVC) || defined(_WIN32)
|
||||
std::string Int64ToString(int64 input) {
|
||||
std::stringstream stream;
|
||||
stream << input;
|
||||
return stream.str();
|
||||
}
|
||||
#else
|
||||
std::string Int64ToString(int64 input) { return std::to_string(input); }
|
||||
#endif // defined(COMPILER_MSVC) || defined(_WIN32)
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
106
Telegram/ThirdParty/cld3/src/base.h
vendored
Normal file
106
Telegram/ThirdParty/cld3/src/base.h
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef BASE_H_
|
||||
#define BASE_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
using std::vector;
|
||||
using std::string;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
typedef unsigned int uint32;
|
||||
|
||||
// CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) removes the copy constructor and the
// copy assignment operator of TypeName. Place it in the class body.
//
// The C++11 "= delete" form is used whenever the compiler is known to support
// it: either LANG_CXX11 is set by the build, or the compiler itself reports
// C++11 (or newer) through __cplusplus / _MSC_VER.
#if LANG_CXX11 || (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (defined(_MSC_VER) && _MSC_VER >= 1800)
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &) = delete;          \
  TypeName &operator=(const TypeName &) = delete
#else  // C++98 case follows

// Note that these C++98 implementations cannot completely disallow copying,
// as members and friends can still accidentally make elided copies without
// triggering a linker error.
#define CLD3_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName &);                   \
  TypeName &operator=(const TypeName &)
#endif  // LANG_CXX11
|
||||
|
||||
#ifndef CLD3_IMMEDIATE_CRASH
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
#define CLD3_IMMEDIATE_CRASH() __builtin_trap()
|
||||
#else
|
||||
#define CLD3_IMMEDIATE_CRASH() ((void)(*(volatile char *)0 = 0))
|
||||
#endif
|
||||
#endif // CLD3_IMMEDIATE_CRASH
|
||||
|
||||
#define CLD3_CHECK(f) (!(f) ? CLD3_IMMEDIATE_CRASH() : (void)0)
|
||||
|
||||
#if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON)
|
||||
#define CLD3_DCHECK(f) ((void)0)
|
||||
#else
|
||||
#define CLD3_DCHECK(f) CLD3_CHECK(f)
|
||||
#endif
|
||||
|
||||
#ifndef SWIG
|
||||
typedef int int32;
|
||||
typedef unsigned char uint8; // NOLINT
|
||||
typedef unsigned short uint16; // NOLINT
|
||||
|
||||
// A type to represent a Unicode code-point value. As of Unicode 4.0,
|
||||
// such values require up to 21 bits.
|
||||
// (For type-checking on pointers, make this explicitly signed,
|
||||
// and it should always be the signed version of whatever int32 is.)
|
||||
typedef signed int char32;
|
||||
#endif // SWIG
|
||||
|
||||
#ifdef COMPILER_MSVC
|
||||
typedef __int64 int64;
|
||||
#else
|
||||
typedef long long int64; // NOLINT
|
||||
#endif // COMPILER_MSVC
|
||||
|
||||
#if defined(__GNUC__) && \
|
||||
(__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
|
||||
|
||||
// For functions we want to force inline.
|
||||
// Introduced in gcc 3.1.
|
||||
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
|
||||
|
||||
#elif defined(_MSC_VER)
|
||||
#define CLD3_ATTRIBUTE_ALWAYS_INLINE __forceinline
|
||||
#else
|
||||
|
||||
// Other compilers will have to figure it out for themselves.
|
||||
#define CLD3_ATTRIBUTE_ALWAYS_INLINE
|
||||
#endif
|
||||
|
||||
#ifdef INTERNAL_BUILD
|
||||
typedef basic_string<char> bstring;
|
||||
#else
|
||||
typedef std::basic_string<char> bstring;
|
||||
#endif // INTERNAL_BUILD
|
||||
|
||||
// Converts int64 to string.
|
||||
std::string Int64ToString(int64 input);
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // BASE_H_
|
||||
98
Telegram/ThirdParty/cld3/src/casts.h
vendored
Normal file
98
Telegram/ThirdParty/cld3/src/casts.h
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// This code is compiled directly on many platforms, including client
|
||||
// platforms like Windows, Mac, and embedded systems. Before making
|
||||
// any changes here, make sure that you're not breaking any platforms.
|
||||
//
|
||||
|
||||
#ifndef CASTS_H_
|
||||
#define CASTS_H_
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// lang_id_bit_cast<Dest,Source> is a template function that implements the
|
||||
// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
|
||||
// very low-level functions like the protobuf library and fast math
|
||||
// support.
|
||||
//
|
||||
// float f = 3.14159265358979;
|
||||
// int i = lang_id_bit_cast<int32>(f);
|
||||
// // i = 0x40490fdb
|
||||
//
|
||||
// The classical address-casting method is:
|
||||
//
|
||||
// // WRONG
|
||||
// float f = 3.14159265358979; // WRONG
|
||||
// int i = * reinterpret_cast<int*>(&f); // WRONG
|
||||
//
|
||||
// The address-casting method actually produces undefined behavior
|
||||
// according to ISO C++ specification section 3.10 -15 -. Roughly, this
|
||||
// section says: if an object in memory has one type, and a program
|
||||
// accesses it with a different type, then the result is undefined
|
||||
// behavior for most values of "different type".
|
||||
//
|
||||
// This is true for any cast syntax, either *(int*)&f or
|
||||
// *reinterpret_cast<int*>(&f). And it is particularly true for
|
||||
// conversions between integral lvalues and floating-point lvalues.
|
||||
//
|
||||
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
|
||||
// that expressions with different types refer to different memory. gcc
|
||||
// 4.0.1 has an optimizer that takes advantage of this. So a
|
||||
// non-conforming program quietly produces wildly incorrect output.
|
||||
//
|
||||
// The problem is not the use of reinterpret_cast. The problem is type
|
||||
// punning: holding an object in memory of one type and reading its bits
|
||||
// back using a different type.
|
||||
//
|
||||
// The C++ standard is more subtle and complex than this, but that
|
||||
// is the basic idea.
|
||||
//
|
||||
// Anyways ...
|
||||
//
|
||||
// lang_id_bit_cast<> calls memcpy() which is blessed by the standard,
|
||||
// especially by the example in section 3.9 . Also, of course,
|
||||
// lang_id_bit_cast<> wraps up the nasty logic in one place.
|
||||
//
|
||||
// Fortunately memcpy() is very fast. In optimized mode, with a
|
||||
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
|
||||
// code with the minimal amount of data movement. On a 32-bit system,
|
||||
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
|
||||
// compiles to two loads and two stores.
|
||||
//
|
||||
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
|
||||
//
|
||||
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
|
||||
// is likely to surprise you.
|
||||
//
|
||||
// Props to Bill Gibbons for the compile time assertion technique and
|
||||
// Art Komninos and Igor Tandetnik for the msvc experiments.
|
||||
//
|
||||
// -- mec 2005-10-17
|
||||
|
||||
template <class Dest, class Source>
inline Dest lang_id_bit_cast(const Source &source) {
  // Reinterpreting bits only makes sense between types of identical size.
  static_assert(sizeof(Source) == sizeof(Dest), "Sizes do not match");

  // Copy the object representation byte by byte instead of casting the
  // address of |source|.
  Dest result;
  memcpy(&result, &source, sizeof(Dest));
  return result;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // CASTS_H_
|
||||
51
Telegram/ThirdParty/cld3/src/embedding_feature_extractor.cc
vendored
Normal file
51
Telegram/ThirdParty/cld3/src/embedding_feature_extractor.cc
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "embedding_feature_extractor.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <vector>
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "feature_types.h"
|
||||
#include "task_context.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
GenericEmbeddingFeatureExtractor::GenericEmbeddingFeatureExtractor() {}
|
||||
|
||||
GenericEmbeddingFeatureExtractor::~GenericEmbeddingFeatureExtractor() {}
|
||||
|
||||
// Reads the embedding configuration from |context|: the FML feature specs
// ("<prefix>_features"), the embedding names and dimensions (both
// ';'-separated lists) and the "add_varlen_strings" flag.
void GenericEmbeddingFeatureExtractor::Setup(TaskContext *context) {
  // Don't use version to determine how to get feature FML.
  string features_param = ArgPrefix();
  features_param += "_features";
  const string features = context->Get(features_param, "");
  const string embedding_names =
      context->Get(GetParamName("embedding_names"), "");
  const string embedding_dims =
      context->Get(GetParamName("embedding_dims"), "");

  embedding_fml_ = utils::Split(features, ';');
  add_strings_ = context->Get(GetParamName("add_varlen_strings"), false);
  embedding_names_ = utils::Split(embedding_names, ';');

  // The other members above are overwritten; reset the dimensions as well so
  // that calling Setup() again does not append a second copy of them.
  embedding_dims_.clear();
  for (const string &dim : utils::Split(embedding_dims, ';')) {
    embedding_dims_.push_back(utils::ParseUsing<int>(dim, utils::ParseInt32));
  }
}
|
||||
|
||||
void GenericEmbeddingFeatureExtractor::Init(TaskContext *context) {}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
182
Telegram/ThirdParty/cld3/src/embedding_feature_extractor.h
vendored
Normal file
182
Telegram/ThirdParty/cld3/src/embedding_feature_extractor.h
vendored
Normal file
@@ -0,0 +1,182 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef EMBEDDING_FEATURE_EXTRACTOR_H_
|
||||
#define EMBEDDING_FEATURE_EXTRACTOR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "task_context.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// An EmbeddingFeatureExtractor manages the extraction of features for
|
||||
// embedding-based models. It wraps a sequence of underlying classes of feature
|
||||
// extractors, along with associated predicate maps. Each class of feature
|
||||
// extractors is associated with a name, e.g., "unigrams", "bigrams".
|
||||
//
|
||||
// The class is split between a generic abstract version,
|
||||
// GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
|
||||
// signature of the ExtractFeatures method) and a typed version.
|
||||
//
|
||||
// The predicate maps must be initialized before use: they can be loaded using
|
||||
// Read() or updated via UpdateMapsForExample.
|
||||
class GenericEmbeddingFeatureExtractor {
|
||||
public:
|
||||
GenericEmbeddingFeatureExtractor();
|
||||
virtual ~GenericEmbeddingFeatureExtractor();
|
||||
|
||||
// Get the prefix string to put in front of all arguments, so they don't
|
||||
// conflict with other embedding models.
|
||||
virtual const string ArgPrefix() const = 0;
|
||||
|
||||
// Sets up predicate maps and embedding space names that are common for all
|
||||
// embedding based feature extractors.
|
||||
virtual void Setup(TaskContext *context);
|
||||
virtual void Init(TaskContext *context);
|
||||
|
||||
// Requests workspace for the underlying feature extractors. This is
|
||||
// implemented in the typed class.
|
||||
virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
|
||||
|
||||
// Number of predicates for the embedding at a given index (vocabulary size.)
|
||||
int EmbeddingSize(int index) const {
|
||||
return generic_feature_extractor(index).GetDomainSize();
|
||||
}
|
||||
|
||||
// Returns number of embedding spaces.
|
||||
int NumEmbeddings() const { return embedding_dims_.size(); }
|
||||
|
||||
// Returns the number of features in the embedding space.
|
||||
int FeatureSize(int idx) const {
|
||||
return generic_feature_extractor(idx).feature_types();
|
||||
}
|
||||
|
||||
// Returns the dimensionality of the embedding space.
|
||||
int EmbeddingDims(int index) const { return embedding_dims_[index]; }
|
||||
|
||||
// Accessor for embedding dims (dimensions of the embedding spaces).
|
||||
const std::vector<int> &embedding_dims() const { return embedding_dims_; }
|
||||
|
||||
const std::vector<string> &embedding_fml() const { return embedding_fml_; }
|
||||
|
||||
// Get parameter name by concatenating the prefix and the original name.
|
||||
string GetParamName(const string ¶m_name) const {
|
||||
string name = ArgPrefix();
|
||||
name += "_";
|
||||
name += param_name;
|
||||
return name;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Provides the generic class with access to the templated extractors. This is
|
||||
// used to get the type information out of the feature extractor without
|
||||
// knowing the specific calling arguments of the extractor itself.
|
||||
virtual const GenericFeatureExtractor &generic_feature_extractor(
|
||||
int idx) const = 0;
|
||||
|
||||
private:
|
||||
// Embedding space names for parameter sharing.
|
||||
std::vector<string> embedding_names_;
|
||||
|
||||
// FML strings for each feature extractor.
|
||||
std::vector<string> embedding_fml_;
|
||||
|
||||
// Size of each of the embedding spaces (maximum predicate id).
|
||||
std::vector<int> embedding_sizes_;
|
||||
|
||||
// Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
|
||||
std::vector<int> embedding_dims_;
|
||||
|
||||
// Whether or not to add string descriptions to converted examples.
|
||||
bool add_strings_;
|
||||
};
|
||||
|
||||
// Templated, object-specific implementation of the
|
||||
// EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
|
||||
// ARGS...> class that has the appropriate FeatureTraits() to ensure that
|
||||
// locator type features work.
|
||||
//
|
||||
// Note: for backwards compatibility purposes, this always reads the FML spec
|
||||
// from "<prefix>_features".
|
||||
template <class EXTRACTOR, class OBJ, class... ARGS>
|
||||
class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
|
||||
public:
|
||||
// Sets up all predicate maps, feature extractors, and flags.
|
||||
void Setup(TaskContext *context) override {
|
||||
GenericEmbeddingFeatureExtractor::Setup(context);
|
||||
feature_extractors_.resize(embedding_fml().size());
|
||||
for (size_t i = 0; i < embedding_fml().size(); ++i) {
|
||||
feature_extractors_[i].Parse(embedding_fml()[i]);
|
||||
feature_extractors_[i].Setup(context);
|
||||
}
|
||||
}
|
||||
|
||||
// Initializes resources needed by the feature extractors.
|
||||
void Init(TaskContext *context) override {
|
||||
GenericEmbeddingFeatureExtractor::Init(context);
|
||||
for (auto &feature_extractor : feature_extractors_) {
|
||||
feature_extractor.Init(context);
|
||||
}
|
||||
}
|
||||
|
||||
// Requests workspaces from the registry. Must be called after Init(), and
|
||||
// before Preprocess().
|
||||
void RequestWorkspaces(WorkspaceRegistry *registry) override {
|
||||
for (auto &feature_extractor : feature_extractors_) {
|
||||
feature_extractor.RequestWorkspaces(registry);
|
||||
}
|
||||
}
|
||||
|
||||
// Must be called on the object one state for each sentence, before any
|
||||
// feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
|
||||
void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
|
||||
for (auto &feature_extractor : feature_extractors_) {
|
||||
feature_extractor.Preprocess(workspaces, obj);
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts features using the extractors. Note that features must already
|
||||
// be initialized to the correct number of feature extractors. No predicate
|
||||
// mapping is applied.
|
||||
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
|
||||
ARGS... args,
|
||||
std::vector<FeatureVector> *features) const {
|
||||
for (size_t i = 0; i < feature_extractors_.size(); ++i) {
|
||||
features->at(i).clear();
|
||||
feature_extractors_.at(i).ExtractFeatures(workspaces, obj, args...,
|
||||
&features->at(i));
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// Provides generic access to the feature extractors.
|
||||
const GenericFeatureExtractor &generic_feature_extractor(
|
||||
int idx) const override {
|
||||
return feature_extractors_.at(idx);
|
||||
}
|
||||
|
||||
private:
|
||||
// Templated feature extractor class.
|
||||
std::vector<EXTRACTOR> feature_extractors_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // EMBEDDING_FEATURE_EXTRACTOR_H_
|
||||
197
Telegram/ThirdParty/cld3/src/embedding_network.cc
vendored
Normal file
197
Telegram/ThirdParty/cld3/src/embedding_network.cc
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "embedding_network.h"
|
||||
|
||||
#include "base.h"
|
||||
#include "embedding_network_params.h"
|
||||
#include "float16.h"
|
||||
#include "simple_adder.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace {
|
||||
|
||||
using VectorWrapper = EmbeddingNetwork::VectorWrapper;
|
||||
|
||||
// Debug-checks that the given matrix is stored without quantization, i.e.,
// that its quant_type is QuantizationType::NONE.
void CheckNoQuantization(const EmbeddingNetworkParams::Matrix matrix) {
  CLD3_DCHECK(static_cast<int>(matrix.quant_type) ==
              static_cast<int>(QuantizationType::NONE));
}
|
||||
|
||||
// Fills a Matrix object with the parameters in the given MatrixParams. This
|
||||
// function is used to initialize weight matrices that are *not* embedding
|
||||
// matrices.
|
||||
void FillMatrixParams(const EmbeddingNetworkParams::Matrix source_matrix,
|
||||
EmbeddingNetwork::Matrix *mat) {
|
||||
mat->resize(source_matrix.rows);
|
||||
CheckNoQuantization(source_matrix);
|
||||
const float *weights =
|
||||
reinterpret_cast<const float *>(source_matrix.elements);
|
||||
for (int r = 0; r < source_matrix.rows; ++r) {
|
||||
(*mat)[r] = EmbeddingNetwork::VectorWrapper(weights, source_matrix.cols);
|
||||
weights += source_matrix.cols;
|
||||
}
|
||||
}
|
||||
|
||||
// Computes y = weights * Relu(x) + b where Relu is optionally applied.
|
||||
template <typename ScaleAdderClass>
|
||||
void SparseReluProductPlusBias(bool apply_relu,
|
||||
const EmbeddingNetwork::Matrix &weights,
|
||||
const EmbeddingNetwork::VectorWrapper &b,
|
||||
const EmbeddingNetwork::Vector &x,
|
||||
EmbeddingNetwork::Vector *y) {
|
||||
y->assign(b.data(), b.data() + b.size());
|
||||
ScaleAdderClass adder(y->data(), y->size());
|
||||
|
||||
const int x_size = x.size();
|
||||
for (int i = 0; i < x_size; ++i) {
|
||||
const float &scale = x[i];
|
||||
if (apply_relu) {
|
||||
if (scale > 0) {
|
||||
adder.LazyScaleAdd(weights[i].data(), scale);
|
||||
}
|
||||
} else {
|
||||
adder.LazyScaleAdd(weights[i].data(), scale);
|
||||
}
|
||||
}
|
||||
adder.Finalize();
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void EmbeddingNetwork::ConcatEmbeddings(
|
||||
const std::vector<FeatureVector> &feature_vectors, Vector *concat) const {
|
||||
concat->resize(model_->concat_layer_size());
|
||||
|
||||
// "es_index" stands for "embedding space index".
|
||||
for (size_t es_index = 0; es_index < feature_vectors.size(); ++es_index) {
|
||||
const int concat_offset = model_->concat_offset(es_index);
|
||||
const int embedding_dim = model_->embedding_dim(es_index);
|
||||
|
||||
const EmbeddingMatrix &embedding_matrix = embedding_matrices_[es_index];
|
||||
CLD3_DCHECK(embedding_matrix.dim() == embedding_dim);
|
||||
|
||||
const bool is_quantized =
|
||||
embedding_matrix.quant_type() != QuantizationType::NONE;
|
||||
|
||||
const FeatureVector &feature_vector = feature_vectors[es_index];
|
||||
const int num_features = feature_vector.size();
|
||||
for (int fi = 0; fi < num_features; ++fi) {
|
||||
const FeatureType *feature_type = feature_vector.type(fi);
|
||||
int feature_offset = concat_offset + feature_type->base() * embedding_dim;
|
||||
CLD3_DCHECK(feature_offset + embedding_dim <=
|
||||
static_cast<int>(concat->size()));
|
||||
|
||||
// Weighted embeddings will be added starting from this address.
|
||||
float *concat_ptr = concat->data() + feature_offset;
|
||||
|
||||
// Pointer to float / uint8 weights for relevant embedding.
|
||||
const void *embedding_data;
|
||||
|
||||
// Multiplier for each embedding weight.
|
||||
float multiplier;
|
||||
const FeatureValue feature_value = feature_vector.value(fi);
|
||||
if (feature_type->is_continuous()) {
|
||||
// Continuous features (encoded as FloatFeatureValue).
|
||||
FloatFeatureValue float_feature_value(feature_value);
|
||||
const int id = float_feature_value.value.id;
|
||||
embedding_matrix.get_embedding(id, &embedding_data, &multiplier);
|
||||
multiplier *= float_feature_value.value.weight;
|
||||
} else {
|
||||
// Discrete features: every present feature has implicit value 1.0.
|
||||
embedding_matrix.get_embedding(feature_value, &embedding_data,
|
||||
&multiplier);
|
||||
}
|
||||
|
||||
if (is_quantized) {
|
||||
const uint8 *quant_weights =
|
||||
reinterpret_cast<const uint8 *>(embedding_data);
|
||||
for (int i = 0; i < embedding_dim; ++i, ++quant_weights, ++concat_ptr) {
|
||||
// 128 is bias for UINT8 quantization, only one we currently support.
|
||||
*concat_ptr += (static_cast<int>(*quant_weights) - 128) * multiplier;
|
||||
}
|
||||
} else {
|
||||
const float *weights = reinterpret_cast<const float *>(embedding_data);
|
||||
for (int i = 0; i < embedding_dim; ++i, ++weights, ++concat_ptr) {
|
||||
*concat_ptr += *weights * multiplier;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ScaleAdderClass>
|
||||
void EmbeddingNetwork::FinishComputeFinalScores(const Vector &concat,
|
||||
Vector *scores) const {
|
||||
Vector h0(hidden_bias_[0].size());
|
||||
SparseReluProductPlusBias<ScaleAdderClass>(false, hidden_weights_[0],
|
||||
hidden_bias_[0], concat, &h0);
|
||||
|
||||
CLD3_DCHECK((hidden_weights_.size() == 1) || (hidden_weights_.size() == 2));
|
||||
if (hidden_weights_.size() == 1) { // 1 hidden layer
|
||||
SparseReluProductPlusBias<ScaleAdderClass>(true, softmax_weights_,
|
||||
softmax_bias_, h0, scores);
|
||||
} else if (hidden_weights_.size() == 2) { // 2 hidden layers
|
||||
Vector h1(hidden_bias_[1].size());
|
||||
SparseReluProductPlusBias<ScaleAdderClass>(true, hidden_weights_[1],
|
||||
hidden_bias_[1], h0, &h1);
|
||||
SparseReluProductPlusBias<ScaleAdderClass>(true, softmax_weights_,
|
||||
softmax_bias_, h1, scores);
|
||||
}
|
||||
}
|
||||
|
||||
void EmbeddingNetwork::ComputeFinalScores(
|
||||
const std::vector<FeatureVector> &features, Vector *scores) const {
|
||||
Vector concat;
|
||||
ConcatEmbeddings(features, &concat);
|
||||
|
||||
scores->resize(softmax_bias_.size());
|
||||
FinishComputeFinalScores<SimpleAdder>(concat, scores);
|
||||
}
|
||||
|
||||
// Wraps the parameters exposed by *model: one EmbeddingMatrix per embedding
// space, one weight matrix + bias per hidden layer, and the softmax weights +
// bias.  Nothing is copied; all wrappers point into memory owned by the
// model.
EmbeddingNetwork::EmbeddingNetwork(const EmbeddingNetworkParams *model)
    : model_(model) {
  // Embedding matrices.  Along the way, debug-check that each concat offset
  // equals the summed width (dim * num_features) of the preceding spaces.
  int expected_offset = 0;
  const int num_embedding_spaces = model_->embedding_dim_size();
  for (int es = 0; es < num_embedding_spaces; ++es) {
    CLD3_DCHECK(expected_offset == model_->concat_offset(es));
    expected_offset +=
        model_->embedding_dim(es) * model_->embedding_num_features(es);
    (void)expected_offset;  // Avoid compiler warning for "unused" variable.
    embedding_matrices_.emplace_back(model_->GetEmbeddingMatrix(es));
  }

  // Hidden layers: weights and biases come in pairs.
  const int num_hidden_layers = model_->hidden_size();
  CLD3_DCHECK(num_hidden_layers == model_->hidden_bias_size());
  hidden_weights_.resize(num_hidden_layers);
  hidden_bias_.resize(num_hidden_layers);
  for (int layer = 0; layer < num_hidden_layers; ++layer) {
    FillMatrixParams(model_->GetHiddenLayerMatrix(layer),
                     &hidden_weights_[layer]);

    // The bias is expected to be a column vector of unquantized floats.
    EmbeddingNetworkParams::Matrix bias = model_->GetHiddenLayerBias(layer);
    CLD3_DCHECK(1 == bias.cols);
    CheckNoQuantization(bias);
    const float *bias_values = reinterpret_cast<const float *>(bias.elements);
    hidden_bias_[layer] = VectorWrapper(bias_values, bias.rows);
  }

  // Softmax layer (expected to be present).
  CLD3_DCHECK(model_->HasSoftmax());
  FillMatrixParams(model_->GetSoftmaxMatrix(), &softmax_weights_);

  EmbeddingNetworkParams::Matrix softmax_bias = model_->GetSoftmaxBias();
  CLD3_DCHECK(1 == softmax_bias.cols);
  CheckNoQuantization(softmax_bias);
  const float *softmax_bias_values =
      reinterpret_cast<const float *>(softmax_bias.elements);
  softmax_bias_ = VectorWrapper(softmax_bias_values, softmax_bias.rows);
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
186
Telegram/ThirdParty/cld3/src/embedding_network.h
vendored
Normal file
186
Telegram/ThirdParty/cld3/src/embedding_network.h
vendored
Normal file
@@ -0,0 +1,186 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef EMBEDDING_NETWORK_H_
|
||||
#define EMBEDDING_NETWORK_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "embedding_network_params.h"
|
||||
#include "feature_extractor.h"
|
||||
#include "float16.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Classifier using a hand-coded feed-forward neural network.
|
||||
//
|
||||
// No gradient computation, just inference.
|
||||
//
|
||||
// Based on the more general nlp_saft::EmbeddingNetwork.
|
||||
//
|
||||
// Classification works as follows:
|
||||
//
|
||||
// Discrete features -> Embeddings -> Concatenation -> Hidden+ -> Softmax
|
||||
//
|
||||
// In words: given some discrete features, this class extracts the embeddings
|
||||
// for these features, concatenates them, passes them through one or two hidden
|
||||
// layers (each layer uses Relu) and next through a softmax layer that computes
|
||||
// an unnormalized score for each possible class. Note: there is always a
|
||||
// softmax layer.
|
||||
//
|
||||
// NOTE(salcianu): current code can easily be changed to allow more than two
|
||||
// hidden layers. Feel free to do so if you have a genuine need for that.
|
||||
class EmbeddingNetwork {
|
||||
public:
|
||||
// Class used to represent an embedding matrix. Each row is the embedding on
|
||||
// a vocabulary element. Number of columns = number of embedding dimensions.
|
||||
class EmbeddingMatrix {
|
||||
public:
|
||||
explicit EmbeddingMatrix(const EmbeddingNetworkParams::Matrix source_matrix)
|
||||
: rows_(source_matrix.rows),
|
||||
cols_(source_matrix.cols),
|
||||
quant_type_(source_matrix.quant_type),
|
||||
data_(source_matrix.elements),
|
||||
row_size_in_bytes_(GetRowSizeInBytes(cols_, quant_type_)),
|
||||
quant_scales_(source_matrix.quant_scales) {}
|
||||
|
||||
// Returns vocabulary size; one embedding for each vocabulary element.
|
||||
int size() const { return rows_; }
|
||||
|
||||
// Returns number of weights in embedding of each vocabulary element.
|
||||
int dim() const { return cols_; }
|
||||
|
||||
// Returns quantization type for this embedding matrix.
|
||||
QuantizationType quant_type() const { return quant_type_; }
|
||||
|
||||
// Gets embedding for k-th vocabulary element: on return, sets *data to
|
||||
// point to the embedding weights and *scale to the quantization scale (1.0
|
||||
// if no quantization).
|
||||
void get_embedding(int k, const void **data, float *scale) const {
|
||||
CLD3_CHECK(k >= 0);
|
||||
CLD3_CHECK(k < size());
|
||||
*data = reinterpret_cast<const char *>(data_) + k * row_size_in_bytes_;
|
||||
if (quant_type_ == QuantizationType::NONE) {
|
||||
*scale = 1.0;
|
||||
} else {
|
||||
*scale = Float16To32(quant_scales_[k]);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static int GetRowSizeInBytes(int cols, QuantizationType quant_type) {
|
||||
CLD3_DCHECK((quant_type == QuantizationType::NONE) ||
|
||||
(quant_type == QuantizationType::UINT8));
|
||||
if (quant_type == QuantizationType::NONE) {
|
||||
return cols * sizeof(float);
|
||||
} else { // QuantizationType::UINT8
|
||||
return cols * sizeof(uint8);
|
||||
}
|
||||
}
|
||||
|
||||
// Vocabulary size.
|
||||
int rows_;
|
||||
|
||||
// Number of elements in each embedding.
|
||||
int cols_;
|
||||
|
||||
QuantizationType quant_type_;
|
||||
|
||||
// Pointer to the embedding weights, in row-major order. This is a pointer
|
||||
// to an array of floats / uint8, depending on the quantization type.
|
||||
// Not owned.
|
||||
const void *data_;
|
||||
|
||||
// Number of bytes for one row. Used to jump to next row in data_.
|
||||
int row_size_in_bytes_;
|
||||
|
||||
// Pointer to quantization scales. nullptr if no quantization. Otherwise,
|
||||
// quant_scales_[i] is scale for embedding of i-th vocabulary element.
|
||||
const float16 *quant_scales_;
|
||||
};
|
||||
|
||||
// An immutable vector that doesn't own the memory that stores the underlying
|
||||
// floats. Can be used e.g., as a wrapper around model weights stored in the
|
||||
// static memory.
|
||||
class VectorWrapper {
|
||||
public:
|
||||
VectorWrapper() : VectorWrapper(nullptr, 0) {}
|
||||
|
||||
// Constructs a vector wrapper around the size consecutive floats that start
|
||||
// at address data. Note: the underlying data should be alive for at least
|
||||
// the lifetime of this VectorWrapper object. That's trivially true if data
|
||||
// points to statically allocated data :)
|
||||
VectorWrapper(const float *data, int size) : data_(data), size_(size) {}
|
||||
|
||||
int size() const { return size_; }
|
||||
|
||||
const float *data() const { return data_; }
|
||||
|
||||
private:
|
||||
const float *data_; // Not owned.
|
||||
int size_;
|
||||
|
||||
// Doesn't own anything, so it can be copied and assigned at will :)
|
||||
};
|
||||
|
||||
typedef std::vector<VectorWrapper> Matrix;
|
||||
typedef std::vector<float> Vector;
|
||||
|
||||
// Constructs an embedding network using the parameters from model.
|
||||
//
|
||||
// Note: model should stay alive for at least the lifetime of this
|
||||
// EmbeddingNetwork object. TODO(salcianu): remove this constraint: we should
|
||||
// copy all necessary data (except, of course, the static weights) at
|
||||
// construction time and use that, instead of relying on model.
|
||||
explicit EmbeddingNetwork(const EmbeddingNetworkParams *model);
|
||||
|
||||
virtual ~EmbeddingNetwork() {}
|
||||
|
||||
// Runs forward computation to fill scores with unnormalized output unit
|
||||
// scores. This is useful for making predictions.
|
||||
void ComputeFinalScores(const std::vector<FeatureVector> &features,
|
||||
Vector *scores) const;
|
||||
|
||||
private:
|
||||
// Computes the softmax scores (prior to normalization) from the concatenated
|
||||
// representation.
|
||||
template <typename ScaleAdderClass>
|
||||
void FinishComputeFinalScores(const Vector &concat, Vector *scores) const;
|
||||
|
||||
// Constructs the concatenated input embedding vector in place in output
|
||||
// vector concat.
|
||||
void ConcatEmbeddings(const std::vector<FeatureVector> &features,
|
||||
Vector *concat) const;
|
||||
|
||||
// Pointer to the model object passed to the constructor. Not owned.
|
||||
const EmbeddingNetworkParams *model_;
|
||||
|
||||
// Network parameters.
|
||||
|
||||
// One weight matrix for each embedding.
|
||||
std::vector<EmbeddingMatrix> embedding_matrices_;
|
||||
|
||||
// One weight matrix and one vector of bias weights for each hiden layer.
|
||||
std::vector<Matrix> hidden_weights_;
|
||||
std::vector<VectorWrapper> hidden_bias_;
|
||||
|
||||
// Weight matrix and bias vector for the softmax layer.
|
||||
Matrix softmax_weights_;
|
||||
VectorWrapper softmax_bias_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // EMBEDDING_NETWORK_H_
|
||||
285
Telegram/ThirdParty/cld3/src/embedding_network_params.h
vendored
Normal file
285
Telegram/ThirdParty/cld3/src/embedding_network_params.h
vendored
Normal file
@@ -0,0 +1,285 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef EMBEDDING_NETWORK_PARAMS_H_
|
||||
#define EMBEDDING_NETWORK_PARAMS_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "float16.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// How the elements of a parameter matrix are stored: NONE means not
// quantized, UINT8 means quantized to 8-bit unsigned values.
enum class QuantizationType { NONE = 0, UINT8 };
|
||||
|
||||
// API for accessing parameters from a statically-linked EmbeddingNetworkProto.
|
||||
class EmbeddingNetworkParams {
|
||||
public:
|
||||
virtual ~EmbeddingNetworkParams() {}
|
||||
|
||||
// **** High-level API.
|
||||
|
||||
// Simple representation of a matrix. This small struct that doesn't own any
|
||||
// resource intentionally supports copy / assign, to simplify our APIs.
|
||||
struct Matrix {
|
||||
// Number of rows.
|
||||
int rows;
|
||||
|
||||
// Number of columns.
|
||||
int cols;
|
||||
|
||||
QuantizationType quant_type;
|
||||
|
||||
// Pointer to matrix elements, in row-major order
|
||||
// (https://en.wikipedia.org/wiki/Row-major_order) Not owned.
|
||||
const void *elements;
|
||||
|
||||
// Quantization scales: one scale for each row.
|
||||
const float16 *quant_scales;
|
||||
};
|
||||
|
||||
// Returns i-th embedding matrix. Crashes on out of bounds indices.
|
||||
//
|
||||
// This is the transpose of the corresponding matrix from the original proto.
|
||||
Matrix GetEmbeddingMatrix(int i) const {
|
||||
CheckMatrixRange(i, embeddings_size(), "embedding matrix");
|
||||
Matrix matrix;
|
||||
matrix.rows = embeddings_num_rows(i);
|
||||
matrix.cols = embeddings_num_cols(i);
|
||||
matrix.elements = embeddings_weights(i);
|
||||
matrix.quant_type = embeddings_quant_type(i);
|
||||
matrix.quant_scales = embeddings_quant_scales(i);
|
||||
return matrix;
|
||||
}
|
||||
|
||||
// Returns weight matrix for i-th hidden layer. Crashes on out of bounds
|
||||
// indices.
|
||||
//
|
||||
// This is the transpose of the corresponding matrix from the original proto.
|
||||
Matrix GetHiddenLayerMatrix(int i) const {
|
||||
CheckMatrixRange(i, hidden_size(), "hidden layer");
|
||||
Matrix matrix;
|
||||
matrix.rows = hidden_num_rows(i);
|
||||
matrix.cols = hidden_num_cols(i);
|
||||
|
||||
// Quantization not supported here.
|
||||
matrix.quant_type = QuantizationType::NONE;
|
||||
matrix.elements = hidden_weights(i);
|
||||
return matrix;
|
||||
}
|
||||
|
||||
// Returns bias for i-th hidden layer. Technically a Matrix, but we expect it
|
||||
// to be a row/column vector (i.e., num rows or num cols is 1). However, we
|
||||
// don't CHECK for that: we just provide access to underlying data. Crashes
|
||||
// on out of bounds indices.
|
||||
Matrix GetHiddenLayerBias(int i) const {
|
||||
CheckMatrixRange(i, hidden_bias_size(), "hidden layer bias");
|
||||
Matrix matrix;
|
||||
matrix.rows = hidden_bias_num_rows(i);
|
||||
matrix.cols = hidden_bias_num_cols(i);
|
||||
|
||||
// Quantization not supported here.
|
||||
matrix.quant_type = QuantizationType::NONE;
|
||||
matrix.elements = hidden_bias_weights(i);
|
||||
return matrix;
|
||||
}
|
||||
|
||||
// Returns true if a softmax layer exists.
|
||||
bool HasSoftmax() const { return softmax_size() == 1; }
|
||||
|
||||
// Returns weight matrix for the softmax layer. Note: should be called only
|
||||
// if HasSoftmax() is true.
|
||||
//
|
||||
// This is the transpose of the corresponding matrix from the original proto.
|
||||
Matrix GetSoftmaxMatrix() const {
|
||||
CLD3_DCHECK(HasSoftmax());
|
||||
Matrix matrix;
|
||||
matrix.rows = softmax_num_rows(0);
|
||||
matrix.cols = softmax_num_cols(0);
|
||||
|
||||
// Quantization not supported here.
|
||||
matrix.quant_type = QuantizationType::NONE;
|
||||
matrix.elements = softmax_weights(0);
|
||||
return matrix;
|
||||
}
|
||||
|
||||
// Returns bias for the softmax layer. Technically a Matrix, but we expect it
|
||||
// to be a row/column vector (i.e., num rows or num cols is 1). However, we
|
||||
// don't CHECK for that: we just provide access to underlying data.
|
||||
Matrix GetSoftmaxBias() const {
|
||||
CLD3_DCHECK(HasSoftmax());
|
||||
Matrix matrix;
|
||||
matrix.rows = softmax_bias_num_rows(0);
|
||||
matrix.cols = softmax_bias_num_cols(0);
|
||||
|
||||
// Quantization not supported here.
|
||||
matrix.quant_type = QuantizationType::NONE;
|
||||
matrix.elements = softmax_bias_weights(0);
|
||||
return matrix;
|
||||
}
|
||||
|
||||
// **** Low-level API.
|
||||
//
|
||||
// * Most low-level API methods are documented by giving an equivalent
|
||||
// function call on proto, the original proto (of type
|
||||
// EmbeddingNetworkProto) which was used to generate the C++ code.
|
||||
//
|
||||
// * To simplify our generation code, optional proto fields of message type
|
||||
// are treated as repeated fields with 0 or 1 instances. As such, we have
|
||||
// *_size() methods for such optional fields: they return 0 or 1.
|
||||
//
|
||||
// * "transpose(M)" denotes the transpose of a matrix M.
|
||||
|
||||
// ** Access methods for repeated MatrixParams embeddings.
|
||||
//
|
||||
// Returns proto.embeddings_size().
|
||||
virtual int embeddings_size() const = 0;
|
||||
|
||||
// Returns number of rows of transpose(proto.embeddings(i)).
|
||||
virtual int embeddings_num_rows(int i) const = 0;
|
||||
|
||||
// Returns number of columns of transpose(proto.embeddings(i)).
|
||||
virtual int embeddings_num_cols(int i) const = 0;
|
||||
|
||||
// Returns pointer to elements of transpose(proto.embeddings(i)), in row-major
|
||||
// order.
|
||||
virtual const void *embeddings_weights(int i) const = 0;
|
||||
|
||||
virtual QuantizationType embeddings_quant_type(int i) const {
|
||||
return QuantizationType::NONE;
|
||||
}
|
||||
|
||||
virtual const float16 *embeddings_quant_scales(int i) const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// ** Access methods for repeated MatrixParams hidden.
|
||||
//
|
||||
// Returns embedding_network_proto.hidden_size().
|
||||
virtual int hidden_size() const = 0;
|
||||
|
||||
// Returns embedding_network_proto.hidden(i).rows().
|
||||
virtual int hidden_num_rows(int i) const = 0;
|
||||
|
||||
// Returns embedding_network_proto.hidden(i).rows().
|
||||
virtual int hidden_num_cols(int i) const = 0;
|
||||
|
||||
// Returns pointer to beginning of array of floats with all values from
|
||||
// embedding_network_proto.hidden(i).
|
||||
virtual const void *hidden_weights(int i) const = 0;
|
||||
|
||||
// ** Access methods for repeated MatrixParams hidden_bias.
|
||||
//
|
||||
// Returns proto.hidden_bias_size().
|
||||
virtual int hidden_bias_size() const = 0;
|
||||
|
||||
// Returns number of rows of proto.hidden_bias(i).
|
||||
virtual int hidden_bias_num_rows(int i) const = 0;
|
||||
|
||||
// Returns number of columns of proto.hidden_bias(i).
|
||||
virtual int hidden_bias_num_cols(int i) const = 0;
|
||||
|
||||
// Returns pointer to elements of proto.hidden_bias(i), in row-major order.
|
||||
virtual const void *hidden_bias_weights(int i) const = 0;
|
||||
|
||||
// ** Access methods for optional MatrixParams softmax.
|
||||
//
|
||||
// Returns 1 if proto has optional field softmax, 0 otherwise.
|
||||
virtual int softmax_size() const = 0;
|
||||
|
||||
// Returns number of rows of transpose(proto.softmax()).
|
||||
virtual int softmax_num_rows(int i) const = 0;
|
||||
|
||||
// Returns number of columns of transpose(proto.softmax()).
|
||||
virtual int softmax_num_cols(int i) const = 0;
|
||||
|
||||
// Returns pointer to elements of transpose(proto.softmax()), in row-major
|
||||
// order.
|
||||
virtual const void *softmax_weights(int i) const = 0;
|
||||
|
||||
// ** Access methods for optional MatrixParams softmax_bias.
|
||||
//
|
||||
// Returns 1 if proto has optional field softmax_bias, 0 otherwise.
|
||||
virtual int softmax_bias_size() const = 0;
|
||||
|
||||
// Returns number of rows of proto.softmax_bias().
|
||||
virtual int softmax_bias_num_rows(int i) const = 0;
|
||||
|
||||
// Returns number of columns of proto.softmax_bias().
|
||||
virtual int softmax_bias_num_cols(int i) const = 0;
|
||||
|
||||
// Returns pointer to elements of proto.softmax_bias(), in row-major order.
|
||||
virtual const void *softmax_bias_weights(int i) const = 0;
|
||||
|
||||
// ** Access methods for repeated int32 embedding_dim.
|
||||
//
|
||||
// Returns proto.embedding_dim_size().
|
||||
virtual int embedding_dim_size() const = 0;
|
||||
|
||||
// Returns proto.embedding_dim(i).
|
||||
virtual int embedding_dim(int i) const = 0;
|
||||
|
||||
// ** Access methods for repeated int32 embedding_num_features.
|
||||
//
|
||||
// Returns proto.embedding_num_features_size().
|
||||
virtual int embedding_num_features_size() const = 0;
|
||||
|
||||
// Returns proto.embedding_num_features(i).
|
||||
virtual int embedding_num_features(int i) const = 0;
|
||||
|
||||
// ** Access methods for repeated int32 embedding_features_domain_size.
|
||||
//
|
||||
// Returns proto.embedding_features_domain_size_size().
|
||||
virtual int embedding_features_domain_size_size() const = 0;
|
||||
|
||||
// Returns proto.embedding_features_domain_size(i).
|
||||
virtual int embedding_features_domain_size(int i) const = 0;
|
||||
|
||||
// ** Access methods for repeated int32 concat_offset.
|
||||
//
|
||||
// Returns proto.concat_offset_size().
|
||||
virtual int concat_offset(int i) const = 0;
|
||||
|
||||
// Returns proto.concat_offset(i).
|
||||
virtual int concat_offset_size() const = 0;
|
||||
|
||||
// ** Access methods for concat_layer_size.
|
||||
//
|
||||
// Returns proto.has_concat_layer_size().
|
||||
virtual bool has_concat_layer_size() const = 0;
|
||||
|
||||
// Returns proto.concat_layer_size().
|
||||
virtual int concat_layer_size() const = 0;
|
||||
|
||||
// ** Access methods for is_precomputed
|
||||
//
|
||||
// Returns proto.has_is_precomputed().
|
||||
virtual bool has_is_precomputed() const = 0;
|
||||
|
||||
// Returns proto.is_precomputed().
|
||||
virtual bool is_precomputed() const = 0;
|
||||
|
||||
private:
|
||||
void CheckMatrixRange(int index, int num_matrices,
|
||||
const string &description) const {
|
||||
CLD3_DCHECK(index >= 0);
|
||||
CLD3_DCHECK(index < num_matrices);
|
||||
}
|
||||
}; // class EmbeddingNetworkParams
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // EMBEDDING_NETWORK_PARAMS_H_
|
||||
137
Telegram/ThirdParty/cld3/src/feature_extractor.cc
vendored
Normal file
137
Telegram/ThirdParty/cld3/src/feature_extractor.cc
vendored
Normal file
@@ -0,0 +1,137 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "feature_extractor.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "feature_types.h"
|
||||
#include "fml_parser.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
constexpr FeatureValue GenericFeatureFunction::kNone;
|
||||
|
||||
FeatureVector::FeatureVector() {}
|
||||
|
||||
FeatureVector::~FeatureVector() {}
|
||||
|
||||
GenericFeatureExtractor::GenericFeatureExtractor() {}
|
||||
|
||||
GenericFeatureExtractor::~GenericFeatureExtractor() {}
|
||||
|
||||
GenericFeatureExtractor::GenericFeatureExtractor(
|
||||
const GenericFeatureExtractor &extractor)
|
||||
: descriptor_(extractor.descriptor_),
|
||||
feature_types_(extractor.feature_types_) {}
|
||||
|
||||
void GenericFeatureExtractor::Parse(const string &source) {
|
||||
// Parse feature specification into descriptor.
|
||||
FMLParser parser;
|
||||
parser.Parse(source, mutable_descriptor());
|
||||
|
||||
// Initialize feature extractor from descriptor.
|
||||
InitializeFeatureFunctions();
|
||||
}
|
||||
|
||||
void GenericFeatureExtractor::InitializeFeatureTypes() {
|
||||
// Register all feature types.
|
||||
GetFeatureTypes(&feature_types_);
|
||||
for (size_t i = 0; i < feature_types_.size(); ++i) {
|
||||
FeatureType *ft = feature_types_[i];
|
||||
ft->set_base(i);
|
||||
|
||||
// Check for feature space overflow.
|
||||
CLD3_DCHECK(ft->GetDomainSize() >= 0);
|
||||
}
|
||||
|
||||
std::vector<string> types_names;
|
||||
GetFeatureTypeNames(&types_names);
|
||||
CLD3_DCHECK(feature_types_.size() == types_names.size());
|
||||
}
|
||||
|
||||
void GenericFeatureExtractor::GetFeatureTypeNames(
|
||||
std::vector<string> *type_names) const {
|
||||
for (size_t i = 0; i < feature_types_.size(); ++i) {
|
||||
FeatureType *ft = feature_types_[i];
|
||||
type_names->push_back(ft->name());
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the largest domain size reported by any registered feature type,
// or 0 when no feature type is registered (or none reports a positive size).
FeatureValue GenericFeatureExtractor::GetDomainSize() const {
  FeatureValue largest = 0;
  for (FeatureType *feature_type : feature_types_) {
    const FeatureValue candidate = feature_type->GetDomainSize();
    largest = (candidate > largest) ? candidate : largest;
  }
  return largest;
}
|
||||
|
||||
// Looks up `name` among the parameters of this function's descriptor and
// returns the value of the first match. Returns an empty string when no
// parameter has that name.
string GenericFeatureFunction::GetParameter(const string &name) const {
  for (const auto &candidate : descriptor_->parameter()) {
    if (candidate.name() == name) return candidate.value();
  }
  return "";
}
|
||||
|
||||
// All members are set up by their in-class default initializers.
GenericFeatureFunction::GenericFeatureFunction() = default;
|
||||
|
||||
// Releases the feature type registered through set_feature_type(), which this
// object owns. Deleting a null pointer is a no-op, so functions without a
// single feature type are handled as well.
GenericFeatureFunction::~GenericFeatureFunction() { delete feature_type_; }
|
||||
|
||||
int GenericFeatureFunction::GetIntParameter(const string &name,
|
||||
int default_value) const {
|
||||
string value = GetParameter(name);
|
||||
return value.empty() ? default_value
|
||||
: utils::ParseUsing<int>(value, utils::ParseInt32);
|
||||
}
|
||||
|
||||
bool GenericFeatureFunction::GetBoolParameter(const string &name,
|
||||
bool default_value) const {
|
||||
string value = GetParameter(name);
|
||||
if (value.empty()) return default_value;
|
||||
if (value == "true") return true;
|
||||
if (value == "false") return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
void GenericFeatureFunction::GetFeatureTypes(
|
||||
std::vector<FeatureType *> *types) const {
|
||||
if (feature_type_ != nullptr) types->push_back(feature_type_);
|
||||
}
|
||||
|
||||
// Returns the one feature type produced by this function, or null when the
// function produces zero or several feature types.
FeatureType *GenericFeatureFunction::GetFeatureType() const {
  // Fast path: a directly registered feature type wins.
  if (feature_type_ != nullptr) return feature_type_;

  // Otherwise ask the (possibly overridden) GetFeatureTypes() and accept the
  // answer only if it is unambiguous.
  std::vector<FeatureType *> collected;
  GetFeatureTypes(&collected);
  return collected.size() == 1 ? collected.front() : nullptr;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
633
Telegram/ThirdParty/cld3/src/feature_extractor.h
vendored
Normal file
633
Telegram/ThirdParty/cld3/src/feature_extractor.h
vendored
Normal file
@@ -0,0 +1,633 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Generic feature extractor for extracting features from objects. The feature
|
||||
// extractor can be used for extracting features from any object. The feature
|
||||
// extractor and feature function classes are template classes that have to
|
||||
// be instantiated for extracting features from a specific object type.
|
||||
//
|
||||
// A feature extractor consists of a hierarchy of feature functions. Each
|
||||
// feature function extracts one or more feature type and value pairs from the
|
||||
// object.
|
||||
//
|
||||
// The feature extractor has a modular design where new feature functions can be
|
||||
// registered as components. The feature extractor is initialized from a
|
||||
// descriptor represented by a protocol buffer. The feature extractor can also
|
||||
// be initialized from a text-based source specification of the feature
|
||||
// extractor. Feature specification parsers can be added as components. By
|
||||
// default the feature extractor can be read from an ASCII protocol buffer or in
|
||||
// a simple feature modeling language (fml).
|
||||
|
||||
// A feature function is invoked with a focus. Nested feature functions can be
|
||||
// invoked with another focus determined by the parent feature function.
|
||||
|
||||
#ifndef FEATURE_EXTRACTOR_H_
|
||||
#define FEATURE_EXTRACTOR_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
#include "cld_3/protos/feature_extractor.pb.h"
|
||||
#include "feature_types.h"
|
||||
#include "registry.h"
|
||||
#include "script_span/stringpiece.h"
|
||||
#include "task_context.h"
|
||||
#include "utils.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// TODO(djweiss) Clean this up as well.
|
||||
// Use the same type for feature values as is used for predicated.
|
||||
typedef int64 Predicate;
|
||||
typedef Predicate FeatureValue;
|
||||
|
||||
// Output feature model in FML format.
|
||||
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output);
|
||||
void ToFML(const FeatureFunctionDescriptor &function, string *output);
|
||||
|
||||
// A union used to represent discrete and continuous feature values.
|
||||
union FloatFeatureValue {
|
||||
public:
|
||||
explicit FloatFeatureValue(FeatureValue v) : discrete_value(v) {}
|
||||
FloatFeatureValue(uint32 i, float w) {
|
||||
value.id = i;
|
||||
value.weight = w;
|
||||
}
|
||||
FeatureValue discrete_value;
|
||||
struct IdWeight {
|
||||
uint32 id;
|
||||
float weight;
|
||||
} value;
|
||||
};
|
||||
|
||||
// A feature vector contains feature type and value pairs.
|
||||
class FeatureVector {
|
||||
public:
|
||||
FeatureVector();
|
||||
~FeatureVector();
|
||||
|
||||
// Adds feature type and value pair to feature vector.
|
||||
void add(FeatureType *type, FeatureValue value) {
|
||||
features_.emplace_back(type, value);
|
||||
}
|
||||
|
||||
// Removes all elements from the feature vector.
|
||||
void clear() { features_.clear(); }
|
||||
|
||||
// Returns the number of elements in the feature vector.
|
||||
int size() const { return features_.size(); }
|
||||
|
||||
// Reserves space in the underlying feature vector.
|
||||
void reserve(int n) { features_.reserve(n); }
|
||||
|
||||
// Returns feature type for an element in the feature vector.
|
||||
FeatureType *type(int index) const { return features_[index].type; }
|
||||
|
||||
// Returns feature value for an element in the feature vector.
|
||||
FeatureValue value(int index) const { return features_[index].value; }
|
||||
|
||||
private:
|
||||
// Structure for holding feature type and value pairs.
|
||||
struct Element {
|
||||
Element() : type(NULL), value(-1) {}
|
||||
Element(FeatureType *t, FeatureValue v) : type(t), value(v) {}
|
||||
|
||||
FeatureType *type;
|
||||
FeatureValue value;
|
||||
};
|
||||
|
||||
// Array for storing feature vector elements.
|
||||
std::vector<Element> features_;
|
||||
|
||||
CLD3_DISALLOW_COPY_AND_ASSIGN(FeatureVector);
|
||||
};
|
||||
|
||||
// The generic feature extractor is the type-independent part of a feature
|
||||
// extractor. This holds the descriptor for the feature extractor and the
|
||||
// collection of feature types used in the feature extractor. The feature
|
||||
// types are not available until FeatureExtractor<>::Init() has been called.
|
||||
class GenericFeatureExtractor {
|
||||
public:
|
||||
GenericFeatureExtractor();
|
||||
virtual ~GenericFeatureExtractor();
|
||||
GenericFeatureExtractor(const GenericFeatureExtractor &extractor);
|
||||
|
||||
// Initializes the feature extractor from a source representation of the
|
||||
// feature extractor. The first line is used for determining the feature
|
||||
// specification language. If the first line starts with #! followed by a name
|
||||
// then this name is used for instantiating a feature specification parser
|
||||
// with that name. If the language cannot be detected this way it falls back
|
||||
// to using the default language supplied.
|
||||
void Parse(const string &source);
|
||||
|
||||
// Returns the feature extractor descriptor.
|
||||
const FeatureExtractorDescriptor &descriptor() const { return descriptor_; }
|
||||
FeatureExtractorDescriptor *mutable_descriptor() { return &descriptor_; }
|
||||
|
||||
// Returns the number of feature types in the feature extractor. Invalid
|
||||
// before Init() has been called.
|
||||
int feature_types() const { return feature_types_.size(); }
|
||||
|
||||
// Returns all feature types names used by the extractor. The names are
|
||||
// added to the types_names array. Invalid before Init() has been called.
|
||||
void GetFeatureTypeNames(std::vector<string> *type_names) const;
|
||||
|
||||
// Returns a feature type used in the extractor. Invalid before Init() has
|
||||
// been called.
|
||||
const FeatureType *feature_type(int index) const {
|
||||
return feature_types_[index];
|
||||
}
|
||||
|
||||
// Returns the feature domain size of this feature extractor.
|
||||
// NOTE: The way that domain size is calculated is, for some, unintuitive. It
|
||||
// is the largest domain size of any feature type.
|
||||
FeatureValue GetDomainSize() const;
|
||||
|
||||
protected:
|
||||
// Initializes the feature types used by the extractor. Called from
|
||||
// FeatureExtractor<>::Init().
|
||||
void InitializeFeatureTypes();
|
||||
|
||||
private:
|
||||
// Initializes the top-level feature functions.
|
||||
virtual void InitializeFeatureFunctions() = 0;
|
||||
|
||||
// Returns all feature types used by the extractor. The feature types are
|
||||
// added to the result array.
|
||||
virtual void GetFeatureTypes(std::vector<FeatureType *> *types) const = 0;
|
||||
|
||||
// Descriptor for the feature extractor. This is a protocol buffer that
|
||||
// contains all the information about the feature extractor. The feature
|
||||
// functions are initialized from the information in the descriptor.
|
||||
FeatureExtractorDescriptor descriptor_;
|
||||
|
||||
// All feature types used by the feature extractor. The collection of all the
|
||||
// feature types describes the feature space of the feature set produced by
|
||||
// the feature extractor. Not owned.
|
||||
std::vector<FeatureType *> feature_types_;
|
||||
};
|
||||
|
||||
// The generic feature function is the type-independent part of a feature
|
||||
// function. Each feature function is associated with the descriptor that it is
|
||||
// instantiated from. The feature types associated with this feature function
|
||||
// will be established by the time FeatureExtractor<>::Init() completes.
|
||||
class GenericFeatureFunction {
|
||||
public:
|
||||
// A feature value that represents the absence of a value.
|
||||
static constexpr FeatureValue kNone = -1;
|
||||
|
||||
GenericFeatureFunction();
|
||||
virtual ~GenericFeatureFunction();
|
||||
|
||||
// Sets up the feature function. NB: FeatureTypes of nested functions are not
|
||||
// guaranteed to be available until Init().
|
||||
virtual void Setup(TaskContext *context) {}
|
||||
|
||||
// Initializes the feature function. NB: The FeatureType of this function must
|
||||
// be established when this method completes.
|
||||
virtual void Init(TaskContext *context) {}
|
||||
|
||||
// Requests workspaces from a registry to obtain indices into a WorkspaceSet
|
||||
// for any Workspace objects used by this feature function. NB: This will be
|
||||
// called after Init(), so it can depend on resources and arguments.
|
||||
virtual void RequestWorkspaces(WorkspaceRegistry *registry) {}
|
||||
|
||||
// Appends the feature types produced by the feature function to types. The
|
||||
// default implementation appends feature_type(), if non-null. Invalid
|
||||
// before Init() has been called.
|
||||
virtual void GetFeatureTypes(std::vector<FeatureType *> *types) const;
|
||||
|
||||
// Returns the feature type for feature produced by this feature function. If
|
||||
// the feature function produces features of different types this returns
|
||||
// null. Invalid before Init() has been called.
|
||||
virtual FeatureType *GetFeatureType() const;
|
||||
|
||||
// Returns the name of the registry used for creating the feature function.
|
||||
// This can be used for checking if two feature functions are of the same
|
||||
// kind.
|
||||
virtual const char *RegistryName() const = 0;
|
||||
|
||||
// Returns the value of a named parameter in the feature functions descriptor.
|
||||
// If the named parameter is not found the global parameters are searched.
|
||||
string GetParameter(const string &name) const;
|
||||
int GetIntParameter(const string &name, int default_value) const;
|
||||
bool GetBoolParameter(const string &name, bool default_value) const;
|
||||
|
||||
// Returns the FML function description for the feature function, i.e. the
|
||||
// name and parameters without the nested features.
|
||||
string FunctionName() const {
|
||||
string output;
|
||||
ToFMLFunction(*descriptor_, &output);
|
||||
return output;
|
||||
}
|
||||
|
||||
// Returns the prefix for nested feature functions. This is the prefix of this
|
||||
// feature function concatenated with the feature function name.
|
||||
string SubPrefix() const {
|
||||
return prefix_.empty() ? FunctionName() : prefix_ + "." + FunctionName();
|
||||
}
|
||||
|
||||
// Returns/sets the feature extractor this function belongs to.
|
||||
GenericFeatureExtractor *extractor() const { return extractor_; }
|
||||
void set_extractor(GenericFeatureExtractor *extractor) {
|
||||
extractor_ = extractor;
|
||||
}
|
||||
|
||||
// Returns/sets the feature function descriptor.
|
||||
FeatureFunctionDescriptor *descriptor() const { return descriptor_; }
|
||||
void set_descriptor(FeatureFunctionDescriptor *descriptor) {
|
||||
descriptor_ = descriptor;
|
||||
}
|
||||
|
||||
// Returns a descriptive name for the feature function. The name is taken from
|
||||
// the descriptor for the feature function. If the name is empty or the
|
||||
// feature function is a variable the name is the FML representation of the
|
||||
// feature, including the prefix.
|
||||
string name() const {
|
||||
string output;
|
||||
if (descriptor_->name().empty()) {
|
||||
if (!prefix_.empty()) {
|
||||
output.append(prefix_);
|
||||
output.append(".");
|
||||
}
|
||||
ToFML(*descriptor_, &output);
|
||||
} else {
|
||||
output = descriptor_->name();
|
||||
}
|
||||
StringPiece stripped(output);
|
||||
utils::RemoveWhitespaceContext(&stripped);
|
||||
|
||||
string stripped_output(stripped.data(), stripped.size());
|
||||
return stripped_output;
|
||||
}
|
||||
|
||||
// Returns the argument from the feature function descriptor. It defaults to
|
||||
// 0 if the argument has not been specified.
|
||||
int argument() const {
|
||||
return descriptor_->has_argument() ? descriptor_->argument() : 0;
|
||||
}
|
||||
|
||||
// Returns/sets/clears function name prefix.
|
||||
const string &prefix() const { return prefix_; }
|
||||
void set_prefix(const string &prefix) { prefix_ = prefix; }
|
||||
|
||||
protected:
|
||||
// Returns the feature type for single-type feature functions.
|
||||
FeatureType *feature_type() const { return feature_type_; }
|
||||
|
||||
// Sets the feature type for single-type feature functions. This takes
|
||||
// ownership of feature_type. Can only be called once.
|
||||
void set_feature_type(FeatureType *feature_type) {
|
||||
CLD3_DCHECK(feature_type_ == nullptr);
|
||||
feature_type_ = feature_type;
|
||||
}
|
||||
|
||||
private:
|
||||
// Feature extractor this feature function belongs to. Not owned.
|
||||
GenericFeatureExtractor *extractor_ = nullptr;
|
||||
|
||||
// Descriptor for feature function. Not owned.
|
||||
FeatureFunctionDescriptor *descriptor_ = nullptr;
|
||||
|
||||
// Feature type for features produced by this feature function. If the
|
||||
// feature function produces features of multiple feature types this is null
|
||||
// and the feature function must return it's feature types in
|
||||
// GetFeatureTypes(). Owned.
|
||||
FeatureType *feature_type_ = nullptr;
|
||||
|
||||
// Prefix used for sub-feature types of this function.
|
||||
string prefix_;
|
||||
};
|
||||
|
||||
// Feature function that can extract features from an object. Templated on
|
||||
// two type arguments:
|
||||
//
|
||||
// OBJ: The "object" from which features are extracted; e.g., a sentence. This
|
||||
// should be a plain type, rather than a reference or pointer.
|
||||
//
|
||||
// ARGS: A set of 0 or more types that are used to "index" into some part of the
|
||||
// object that should be extracted, e.g. an int token index for a sentence
|
||||
// object. This should not be a reference type.
|
||||
template <class OBJ, class... ARGS>
|
||||
class FeatureFunction
|
||||
: public GenericFeatureFunction,
|
||||
public RegisterableClass<FeatureFunction<OBJ, ARGS...> > {
|
||||
public:
|
||||
using Self = FeatureFunction<OBJ, ARGS...>;
|
||||
|
||||
// Preprocesses the object. This will be called prior to calling Evaluate()
|
||||
// or Compute() on that object.
|
||||
virtual void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {}
|
||||
|
||||
// Appends features computed from the object and focus to the result. The
|
||||
// default implementation delegates to Compute(), adding a single value if
|
||||
// available. Multi-valued feature functions must override this method.
|
||||
virtual void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
|
||||
ARGS... args, FeatureVector *result) const {
|
||||
FeatureValue value = Compute(workspaces, object, args..., result);
|
||||
if (value != kNone) result->add(feature_type(), value);
|
||||
}
|
||||
|
||||
// Returns a feature value computed from the object and focus, or kNone if no
|
||||
// value is computed. Single-valued feature functions only need to override
|
||||
// this method.
|
||||
virtual FeatureValue Compute(const WorkspaceSet &workspaces,
|
||||
const OBJ &object, ARGS... args,
|
||||
const FeatureVector *fv) const {
|
||||
return kNone;
|
||||
}
|
||||
|
||||
// Instantiates a new feature function in a feature extractor from a feature
|
||||
// descriptor.
|
||||
static Self *Instantiate(GenericFeatureExtractor *extractor,
|
||||
FeatureFunctionDescriptor *fd,
|
||||
const string &prefix) {
|
||||
Self *f = Self::Create(fd->type());
|
||||
f->set_extractor(extractor);
|
||||
f->set_descriptor(fd);
|
||||
f->set_prefix(prefix);
|
||||
return f;
|
||||
}
|
||||
|
||||
// Returns the name of the registry for the feature function.
|
||||
const char *RegistryName() const override { return Self::registry()->name; }
|
||||
|
||||
private:
|
||||
// Special feature function class for resolving variable references. The type
|
||||
// of the feature function is used for resolving the variable reference. When
|
||||
// evaluated it will either get the feature value(s) from the variable portion
|
||||
// of the feature vector, if present, or otherwise it will call the referenced
|
||||
// feature extractor function directly to extract the feature(s).
|
||||
class Reference;
|
||||
};
|
||||
|
||||
// Base class for features with nested feature functions. The nested functions
|
||||
// are of type NES, which may be different from the type of the parent function.
|
||||
// NB: NestedFeatureFunction will ensure that all initialization of nested
|
||||
// functions takes place during Setup() and Init() -- after the nested features
|
||||
// are initialized, the parent feature is initialized via SetupNested() and
|
||||
// InitNested(). Alternatively, a derived classes that overrides Setup() and
|
||||
// Init() directly should call Parent::Setup(), Parent::Init(), etc. first.
|
||||
//
|
||||
// Note: NestedFeatureFunction cannot know how to call Preprocess, Evaluate, or
|
||||
// Compute, since the nested functions may be of a different type.
|
||||
template <class NES, class OBJ, class... ARGS>
|
||||
class NestedFeatureFunction : public FeatureFunction<OBJ, ARGS...> {
|
||||
public:
|
||||
using Parent = NestedFeatureFunction<NES, OBJ, ARGS...>;
|
||||
|
||||
// Clean up nested functions.
|
||||
~NestedFeatureFunction() override { utils::STLDeleteElements(&nested_); }
|
||||
|
||||
// By default, just appends the nested feature types.
|
||||
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
|
||||
// Nested features require nested features to be defined.
|
||||
CLD3_DCHECK(!this->nested().empty());
|
||||
for (auto *function : nested_) function->GetFeatureTypes(types);
|
||||
}
|
||||
|
||||
// Sets up the nested features.
|
||||
void Setup(TaskContext *context) override {
|
||||
CreateNested(this->extractor(), this->descriptor(), &nested_,
|
||||
this->SubPrefix());
|
||||
for (auto *function : nested_) function->Setup(context);
|
||||
SetupNested(context);
|
||||
}
|
||||
|
||||
// Sets up this NestedFeatureFunction specifically.
|
||||
virtual void SetupNested(TaskContext *context) {}
|
||||
|
||||
// Initializes the nested features.
|
||||
void Init(TaskContext *context) override {
|
||||
for (auto *function : nested_) function->Init(context);
|
||||
InitNested(context);
|
||||
}
|
||||
|
||||
// Initializes this NestedFeatureFunction specifically.
|
||||
virtual void InitNested(TaskContext *context) {}
|
||||
|
||||
// Gets all the workspaces needed for the nested functions.
|
||||
void RequestWorkspaces(WorkspaceRegistry *registry) override {
|
||||
for (auto *function : nested_) function->RequestWorkspaces(registry);
|
||||
}
|
||||
|
||||
// Returns the list of nested feature functions.
|
||||
const vector<NES *> &nested() const { return nested_; }
|
||||
|
||||
// Instantiates nested feature functions for a feature function. Creates and
|
||||
// initializes one feature function for each sub-descriptor in the feature
|
||||
// descriptor.
|
||||
static void CreateNested(GenericFeatureExtractor *extractor,
|
||||
FeatureFunctionDescriptor *fd,
|
||||
vector<NES *> *functions, const string &prefix) {
|
||||
for (int i = 0; i < fd->feature_size(); ++i) {
|
||||
FeatureFunctionDescriptor *sub = fd->mutable_feature(i);
|
||||
NES *f = NES::Instantiate(extractor, sub, prefix);
|
||||
functions->push_back(f);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// The nested feature functions, if any, in order of declaration in the
|
||||
// feature descriptor. Owned.
|
||||
vector<NES *> nested_;
|
||||
};
|
||||
|
||||
// Base class for a nested feature function that takes nested features with the
|
||||
// same signature as these features, i.e. a meta feature. For this class, we can
|
||||
// provide preprocessing of the nested features.
|
||||
template <class OBJ, class... ARGS>
|
||||
class MetaFeatureFunction
|
||||
: public NestedFeatureFunction<FeatureFunction<OBJ, ARGS...>, OBJ,
|
||||
ARGS...> {
|
||||
public:
|
||||
// Preprocesses using the nested features.
|
||||
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
|
||||
for (auto *function : this->nested_) {
|
||||
function->Preprocess(workspaces, object);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Template for a special type of locator: The locator of type
|
||||
// FeatureFunction<OBJ, ARGS...> calls nested functions of type
|
||||
// FeatureFunction<OBJ, IDX, ARGS...>, where the derived class DER is
|
||||
// responsible for translating by providing the following:
|
||||
//
|
||||
// // Gets the new additional focus.
|
||||
// IDX GetFocus(const WorkspaceSet &workspaces, const OBJ &object);
|
||||
//
|
||||
// This is useful to e.g. add a token focus to a parser state based on some
|
||||
// desired property of that state.
|
||||
template <class DER, class OBJ, class IDX, class... ARGS>
|
||||
class FeatureAddFocusLocator
|
||||
: public NestedFeatureFunction<FeatureFunction<OBJ, IDX, ARGS...>, OBJ,
|
||||
ARGS...> {
|
||||
public:
|
||||
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
|
||||
for (auto *function : this->nested_) {
|
||||
function->Preprocess(workspaces, object);
|
||||
}
|
||||
}
|
||||
|
||||
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object, ARGS... args,
|
||||
FeatureVector *result) const override {
|
||||
IDX focus =
|
||||
static_cast<const DER *>(this)->GetFocus(workspaces, object, args...);
|
||||
for (auto *function : this->nested()) {
|
||||
function->Evaluate(workspaces, object, focus, args..., result);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the first nested feature's computed value.
|
||||
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
|
||||
ARGS... args,
|
||||
const FeatureVector *result) const override {
|
||||
IDX focus =
|
||||
static_cast<const DER *>(this)->GetFocus(workspaces, object, args...);
|
||||
return this->nested()[0]->Compute(workspaces, object, focus, args...,
|
||||
result);
|
||||
}
|
||||
};
|
||||
|
||||
// CRTP feature locator class. This is a meta feature that modifies ARGS and
|
||||
// then calls the nested feature functions with the modified ARGS. Note that in
|
||||
// order for this template to work correctly, all of ARGS must be types for
|
||||
// which the reference operator & can be interpreted as a pointer to the
|
||||
// argument. The derived class DER must implement the UpdateFocus method which
|
||||
// takes pointers to the ARGS arguments:
|
||||
//
|
||||
// // Updates the current arguments.
|
||||
// void UpdateArgs(const OBJ &object, ARGS *...args) const;
|
||||
template <class DER, class OBJ, class... ARGS>
|
||||
class FeatureLocator : public MetaFeatureFunction<OBJ, ARGS...> {
|
||||
public:
|
||||
// Feature locators have an additional check that there is no intrinsic type.
|
||||
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
|
||||
// FeatureLocators should not have an intrinsic type.
|
||||
CLD3_DCHECK(this->feature_type() == nullptr);
|
||||
MetaFeatureFunction<OBJ, ARGS...>::GetFeatureTypes(types);
|
||||
}
|
||||
|
||||
// Evaluates the locator.
|
||||
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object, ARGS... args,
|
||||
FeatureVector *result) const override {
|
||||
static_cast<const DER *>(this)->UpdateArgs(workspaces, object, &args...);
|
||||
for (auto *function : this->nested()) {
|
||||
function->Evaluate(workspaces, object, args..., result);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the first nested feature's computed value.
|
||||
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
|
||||
ARGS... args,
|
||||
const FeatureVector *result) const override {
|
||||
static_cast<const DER *>(this)->UpdateArgs(workspaces, object, &args...);
|
||||
return this->nested()[0]->Compute(workspaces, object, args..., result);
|
||||
}
|
||||
};
|
||||
|
||||
// Feature extractor for extracting features from objects of a certain class.
|
||||
// Template type parameters are as defined for FeatureFunction.
|
||||
template <class OBJ, class... ARGS>
|
||||
class FeatureExtractor : public GenericFeatureExtractor {
|
||||
public:
|
||||
// Feature function type for top-level functions in the feature extractor.
|
||||
typedef FeatureFunction<OBJ, ARGS...> Function;
|
||||
typedef FeatureExtractor<OBJ, ARGS...> Self;
|
||||
|
||||
// Feature locator type for the feature extractor.
|
||||
template <class DER>
|
||||
using Locator = FeatureLocator<DER, OBJ, ARGS...>;
|
||||
|
||||
// Initializes feature extractor.
|
||||
FeatureExtractor() {}
|
||||
|
||||
~FeatureExtractor() override { utils::STLDeleteElements(&functions_); }
|
||||
|
||||
// Sets up the feature extractor. Note that only top-level functions exist
|
||||
// until Setup() is called. This does not take ownership over the context,
|
||||
// which must outlive this.
|
||||
void Setup(TaskContext *context) {
|
||||
for (Function *function : functions_) function->Setup(context);
|
||||
}
|
||||
|
||||
// Initializes the feature extractor. Must be called after Setup(). This
|
||||
// does not take ownership over the context, which must outlive this.
|
||||
void Init(TaskContext *context) {
|
||||
for (Function *function : functions_) function->Init(context);
|
||||
this->InitializeFeatureTypes();
|
||||
}
|
||||
|
||||
// Requests workspaces from the registry. Must be called after Init(), and
|
||||
// before Preprocess(). Does not take ownership over registry. This should be
|
||||
// the same registry used to initialize the WorkspaceSet used in Preprocess()
|
||||
// and ExtractFeatures(). NB: This is a different ordering from that used in
|
||||
// SentenceFeatureRepresentation style feature computation.
|
||||
void RequestWorkspaces(WorkspaceRegistry *registry) {
|
||||
for (auto *function : functions_) function->RequestWorkspaces(registry);
|
||||
}
|
||||
|
||||
// Preprocesses the object using feature functions for the phase. Must be
|
||||
// called before any calls to ExtractFeatures() on that object and phase.
|
||||
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {
|
||||
for (Function *function : functions_) {
|
||||
function->Preprocess(workspaces, object);
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts features from an object with a focus. This invokes all the
|
||||
// top-level feature functions in the feature extractor. Only feature
|
||||
// functions belonging to the specified phase are invoked.
|
||||
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &object,
|
||||
ARGS... args, FeatureVector *result) const {
|
||||
result->reserve(this->feature_types());
|
||||
|
||||
// Extract features.
|
||||
for (size_t i = 0; i < functions_.size(); ++i) {
|
||||
functions_[i]->Evaluate(workspaces, object, args..., result);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Creates and initializes all feature functions in the feature extractor.
|
||||
void InitializeFeatureFunctions() override {
|
||||
// Create all top-level feature functions.
|
||||
for (int i = 0; i < descriptor().feature_size(); ++i) {
|
||||
FeatureFunctionDescriptor *fd = mutable_descriptor()->mutable_feature(i);
|
||||
Function *function = Function::Instantiate(this, fd, "");
|
||||
functions_.push_back(function);
|
||||
}
|
||||
}
|
||||
|
||||
// Collect all feature types used in the feature extractor.
|
||||
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
|
||||
for (size_t i = 0; i < functions_.size(); ++i) {
|
||||
functions_[i]->GetFeatureTypes(types);
|
||||
}
|
||||
}
|
||||
|
||||
// Top-level feature functions (and variables) in the feature extractor.
|
||||
// Owned.
|
||||
std::vector<Function *> functions_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // FEATURE_EXTRACTOR_H_
|
||||
50
Telegram/ThirdParty/cld3/src/feature_extractor.proto
vendored
Normal file
50
Telegram/ThirdParty/cld3/src/feature_extractor.proto
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Protocol buffers for feature extractor.
|
||||
|
||||
syntax = "proto2";
|
||||
option optimize_for = LITE_RUNTIME;
|
||||
|
||||
package chrome_lang_id;
|
||||
|
||||
// A single named parameter: a (name, value) pair of strings. Used by
// FeatureFunctionDescriptor.parameter.
message Parameter {
  // Parameter name.
  optional string name = 1;
  // Parameter value, carried as a string.
  optional string value = 2;
}
|
||||
|
||||
// Descriptor for feature function.
|
||||
message FeatureFunctionDescriptor {
  // Type of the feature function. This is the only required field.
  required string type = 1;

  // Optional name given to this feature function.
  optional string name = 2;

  // Default (numeric) argument of the feature function; 0 when not set.
  optional int32 argument = 3 [default = 0];

  // Named parameters attached to this feature function.
  repeated Parameter parameter = 4;

  // Sub-feature descriptors nested inside this one. Field numbers 5 and 6
  // are not used by this message.
  repeated FeatureFunctionDescriptor feature = 7;
}
|
||||
|
||||
// Descriptor for feature extractor.
|
||||
message FeatureExtractorDescriptor {
  // Top-level feature functions of the extractor, in declaration order.
  repeated FeatureFunctionDescriptor feature = 1;
}
|
||||
72
Telegram/ThirdParty/cld3/src/feature_types.cc
vendored
Normal file
72
Telegram/ThirdParty/cld3/src/feature_types.cc
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "feature_types.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Stores the type name, starts the base slot at 0, and marks the type as
// continuous iff the name contains the substring "continuous".
FeatureType::FeatureType(const string &name)
    : name_(name),
      base_(0),
      is_continuous_(string::npos != name.find("continuous")) {}

FeatureType::~FeatureType() = default;
|
||||
|
||||
template <class Resource>
|
||||
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
||||
const string &name, const Resource *resource,
|
||||
const std::map<FeatureValue, string> &values)
|
||||
: FeatureType(name), resource_(resource), values_(values) {
|
||||
max_value_ = resource->NumValues() - 1;
|
||||
for (const auto &pair : values) {
|
||||
CLD3_DCHECK(pair.first >= resource->NumValues());
|
||||
max_value_ = pair.first > max_value_ ? pair.first : max_value_;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Resource>
|
||||
ResourceBasedFeatureType<Resource>::ResourceBasedFeatureType(
|
||||
const string &name, const Resource *resource)
|
||||
: ResourceBasedFeatureType(name, resource, {}) {}
|
||||
|
||||
// Copies the value-name map and sets domain_size_ to one more than the largest
// key in it (it keeps its initial value when the map is empty). Keys are
// DCHECKed to be non-negative.
EnumFeatureType::EnumFeatureType(
    const string &name, const std::map<FeatureValue, string> &value_names)
    : FeatureType(name), value_names_(value_names) {
  for (const auto &entry : value_names) {
    CLD3_DCHECK(entry.first >= 0);
    const FeatureValue size_needed = entry.first + 1;
    if (domain_size_ < size_needed) {
      domain_size_ = size_needed;
    }
  }
}

EnumFeatureType::~EnumFeatureType() = default;
|
||||
|
||||
// Returns the name registered for |value|, or "<INVALID>" when |value| is not
// a key of value_names_.
string EnumFeatureType::GetFeatureValueName(FeatureValue value) const {
  const auto entry = value_names_.find(value);
  if (entry != value_names_.end()) {
    return entry->second;
  }
  return "<INVALID>";
}
|
||||
|
||||
// Returns the domain size computed by the constructor.
FeatureValue EnumFeatureType::GetDomainSize() const {
  return domain_size_;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
158
Telegram/ThirdParty/cld3/src/feature_types.h
vendored
Normal file
158
Telegram/ThirdParty/cld3/src/feature_types.h
vendored
Normal file
@@ -0,0 +1,158 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Common feature types for parser components.
|
||||
|
||||
#ifndef FEATURE_TYPES_H_
|
||||
#define FEATURE_TYPES_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// TODO(djweiss) Clean this up as well.
|
||||
// Use the same type for feature values as is used for predicated.
|
||||
typedef int64 Predicate;
|
||||
typedef Predicate FeatureValue;
|
||||
|
||||
// Each feature value in a feature vector has a feature type. The feature type
|
||||
// is used for converting feature type and value pairs to predicate values. The
|
||||
// feature type can also return names for feature values and calculate the size
|
||||
// of the feature value domain. The FeatureType class is abstract and must be
|
||||
// specialized for the concrete feature types.
|
||||
class FeatureType {
 public:
  // Initializes a feature type with the given name. The constructor (defined
  // in feature_types.cc) sets base() to 0 and is_continuous() to true iff
  // |name| contains the substring "continuous".
  explicit FeatureType(const string &name);

  virtual ~FeatureType();

  // Converts a feature value to a name.
  virtual string GetFeatureValueName(FeatureValue value) const = 0;

  // Returns the size of the feature values domain.
  virtual int64 GetDomainSize() const = 0;

  // Returns the feature type name.
  const string &name() const { return name_; }

  // Accessors for the "base" predicate value of this type (see base_ below).
  Predicate base() const { return base_; }
  void set_base(Predicate base) { base_ = base; }

  // Returns true iff this feature is continuous; see FloatFeatureValue.
  bool is_continuous() const { return is_continuous_; }

 private:
  // Feature type name.
  string name_;

  // "Base" feature value: i.e. a "slot" in a global ordering of features.
  Predicate base_;

  // See doc for is_continuous().
  bool is_continuous_;
};
|
||||
|
||||
// Templated generic resource based feature type. This feature type delegates
|
||||
// look up of feature value names to an unknown resource class, which is not
|
||||
// owned. Optionally, this type can also store a mapping of extra values which
|
||||
// are not in the resource.
|
||||
//
|
||||
// Note: this class assumes that Resource->GetFeatureValueName() will return
|
||||
// successfully for values ONLY in the range [0, Resource->NumValues()). Any
|
||||
// feature value not in the extra value map and not in the above range of
|
||||
// Resource will result in a return value of "<INVALID>".
|
||||
template <class Resource>
|
||||
class ResourceBasedFeatureType : public FeatureType {
|
||||
public:
|
||||
// Creates a new type with given name, resource object, and a mapping of
|
||||
// special values. The values must be greater or equal to
|
||||
// resource->NumValues() so as to avoid collisions; this is verified with
|
||||
// CHECK at creation.
|
||||
ResourceBasedFeatureType(const string &name, const Resource *resource,
|
||||
const std::map<FeatureValue, string> &values);
|
||||
|
||||
// Creates a new type with no special values.
|
||||
ResourceBasedFeatureType(const string &name, const Resource *resource);
|
||||
|
||||
// Returns the feature name for a given feature value. First checks the values
|
||||
// map, then checks the resource to look up the name.
|
||||
string GetFeatureValueName(FeatureValue value) const override {
|
||||
if (values_.find(value) != values_.end()) {
|
||||
return values_.find(value)->second;
|
||||
}
|
||||
if (value >= 0 && value < resource_->NumValues()) {
|
||||
return resource_->GetFeatureValueName(value);
|
||||
} else {
|
||||
return "<INVALID>";
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of possible values for this feature type. This is the
|
||||
// based on the largest value that was observed in the extra values.
|
||||
FeatureValue GetDomainSize() const override { return max_value_ + 1; }
|
||||
|
||||
protected:
|
||||
// Shared resource. Not owned.
|
||||
const Resource *resource_ = nullptr;
|
||||
|
||||
// Maximum possible value this feature could take.
|
||||
FeatureValue max_value_;
|
||||
|
||||
// Mapping for extra feature values not in the resource.
|
||||
std::map<FeatureValue, string> values_;
|
||||
};
|
||||
|
||||
// Feature type that is defined using an explicit map from FeatureValue to
|
||||
// string values. This can reduce some of the boilerplate when defining
|
||||
// features that generate enum values. Example usage:
|
||||
//
|
||||
//   class BeverageSizeFeature : public FeatureFunction<Beverage> {
|
||||
// enum FeatureValue { SMALL, MEDIUM, LARGE }; // values for this feature
|
||||
// void Init(TaskContext *context) override {
|
||||
// set_feature_type(new EnumFeatureType("beverage_size",
|
||||
//           {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}}));
|
||||
// }
|
||||
// [...]
|
||||
// };
|
||||
class EnumFeatureType : public FeatureType {
 public:
  // Creates an enum feature type called |name| whose values and their names
  // are given by |value_names|.
  EnumFeatureType(const string &name,
                  const std::map<FeatureValue, string> &value_names);
  ~EnumFeatureType() override;

  // Returns the feature name for a given feature value.
  string GetFeatureValueName(FeatureValue value) const override;

  // Returns the number of possible values for this feature type.  This is one
  // greater than the largest value in the value_names map.
  FeatureValue GetDomainSize() const override;

 protected:
  // Size of the value domain, i.e. one greater than the largest value this
  // feature could take; 0 until the constructor has seen a value.
  FeatureValue domain_size_ = 0;

  // Names of feature values.
  std::map<FeatureValue, string> value_names_;
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // FEATURE_TYPES_H_
|
||||
58
Telegram/ThirdParty/cld3/src/float16.h
vendored
Normal file
58
Telegram/ThirdParty/cld3/src/float16.h
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef FLOAT16_H_
|
||||
#define FLOAT16_H_
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
#include "base.h"
|
||||
#include "casts.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Compact 16-bit encoding of floating point numbers. This
|
||||
// representation uses 1 bit for the sign, 8 bits for the exponent and
|
||||
// 7 bits for the mantissa. It is assumed that floats are in IEEE 754
|
||||
// format so a float16 is just bits 16-31 of a single precision float.
|
||||
//
|
||||
// NOTE: The IEEE floating point standard defines a float16 format that
|
||||
// is different than this format (it has fewer bits of exponent and more
|
||||
// bits of mantissa). We don't use that format here because conversion
|
||||
// to/from 32-bit floats is more complex for that format, and the
|
||||
// conversion for this format is very simple.
|
||||
//
|
||||
// <---------float16------------>
|
||||
// s e e e e e e e e f f f f f f f f f f f f f f f f f f f f f f f
|
||||
// <------------------------------float-------------------------->
|
||||
// 3 3 2 2 1 1 0
|
||||
// 1 0 3 2 5 4 0
|
||||
|
||||
// Storage type for the compact 16-bit float encoding.
using float16 = uint16;
|
||||
|
||||
// Converts a 32-bit float to the 16-bit encoding by keeping only the upper 16
// bits of its bit pattern. The low mantissa bits are truncated, not rounded.
static inline float16 Float32To16(float f) {
  const uint32 bits = lang_id_bit_cast<uint32>(f);
  const uint32 upper_half = bits >> 16;
  return upper_half & 0xffff;
}
|
||||
|
||||
// Converts the 16-bit encoding back to a 32-bit float: the 16 stored bits
// become the upper half of the bit pattern and the new low mantissa bits are
// filled with 0.
static inline float Float16To32(float16 f) {
  // Widen to an unsigned 32-bit value before shifting. A bare `f << 16`
  // promotes f to (signed) int and shifts into the sign bit whenever the
  // float16 sign bit is set, which is not portable.
  const uint32 bits = static_cast<uint32>(f) << 16;
  return lang_id_bit_cast<float>(bits);
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // FLOAT16_H_
|
||||
308
Telegram/ThirdParty/cld3/src/fml_parser.cc
vendored
Normal file
308
Telegram/ThirdParty/cld3/src/fml_parser.cc
vendored
Normal file
@@ -0,0 +1,308 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "fml_parser.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
namespace {
|
||||
|
||||
// Returns true iff character c can appear at the beginning of an identifier.
inline bool IsValidCharAtStartOfIdentifier(char c) {
  // <ctype.h> functions require a value representable as unsigned char (or
  // EOF); passing a negative plain char is undefined behaviour.
  return (isalpha(static_cast<unsigned char>(c)) != 0) || (c == '_') ||
         (c == '/');
}
|
||||
|
||||
// Returns true iff character c can appear inside an identifier.
|
||||
inline bool IsValidCharInsideIdentifier(char c) {
  // Cast first: <ctype.h> functions are undefined for negative char values.
  return (isalnum(static_cast<unsigned char>(c)) != 0) || (c == '_') ||
         (c == '-') || (c == '/');
}
|
||||
|
||||
// Returns true iff character c can appear at the beginning of a number.
|
||||
inline bool IsValidCharAtStartOfNumber(char c) {
  // Cast first: <ctype.h> functions are undefined for negative char values.
  return (isdigit(static_cast<unsigned char>(c)) != 0) || (c == '+') ||
         (c == '-');
}
|
||||
|
||||
// Returns true iff character c can appear inside a number.
|
||||
inline bool IsValidCharInsideNumber(char c) {
  // Cast first: <ctype.h> functions are undefined for negative char values.
  return (isdigit(static_cast<unsigned char>(c)) != 0) || (c == '.');
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Neither special member does any work; parser state is set up by
// Initialize().
FMLParser::FMLParser() = default;

FMLParser::~FMLParser() = default;
|
||||
|
||||
// Resets the parser onto a private copy of |source| and reads the first item.
void FMLParser::Initialize(const string &source) {
  // All iterators must point into source_ (the copy), not into |source|.
  source_ = source;
  current_ = source_.begin();
  line_start_ = current_;
  item_start_ = current_;

  // Line numbers are 1-based.
  line_number_ = 1;
  item_line_number_ = 1;

  // Prime item_type_ / item_text_ with the first input item.
  NextItem();
}
|
||||
|
||||
void FMLParser::Next() {
|
||||
// Move to the next input character. If we are at a line break update line
|
||||
// number and line start position.
|
||||
if (CurrentChar() == '\n') {
|
||||
++line_number_;
|
||||
++current_;
|
||||
line_start_ = current_;
|
||||
} else {
|
||||
++current_;
|
||||
}
|
||||
}
|
||||
|
||||
void FMLParser::NextItem() {
|
||||
// Skip white space and comments.
|
||||
while (!eos()) {
|
||||
if (CurrentChar() == '#') {
|
||||
// Skip comment.
|
||||
while (!eos() && CurrentChar() != '\n') Next();
|
||||
} else if (isspace(CurrentChar())) {
|
||||
// Skip whitespace.
|
||||
while (!eos() && isspace(CurrentChar())) Next();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Record start position for next item.
|
||||
item_start_ = current_;
|
||||
item_line_number_ = line_number_;
|
||||
|
||||
// Check for end of input.
|
||||
if (eos()) {
|
||||
item_type_ = END;
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse number.
|
||||
if (IsValidCharAtStartOfNumber(CurrentChar())) {
|
||||
string::iterator start = current_;
|
||||
Next();
|
||||
while (!eos() && IsValidCharInsideNumber(CurrentChar())) Next();
|
||||
item_text_.assign(start, current_);
|
||||
item_type_ = NUMBER;
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse string.
|
||||
if (CurrentChar() == '"') {
|
||||
Next();
|
||||
string::iterator start = current_;
|
||||
while (CurrentChar() != '"') {
|
||||
CLD3_DCHECK(!eos());
|
||||
Next();
|
||||
}
|
||||
item_text_.assign(start, current_);
|
||||
item_type_ = STRING;
|
||||
Next();
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse identifier name.
|
||||
if (IsValidCharAtStartOfIdentifier(CurrentChar())) {
|
||||
string::iterator start = current_;
|
||||
while (!eos() && IsValidCharInsideIdentifier(CurrentChar())) {
|
||||
Next();
|
||||
}
|
||||
item_text_.assign(start, current_);
|
||||
item_type_ = NAME;
|
||||
return;
|
||||
}
|
||||
|
||||
// Single character item.
|
||||
item_type_ = CurrentChar();
|
||||
Next();
|
||||
}
|
||||
|
||||
void FMLParser::Parse(const string &source,
|
||||
FeatureExtractorDescriptor *result) {
|
||||
// Initialize parser.
|
||||
Initialize(source);
|
||||
|
||||
while (item_type_ != END) {
|
||||
// Parse either a parameter name or a feature.
|
||||
CLD3_DCHECK(item_type_ == NAME);
|
||||
string name = item_text_;
|
||||
NextItem();
|
||||
|
||||
// Feature expected.
|
||||
CLD3_DCHECK(static_cast<char>(item_type_) != '=');
|
||||
|
||||
// Parse feature.
|
||||
FeatureFunctionDescriptor *descriptor = result->add_feature();
|
||||
descriptor->set_type(name);
|
||||
ParseFeature(descriptor);
|
||||
}
|
||||
}
|
||||
|
||||
// Parses what follows a feature type name: an optional parenthesized
// argument/parameter list, an optional ":name", and optional sub-features
// given either as ".subfeature" or as a "{ ... }" block. The type of |result|
// must already have been set by the caller.
void FMLParser::ParseFeature(FeatureFunctionDescriptor *result) {
  // Parse argument and parameters.
  if (item_type_ == '(') {
    NextItem();
    ParseParameter(result);
    while (item_type_ == ',') {
      NextItem();
      ParseParameter(result);
    }

    CLD3_DCHECK(item_type_ == ')');
    NextItem();
  }

  // Parse feature name.
  if (item_type_ == ':') {
    NextItem();

    // Feature name expected.
    CLD3_DCHECK((item_type_ == NAME) || (item_type_ == STRING));
    string name = item_text_;
    NextItem();

    // Set feature name.
    result->set_name(name);
  }

  // Parse sub-features.
  if (item_type_ == '.') {
    // Parse dotted sub-feature.
    NextItem();
    CLD3_DCHECK(item_type_ == NAME);
    string type = item_text_;
    NextItem();

    // Parse sub-feature.
    FeatureFunctionDescriptor *subfeature = result->add_feature();
    subfeature->set_type(type);
    ParseFeature(subfeature);
  } else if (item_type_ == '{') {
    // Parse sub-feature block.
    NextItem();
    // Also stop at end of input: with DCHECKs compiled out, an unterminated
    // block would otherwise loop forever, adding one sub-feature per pass.
    while (item_type_ != '}' && item_type_ != END) {
      CLD3_DCHECK(item_type_ == NAME);
      string type = item_text_;
      NextItem();

      // Parse sub-feature.
      FeatureFunctionDescriptor *subfeature = result->add_feature();
      subfeature->set_type(type);
      ParseFeature(subfeature);
    }
    CLD3_DCHECK(item_type_ == '}');  // Block must be closed.
    NextItem();
  }
}
|
||||
|
||||
// Parses one entry of a parameter list: either a bare number, which becomes
// the argument of |result|, or "name = value", which is added to |result| as
// a Parameter.
void FMLParser::ParseParameter(FeatureFunctionDescriptor *result) {
  CLD3_DCHECK((item_type_ == NUMBER) || (item_type_ == NAME));

  if (item_type_ == NUMBER) {
    // A bare number is the default argument of the feature.
    const int parsed_argument =
        utils::ParseUsing<int>(item_text_, utils::ParseInt32);
    NextItem();
    result->set_argument(parsed_argument);
    return;
  }

  // Otherwise the current item is the parameter name, followed by '='.
  const string parameter_name = item_text_;
  NextItem();
  CLD3_DCHECK(item_type_ == '=');
  NextItem();

  // The value has to be one of the negative item codes (NAME, NUMBER or
  // STRING), i.e. not END and not a single-character item.
  CLD3_DCHECK(item_type_ < END);
  const string parameter_value = item_text_;
  NextItem();

  // Attach the parameter to the feature.
  Parameter *added = result->add_parameter();
  added->set_name(parameter_name);
  added->set_value(parameter_value);
}
|
||||
|
||||
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output) {
|
||||
output->append(function.type());
|
||||
if (function.argument() != 0 || function.parameter_size() > 0) {
|
||||
output->append("(");
|
||||
bool first = true;
|
||||
if (function.argument() != 0) {
|
||||
output->append(Int64ToString(function.argument()));
|
||||
first = false;
|
||||
}
|
||||
for (int i = 0; i < function.parameter_size(); ++i) {
|
||||
if (!first) output->append(",");
|
||||
output->append(function.parameter(i).name());
|
||||
output->append("=");
|
||||
output->append("\"");
|
||||
output->append(function.parameter(i).value());
|
||||
output->append("\"");
|
||||
first = false;
|
||||
}
|
||||
output->append(")");
|
||||
}
|
||||
}
|
||||
|
||||
void ToFML(const FeatureFunctionDescriptor &function, string *output) {
|
||||
ToFMLFunction(function, output);
|
||||
if (function.feature_size() == 1) {
|
||||
output->append(".");
|
||||
ToFML(function.feature(0), output);
|
||||
} else if (function.feature_size() > 1) {
|
||||
output->append(" { ");
|
||||
for (int i = 0; i < function.feature_size(); ++i) {
|
||||
if (i > 0) output->append(" ");
|
||||
ToFML(function.feature(i), output);
|
||||
}
|
||||
output->append(" } ");
|
||||
}
|
||||
}
|
||||
|
||||
void ToFML(const FeatureExtractorDescriptor &extractor, string *output) {
|
||||
for (int i = 0; i < extractor.feature_size(); ++i) {
|
||||
ToFML(extractor.feature(i), output);
|
||||
output->append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the FML text for a single feature function (including its
// sub-features) as a new string.
string AsFML(const FeatureFunctionDescriptor &function) {
  string fml;
  ToFML(function, &fml);
  return fml;
}

// Returns the FML text for a whole feature extractor as a new string.
string AsFML(const FeatureExtractorDescriptor &extractor) {
  string fml;
  ToFML(extractor, &fml);
  return fml;
}
|
||||
|
||||
// Removes every double-quote character from |fml_string| in place.
void StripFML(string *fml_string) {
  // Single pass: copy the kept characters towards the front, then cut off the
  // tail. (Erasing quotes one by one shifts the remainder of the string each
  // time, which is quadratic in the worst case.)
  string::size_type kept = 0;
  for (string::size_type pos = 0; pos < fml_string->size(); ++pos) {
    const char c = (*fml_string)[pos];
    if (c != '"') {
      (*fml_string)[kept] = c;
      ++kept;
    }
  }
  fml_string->resize(kept);
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
123
Telegram/ThirdParty/cld3/src/fml_parser.h
vendored
Normal file
123
Telegram/ThirdParty/cld3/src/fml_parser.h
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Feature modeling language (fml) parser.
|
||||
//
|
||||
// BNF grammar for fml:
|
||||
//
|
||||
// <feature model> ::= { <feature extractor> }
|
||||
//
|
||||
// <feature extractor> ::= <extractor spec> |
|
||||
// <extractor spec> '.' <feature extractor> |
|
||||
// <extractor spec> '{' { <feature extractor> } '}'
|
||||
//
|
||||
// <extractor spec> ::= <extractor type>
|
||||
// [ '(' <parameter list> ')' ]
|
||||
// [ ':' <extractor name> ]
|
||||
//
|
||||
// <parameter list> ::= ( <parameter> | <argument> ) { ',' <parameter> }
|
||||
//
|
||||
// <parameter> ::= <parameter name> '=' <parameter value>
|
||||
//
|
||||
// <extractor type> ::= NAME
|
||||
// <extractor name> ::= NAME | STRING
|
||||
// <argument> ::= NUMBER
|
||||
// <parameter name> ::= NAME
|
||||
// <parameter value> ::= NUMBER | STRING | NAME
|
||||
|
||||
#ifndef FML_PARSER_H_
|
||||
#define FML_PARSER_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "cld_3/protos/feature_extractor.pb.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Parser that turns an FML specification into a FeatureExtractorDescriptor.
// Parse() is the only public operation; the remaining members are tokenizer
// and parser state that Parse() sets up through Initialize().
class FMLParser {
 public:
  // Parses fml specification into feature extractor descriptor.
  void Parse(const string &source, FeatureExtractorDescriptor *result);

  FMLParser();
  ~FMLParser();

 private:
  // Initializes the parser with the source text.
  void Initialize(const string &source);

  // Moves to the next input character.
  void Next();

  // Moves to the next input item.
  void NextItem();

  // Parses a feature descriptor.
  void ParseFeature(FeatureFunctionDescriptor *result);

  // Parses a parameter specification.
  void ParseParameter(FeatureFunctionDescriptor *result);

  // Returns true if end of source input has been reached.
  bool eos() const { return current_ == source_.end(); }

  // Returns current character.  Other methods should access the current
  // character through this method (instead of using *current_ directly): this
  // method performs extra safety checks.
  char CurrentChar() const {
    // CLD3_DCHECK that we are reading from inside the string.
    CLD3_DCHECK(current_ >= source_.begin());
    CLD3_DCHECK(current_ < source_.end());
    return *current_;
  }

  // Item types. These share item_type_ with plain character codes, which is
  // why the named kinds are non-positive.
  enum ItemTypes {
    END = 0,      // End of input.
    NAME = -1,    // Identifier.
    NUMBER = -2,  // Numeric literal.
    STRING = -3,  // Double-quoted string; item_text_ excludes the quotes.
  };

  // Source text.
  string source_;

  // Current input position.
  string::iterator current_;

  // Line number for current input position.
  int line_number_;

  // Start position for current item.
  string::iterator item_start_;

  // Start position for current line.
  string::iterator line_start_;

  // Line number for current item.
  int item_line_number_;

  // Item type for current item. If this is positive it is interpreted as a
  // character. If it is negative it is interpreted as an item type.
  int item_type_;

  // Text for current item.
  string item_text_;
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // FML_PARSER_H_
|
||||
57449
Telegram/ThirdParty/cld3/src/lang_id_nn_params.cc
vendored
Normal file
57449
Telegram/ThirdParty/cld3/src/lang_id_nn_params.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
178
Telegram/ThirdParty/cld3/src/lang_id_nn_params.h
vendored
Executable file
178
Telegram/ThirdParty/cld3/src/lang_id_nn_params.h
vendored
Executable file
@@ -0,0 +1,178 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef LANG_ID_NN_PARAMS_H_
|
||||
#define LANG_ID_NN_PARAMS_H_
|
||||
|
||||
#include "base.h"
|
||||
#include "embedding_network_params.h"
|
||||
#include "float16.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
class LangIdNNParams : public EmbeddingNetworkParams {
|
||||
public:
|
||||
~LangIdNNParams() override {}
|
||||
|
||||
// Access methods for embeddings:
|
||||
int embeddings_size() const override { return 6; }
|
||||
int embeddings_num_rows(int i) const override {
|
||||
return kEmbeddingsNumRows[i];
|
||||
}
|
||||
int embeddings_num_cols(int i) const override {
|
||||
return kEmbeddingsNumCols[i];
|
||||
}
|
||||
const void *embeddings_weights(int i) const override {
|
||||
return embeddings_weights_[i];
|
||||
}
|
||||
QuantizationType embeddings_quant_type(int i) const override {
|
||||
return QuantizationType::UINT8;
|
||||
}
|
||||
const float16 *embeddings_quant_scales(int i) const override {
|
||||
return embeddings_quant_scales_[i];
|
||||
}
|
||||
|
||||
// Access methods for hidden:
|
||||
int hidden_size() const override { return 1; }
|
||||
int hidden_num_rows(int i) const override { return kHiddenNumRows[i]; }
|
||||
int hidden_num_cols(int i) const override { return kHiddenNumCols[i]; }
|
||||
const void *hidden_weights(int i) const override {
|
||||
return hidden_weights_[i];
|
||||
}
|
||||
|
||||
// Access methods for hidden_bias:
|
||||
int hidden_bias_size() const override { return 1; }
|
||||
int hidden_bias_num_rows(int i) const override {
|
||||
return kHiddenBiasNumRows[i];
|
||||
}
|
||||
int hidden_bias_num_cols(int i) const override {
|
||||
return kHiddenBiasNumCols[i];
|
||||
}
|
||||
const void *hidden_bias_weights(int i) const override {
|
||||
return hidden_bias_weights_[i];
|
||||
}
|
||||
|
||||
// Access methods for softmax:
|
||||
int softmax_size() const override { return 1; }
|
||||
int softmax_num_rows(int i) const override { return kSoftmaxNumRows[i]; }
|
||||
int softmax_num_cols(int i) const override { return kSoftmaxNumCols[i]; }
|
||||
const void *softmax_weights(int i) const override {
|
||||
return softmax_weights_[i];
|
||||
}
|
||||
|
||||
// Access methods for softmax_bias:
|
||||
int softmax_bias_size() const override { return 1; }
|
||||
int softmax_bias_num_rows(int i) const override {
|
||||
return kSoftmaxBiasNumRows[i];
|
||||
}
|
||||
int softmax_bias_num_cols(int i) const override {
|
||||
return kSoftmaxBiasNumCols[i];
|
||||
}
|
||||
const void *softmax_bias_weights(int i) const override {
|
||||
return softmax_bias_weights_[i];
|
||||
}
|
||||
|
||||
// Access methods for embedding_dim:
|
||||
int embedding_dim_size() const override { return 6; }
|
||||
int32 embedding_dim(int i) const override { return kEmbeddingDimValues[i]; }
|
||||
|
||||
// Access methods for embedding_num_features:
|
||||
int embedding_num_features_size() const override { return 6; }
|
||||
int32 embedding_num_features(int i) const override {
|
||||
return kEmbeddingNumFeaturesValues[i];
|
||||
}
|
||||
|
||||
// Access methods for embedding_features_domain_size:
|
||||
int embedding_features_domain_size_size() const override { return 6; }
|
||||
int32 embedding_features_domain_size(int i) const override {
|
||||
return kEmbeddingFeaturesDomainSizeValues[i];
|
||||
}
|
||||
|
||||
// Access methods for concat_offset:
|
||||
int concat_offset_size() const override { return 6; }
|
||||
int32 concat_offset(int i) const override { return kConcatOffsetValues[i]; }
|
||||
|
||||
// Access methods for concat_layer_size:
|
||||
bool has_concat_layer_size() const override { return true; }
|
||||
int32 concat_layer_size() const override { return 80; }
|
||||
|
||||
// Access methods for is_precomputed:
|
||||
bool has_is_precomputed() const override { return false; }
|
||||
bool is_precomputed() const override { return false; }
|
||||
|
||||
private:
|
||||
// Private fields for embeddings:
|
||||
static const int kEmbeddingsNumRows[];
|
||||
static const int kEmbeddingsNumCols[];
|
||||
static const uint8 kEmbeddingsWeights0[];
|
||||
static const uint8 kEmbeddingsWeights1[];
|
||||
static const uint8 kEmbeddingsWeights2[];
|
||||
static const uint8 kEmbeddingsWeights3[];
|
||||
static const uint8 kEmbeddingsWeights4[];
|
||||
static const uint8 kEmbeddingsWeights5[];
|
||||
const void *embeddings_weights_[6] = {
|
||||
kEmbeddingsWeights0, kEmbeddingsWeights1, kEmbeddingsWeights2,
|
||||
kEmbeddingsWeights3, kEmbeddingsWeights4, kEmbeddingsWeights5};
|
||||
static const float16 kEmbeddingsQuantScales0[];
|
||||
static const float16 kEmbeddingsQuantScales1[];
|
||||
static const float16 kEmbeddingsQuantScales2[];
|
||||
static const float16 kEmbeddingsQuantScales3[];
|
||||
static const float16 kEmbeddingsQuantScales4[];
|
||||
static const float16 kEmbeddingsQuantScales5[];
|
||||
const float16 *embeddings_quant_scales_[6] = {
|
||||
kEmbeddingsQuantScales0, kEmbeddingsQuantScales1,
|
||||
kEmbeddingsQuantScales2, kEmbeddingsQuantScales3,
|
||||
kEmbeddingsQuantScales4, kEmbeddingsQuantScales5};
|
||||
|
||||
// Private fields for hidden:
|
||||
static const int kHiddenNumRows[];
|
||||
static const int kHiddenNumCols[];
|
||||
static const float kHiddenWeights0[];
|
||||
const void *hidden_weights_[1] = {kHiddenWeights0};
|
||||
|
||||
// Private fields for hidden_bias:
|
||||
static const int kHiddenBiasNumRows[];
|
||||
static const int kHiddenBiasNumCols[];
|
||||
static const float kHiddenBiasWeights0[];
|
||||
const void *hidden_bias_weights_[1] = {kHiddenBiasWeights0};
|
||||
|
||||
// Private fields for softmax:
|
||||
static const int kSoftmaxNumRows[];
|
||||
static const int kSoftmaxNumCols[];
|
||||
static const float kSoftmaxWeights0[];
|
||||
const void *softmax_weights_[1] = {kSoftmaxWeights0};
|
||||
|
||||
// Private fields for softmax_bias:
|
||||
static const int kSoftmaxBiasNumRows[];
|
||||
static const int kSoftmaxBiasNumCols[];
|
||||
static const float kSoftmaxBiasWeights0[];
|
||||
const void *softmax_bias_weights_[1] = {kSoftmaxBiasWeights0};
|
||||
|
||||
// Private fields for embedding_dim:
|
||||
static const int32 kEmbeddingDimValues[];
|
||||
|
||||
// Private fields for embedding_num_features:
|
||||
static const int32 kEmbeddingNumFeaturesValues[];
|
||||
|
||||
// Private fields for embedding_features_domain_size:
|
||||
static const int32 kEmbeddingFeaturesDomainSizeValues[];
|
||||
|
||||
// Private fields for concat_offset:
|
||||
static const int32 kConcatOffsetValues[];
|
||||
}; // class LangIdNNParams
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // LANG_ID_NN_PARAMS_H_
|
||||
165
Telegram/ThirdParty/cld3/src/language_identifier_features.cc
vendored
Normal file
165
Telegram/ThirdParty/cld3/src/language_identifier_features.cc
vendored
Normal file
@@ -0,0 +1,165 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "language_identifier_features.h"
|
||||
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
#include "feature_extractor.h"
|
||||
#include "feature_types.h"
|
||||
#include "script_span/generated_ulscript.h"
|
||||
#include "script_span/getonescriptspan.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "unicodetext.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
// Builds a numeric feature type called |name|; |size| is stored and later
// reported by GetDomainSize().
NumericFeatureType::NumericFeatureType(const string &name, FeatureValue size)
    : FeatureType(name),
      size_(size) {
}
|
||||
|
||||
// Negative values have no name (empty string); every other value is rendered
// through Int64ToString().
string NumericFeatureType::GetFeatureValueName(FeatureValue value) const {
  if (value < 0) {
    return "";
  }
  return Int64ToString(value);
}
|
||||
|
||||
// Reports the domain size that was passed to the constructor.
FeatureValue NumericFeatureType::GetDomainSize() const {
  return size_;
}
|
||||
|
||||
// Reads this function's configuration from the feature function descriptor.
// Each lookup supplies the default used when the parameter is not given.
// |context| is not consulted here.
void ContinuousBagOfNgramsFunction::Setup(TaskContext *context) {
  // Boolean switches (all default to false).
  include_terminators_ =
      GetBoolParameter("include_terminators", /*default=*/false);
  include_spaces_ = GetBoolParameter("include_spaces", /*default=*/false);
  use_equal_ngram_weight_ =
      GetBoolParameter("use_equal_weight", /*default=*/false);

  // Integer settings: hash modulus (default 10000) and ngram length
  // (default 3).
  ngram_id_dimension_ = GetIntParameter("id_dim", /*default=*/10000);
  ngram_size_ = GetIntParameter("size", /*default=*/3);
}
|
||||
|
||||
// Installs a NumericFeatureType named after this function whose domain size is
// ngram_id_dimension_. |context| is not consulted here.
void ContinuousBagOfNgramsFunction::Init(TaskContext *context) {
  NumericFeatureType *const ngram_id_type =
      new NumericFeatureType(name(), ngram_id_dimension_);
  set_feature_type(ngram_id_type);
}
|
||||
|
||||
void ContinuousBagOfNgramsFunction::Evaluate(const WorkspaceSet &workspaces,
|
||||
const Sentence &sentence,
|
||||
FeatureVector *result) const {
|
||||
// Include terminators for each token. Tokens are discovered by splitting the
|
||||
// text on spaces.
|
||||
std::vector<string> chars;
|
||||
utils::GetUTF8Chars(sentence.text(), &chars);
|
||||
if (include_terminators_) {
|
||||
std::vector<string> new_chars{"^"};
|
||||
for (size_t index = 0; index < chars.size(); ++index) {
|
||||
if (chars.at(index) == " ") {
|
||||
new_chars.push_back("$");
|
||||
new_chars.push_back(" ");
|
||||
new_chars.push_back("^");
|
||||
} else {
|
||||
new_chars.push_back(chars.at(index));
|
||||
}
|
||||
}
|
||||
new_chars.push_back("$");
|
||||
chars.swap(new_chars);
|
||||
}
|
||||
|
||||
// Find the char ngram counts.
|
||||
std::unordered_map<string, int> char_ngram_counts;
|
||||
int count_sum = 0;
|
||||
for (int start = 0; start <= static_cast<int>(chars.size()) - ngram_size_;
|
||||
++start) {
|
||||
string char_ngram;
|
||||
int index;
|
||||
for (index = 0; index < ngram_size_; ++index) {
|
||||
const string ¤t_char = chars.at(start + index);
|
||||
if (current_char == " " && !include_spaces_) {
|
||||
break;
|
||||
}
|
||||
char_ngram.append(current_char);
|
||||
}
|
||||
if (index == ngram_size_) {
|
||||
char_ngram_counts[char_ngram]++;
|
||||
++count_sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Populate the feature vector.
|
||||
const float equal_weight = 1.0 / char_ngram_counts.size();
|
||||
const float norm = static_cast<float>(count_sum);
|
||||
for (const auto &ngram_and_count : char_ngram_counts) {
|
||||
const float weight =
|
||||
use_equal_ngram_weight_ ? equal_weight : ngram_and_count.second / norm;
|
||||
FloatFeatureValue value(
|
||||
utils::Hash32WithDefaultSeed(ngram_and_count.first) %
|
||||
ngram_id_dimension_,
|
||||
weight);
|
||||
result->add(feature_type(), value.discrete_value);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the script of the sentence text as a feature value. Every script
// other than ULScript_Hani is returned unchanged. For ULScript_Hani the
// non-space codepoints are tallied: if Hangul codepoints outnumber the rest,
// NUM_ULSCRIPTS is returned, otherwise ULScript_Hani. |workspaces| and
// |result| are not used.
FeatureValue ScriptFeature::Compute(const WorkspaceSet &workspaces,
                                    const Sentence &sentence,
                                    const FeatureVector *result) const {
  const string &text = sentence.text();
  CLD2::ScriptScanner scanner(text.c_str(), text.size(),
                              /*is_plain_text=*/true);

  // GetOneScriptSpan() is called only once because of the assumption that the
  // input contains one script. This function also cleans up the input (e.g.,
  // removes digits, punctuation).
  // TODO(abakalov): Extract the clean-up and script detection code out of
  // GetOneScriptSpan() because we don't have to iterate over the whole text,
  // just look at the first codepoint after clean-up.
  CLD2::LangSpan span;
  scanner.GetOneScriptSpan(&span);
  const CLD2::ULScript script = span.ulscript;

  // Only Hani needs the extra Hangul / non-Hangul split.
  if (script != CLD2::ULScript_Hani) {
    return script;
  }

  // True when the codepoint falls in one of the ranges associated with Hangul.
  const auto is_hangul = [](const chrome_lang_id::char32 cp) {
    return (cp >= 0x1100 && cp <= 0x11FF) ||  // Hangul Jamo
           (cp >= 0xA960 && cp <= 0xA97F) ||  // Jamo Extended A
           (cp >= 0xD7B0 && cp <= 0xD7FF) ||  // Jamo Extended B
           (cp >= 0x3130 && cp <= 0x318F) ||  // Compatibility Jamo
           (cp >= 0xFFA0 && cp <= 0xFFDC) ||  // Halfwidth Jamo
           (cp >= 0xAC00 && cp <= 0xD7AF);    // Hangul Syllables
  };

  // Out of the codepoints captured by ULScript_Hani, separately count those in
  // Hangul (Korean script) and those in any other script.
  int hangul_count = 0;
  int other_count = 0;
  UnicodeText span_text;
  span_text.PointToUTF8(span.text, span.text_bytes);
  for (chrome_lang_id::char32 codepoint : span_text) {
    if (codepoint == 0x20) {
      continue;  // Spaces count toward neither tally.
    }
    if (is_hangul(codepoint)) {
      ++hangul_count;
    } else {
      ++other_count;
    }
  }

  // A strict majority of Hangul codepoints selects the extra "Korean" slot.
  return hangul_count > other_count
             ? static_cast<FeatureValue>(CLD2::NUM_ULSCRIPTS)
             : static_cast<FeatureValue>(CLD2::ULScript_Hani);
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
116
Telegram/ThirdParty/cld3/src/language_identifier_features.h
vendored
Normal file
116
Telegram/ThirdParty/cld3/src/language_identifier_features.h
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
|
||||
#define LANGUAGE_IDENTIFIER_FEATURES_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "feature_types.h"
|
||||
#include "script_span/generated_ulscript.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Feature type for numeric features.
|
||||
class NumericFeatureType : public FeatureType {
|
||||
public:
|
||||
// Initializes numeric feature.
|
||||
NumericFeatureType(const string &name, FeatureValue size);
|
||||
|
||||
// Returns numeric feature value.
|
||||
string GetFeatureValueName(FeatureValue value) const override;
|
||||
|
||||
// Returns the number of feature values.
|
||||
FeatureValue GetDomainSize() const override;
|
||||
|
||||
private:
|
||||
FeatureValue size_;
|
||||
};
|
||||
|
||||
// Class for computing continuous char ngram features.
|
||||
// Feature function descriptor parameters:
|
||||
// include_terminators(bool, false):
|
||||
// If 'true', then splits the text based on spaces to get tokens, adds "^"
|
||||
// to the beginning of each token, and adds "$" to the end of each token.
|
||||
// include_spaces(bool, false):
|
||||
// If 'true', then includes char ngrams containing spaces.
|
||||
// use_equal_weight(bool, false):
|
||||
// If 'true', then weighs each unique ngram by 1.0 / (number of unique
|
||||
// ngrams in the input). Otherwise, weighs each unique ngram by (ngram
|
||||
// count) / (total number of ngrams).
|
||||
// id_dim(int, 10000):
|
||||
// The integer id of each char ngram is computed as follows:
|
||||
// Hash32WithDefaultSeed(char ngram) % id_dim.
|
||||
// size(int, 3):
|
||||
// Only ngrams of this size will be extracted.
|
||||
class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {
|
||||
public:
|
||||
void Setup(TaskContext *context) override;
|
||||
void Init(TaskContext *context) override;
|
||||
|
||||
// Appends the features computed from the focus to the feature vector.
|
||||
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
|
||||
FeatureVector *result) const override;
|
||||
|
||||
private:
|
||||
// If 'true', then splits the text based on spaces to get tokens, adds "^" to
|
||||
// the beginning of each token, and adds "$" to the end of each token.
|
||||
bool include_terminators_;
|
||||
|
||||
// If 'true', then includes char ngrams containing spaces.
|
||||
bool include_spaces_;
|
||||
|
||||
// If 'true', then weighs each unique ngram by 1.0 / (number of unique ngrams
|
||||
// in the input). Otherwise, weighs each unique ngram by (ngram count) /
|
||||
// (total number of ngrams).
|
||||
bool use_equal_ngram_weight_;
|
||||
|
||||
// The integer id of each char ngram is computed as follows:
|
||||
// Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
|
||||
int ngram_id_dimension_;
|
||||
|
||||
// Only ngrams of size ngram_size_ will be extracted.
|
||||
int ngram_size_;
|
||||
};
|
||||
|
||||
// Class for detecting the script of a piece of text. The list of supported
|
||||
// scripts is chrome_lang_id::CLD2::ULScript. This class uses the script
|
||||
// recognition code ported from CLD2. ULScript_Hani is split into non-Korean
|
||||
// script and Korean script (Hangul). In the former case, the function emits
|
||||
// ULScript_Hani. In the latter case, the function emits NUM_ULSCRIPTS. The
|
||||
// class assumes that the input is (1) interchange valid UTF8, and (2) contains
|
||||
// only one chrome_lang_id::CLD2::ULScript.
|
||||
class ScriptFeature : public WholeSentenceFeature {
|
||||
public:
|
||||
void Init(TaskContext *context) override {
|
||||
// The dimension is incremented by 1 because ULScript_Hani is split into two
|
||||
// as mentioned in the class description.
|
||||
set_feature_type(new NumericFeatureType(
|
||||
name(), chrome_lang_id::CLD2::NUM_ULSCRIPTS + 1));
|
||||
}
|
||||
|
||||
// Computes the feature and saves it in the feature vector.
|
||||
FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
|
||||
const FeatureVector *result) const override;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // LANGUAGE_IDENTIFIER_FEATURES_H_
|
||||
261
Telegram/ThirdParty/cld3/src/language_identifier_features_test.cc
vendored
Normal file
261
Telegram/ThirdParty/cld3/src/language_identifier_features_test.cc
vendored
Normal file
@@ -0,0 +1,261 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
#include "base.h"
|
||||
#include "feature_extractor.h"
|
||||
#include "language_identifier_features.h"
|
||||
#include "nnet_language_identifier.h"
|
||||
#include "script_span/generated_ulscript.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "task_context.h"
|
||||
#include "utils.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace language_identifier_features_test {
|
||||
|
||||
// Factory handed to the registrar: allocates a fresh
// ContinuousBagOfNgramsFunction.
static WholeSentenceFeature *cbog_factory() {
  WholeSentenceFeature *const feature = new ContinuousBagOfNgramsFunction;
  return feature;
}
|
||||
|
||||
// Factory handed to the registrar: allocates a fresh ScriptFeature.
static WholeSentenceFeature *sf_factory() {
  WholeSentenceFeature *const feature = new ScriptFeature;
  return feature;
}
|
||||
|
||||
// Class for calculating the feature weights and ids.
|
||||
class FeatureIdWeightCalculator {
|
||||
public:
|
||||
explicit FeatureIdWeightCalculator(TaskContext *context) {
|
||||
if (WholeSentenceFeature::registry() == nullptr) {
|
||||
// Create registry for our WholeSentenceFeature(s).
|
||||
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
|
||||
"sentence feature function", "WholeSentenceFeature", __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
|
||||
// Register our WholeSentenceFeature(s).
|
||||
// Register ContinuousBagOfNgramsFunction feature function.
|
||||
static WholeSentenceFeature::Registry::Registrar cbog_registrar(
|
||||
WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
|
||||
"ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);
|
||||
|
||||
// Register Script feature function.
|
||||
static WholeSentenceFeature::Registry::Registrar sf_registrar(
|
||||
WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
|
||||
__LINE__, sf_factory);
|
||||
|
||||
feature_extractor_.Setup(context);
|
||||
feature_extractor_.Init(context);
|
||||
}
|
||||
|
||||
// Assumes that a single feature is specified and extracts it.
|
||||
void ExtractOnlyFeature(Sentence *sentence,
|
||||
std::vector<FeatureVector> *features) {
|
||||
CLD3_CHECK(features->size() == 1);
|
||||
WorkspaceSet workspace;
|
||||
workspace.Reset(workspace_registry_);
|
||||
feature_extractor_.Preprocess(&workspace, sentence);
|
||||
feature_extractor_.ExtractFeatures(workspace, *sentence, features);
|
||||
CLD3_CHECK(features->size() == 1);
|
||||
}
|
||||
|
||||
// Returns a map from feature value id to feature value weight.
|
||||
std::unordered_map<int, float> GetFloatFeatureValIdsAndWeights(
|
||||
Sentence *sentence) {
|
||||
std::vector<FeatureVector> feature_vectors(1); // one feature space
|
||||
ExtractOnlyFeature(sentence, &feature_vectors);
|
||||
const FeatureVector &feature_vector = feature_vectors.at(0);
|
||||
|
||||
// Save the (feature value id, feature value weight) pairs to a map.
|
||||
std::unordered_map<int, float> feature_id_weight;
|
||||
for (int index = 0; index < feature_vector.size(); ++index) {
|
||||
const FloatFeatureValue feature_value =
|
||||
FloatFeatureValue(feature_vector.value(index));
|
||||
feature_id_weight[feature_value.value.id] = feature_value.value.weight;
|
||||
}
|
||||
return feature_id_weight;
|
||||
}
|
||||
|
||||
// Returns the feature value ids.
|
||||
std::set<int> GetFeatureValueIds(Sentence *sentence) {
|
||||
std::vector<FeatureVector> feature_vectors(1); // one feature space
|
||||
ExtractOnlyFeature(sentence, &feature_vectors);
|
||||
const FeatureVector &feature_vector = feature_vectors.at(0);
|
||||
|
||||
std::set<int> ids;
|
||||
for (int index = 0; index < feature_vector.size(); ++index) {
|
||||
ids.insert(feature_vector.value(index));
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
private:
|
||||
// The registry of shared workspaces in the feature extractor.
|
||||
WorkspaceRegistry workspace_registry_;
|
||||
LanguageIdEmbeddingFeatureExtractor feature_extractor_;
|
||||
};
|
||||
|
||||
// Extracts features and checks that their ids and weights are correct.
|
||||
bool ExtractAndCheckFeatures(const string &features, const int id_dim,
|
||||
const std::vector<string> &expected_char_ngrams,
|
||||
const std::vector<float> &expected_weights,
|
||||
Sentence *sentence) {
|
||||
TaskContext context;
|
||||
context.SetParameter("language_identifier_features", features);
|
||||
FeatureIdWeightCalculator calc(&context);
|
||||
|
||||
// Get the feature ids and the corresponding weights.
|
||||
const std::unordered_map<int, float> feature_id_weight =
|
||||
calc.GetFloatFeatureValIdsAndWeights(sentence);
|
||||
if (feature_id_weight.size() != expected_char_ngrams.size()) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Number of expected feature ids: "
|
||||
<< expected_char_ngrams.size() << std::endl;
|
||||
std::cout << " Number of extracted feature ids: "
|
||||
<< feature_id_weight.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Specifies how close two float values should be to be considered equal.
|
||||
const float epsilon = 0.0001f;
|
||||
bool test_successful = true;
|
||||
for (size_t i = 0; i < expected_char_ngrams.size(); ++i) {
|
||||
const int expected_id =
|
||||
utils::Hash32WithDefaultSeed(expected_char_ngrams.at(i)) % id_dim;
|
||||
|
||||
// Check the ids and the weights.
|
||||
if (feature_id_weight.count(expected_id) == 0) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Feature id " << expected_id << " is missing" << std::endl;
|
||||
test_successful = false;
|
||||
} else {
|
||||
if (std::abs(feature_id_weight.at(expected_id) - expected_weights.at(i)) >
|
||||
epsilon) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Different weight for feature id " << expected_id
|
||||
<< ": expected weight " << expected_weights.at(i)
|
||||
<< ", actual weight " << feature_id_weight.at(expected_id)
|
||||
<< std::endl;
|
||||
test_successful = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (test_successful) {
|
||||
std::cout << " Success!" << std::endl;
|
||||
}
|
||||
return test_successful;
|
||||
}
|
||||
|
||||
// Tests the case when ngram features get equal weight. Returns "true" if the
|
||||
// test is successful and "false" otherwise.
|
||||
bool TestExtractFeaturesWithEqualWeight() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// The integer id of each char ngram is computed as follows:
|
||||
// utils::Hash32WithDefaultSeed(char ngram) % id_dim.
|
||||
const int id_dim = 100;
|
||||
const string features = "continuous-bag-of-ngrams(id_dim=" +
|
||||
std::to_string(id_dim) +
|
||||
",size=2,include_terminators=true,include_" +
|
||||
"spaces=false,use_equal_weight=true)";
|
||||
Sentence sentence;
|
||||
sentence.set_text("aa aab");
|
||||
const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
|
||||
const std::vector<float> expected_weights = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
|
||||
return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
|
||||
expected_weights, &sentence);
|
||||
}
|
||||
|
||||
// Tests the case when ngram features get weights equal to their normalized
|
||||
// counts. Returns "true" if the test is successful and "false" otherwise.
|
||||
bool TestExtractFeaturesWithNonEqualWeight() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// The integer id of each char ngram is computed as follows:
|
||||
// utils::Hash32WithDefaultSeed(char ngram) % id_dim.
|
||||
const int id_dim = 100;
|
||||
const string features = "continuous-bag-of-ngrams(id_dim=" +
|
||||
std::to_string(id_dim) +
|
||||
",size=2,include_terminators=true,include_" +
|
||||
"spaces=false,use_equal_weight=false)";
|
||||
Sentence sentence;
|
||||
sentence.set_text("aa aab");
|
||||
const std::vector<string> expected_char_ngrams{"ab", "b$", "^a", "aa", "a$"};
|
||||
const std::vector<float> expected_weights{0.1428f, 0.1428f, 0.2857f, 0.2857f,
|
||||
0.1428f};
|
||||
return ExtractAndCheckFeatures(features, id_dim, expected_char_ngrams,
|
||||
expected_weights, &sentence);
|
||||
}
|
||||
|
||||
// Tests the feature Script.
|
||||
bool TestScriptFeature() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
bool test_successful = true;
|
||||
TaskContext context;
|
||||
context.SetParameter("language_identifier_features", "script");
|
||||
FeatureIdWeightCalculator calc(&context);
|
||||
|
||||
// Check the script of the English sentence.
|
||||
Sentence sentence;
|
||||
sentence.set_text("food");
|
||||
std::set<int> feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
||||
if (feature_val_ids.size() != 1 ||
|
||||
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Latin) == 0) {
|
||||
test_successful = false;
|
||||
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
||||
}
|
||||
|
||||
// Check the script of a Chinese sentence.
|
||||
sentence.set_text("字");
|
||||
feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
||||
if (feature_val_ids.size() != 1 ||
|
||||
feature_val_ids.count(chrome_lang_id::CLD2::ULScript_Hani) == 0) {
|
||||
test_successful = false;
|
||||
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
||||
}
|
||||
|
||||
// Check the script of a Korean sentence.
|
||||
sentence.set_text("워드");
|
||||
feature_val_ids = calc.GetFeatureValueIds(&sentence);
|
||||
if (feature_val_ids.size() != 1 ||
|
||||
feature_val_ids.count(chrome_lang_id::CLD2::NUM_ULSCRIPTS) == 0) {
|
||||
test_successful = false;
|
||||
std::cout << " Failure for input: " << sentence.text() << std::endl;
|
||||
}
|
||||
|
||||
if (test_successful) {
|
||||
std::cout << " Success!" << std::endl;
|
||||
}
|
||||
return test_successful;
|
||||
}
|
||||
|
||||
} // namespace language_identifier_features_test
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
// Runs the feature extraction tests.
|
||||
int main(int argc, char **argv) {
|
||||
const bool tests_successful =
|
||||
chrome_lang_id::language_identifier_features_test::
|
||||
TestExtractFeaturesWithEqualWeight() &&
|
||||
chrome_lang_id::language_identifier_features_test::
|
||||
TestExtractFeaturesWithNonEqualWeight() &&
|
||||
chrome_lang_id::language_identifier_features_test::TestScriptFeature();
|
||||
return tests_successful ? 0 : 1;
|
||||
}
|
||||
54
Telegram/ThirdParty/cld3/src/language_identifier_main.cc
vendored
Normal file
54
Telegram/ThirdParty/cld3/src/language_identifier_main.cc
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "nnet_language_identifier.h"
|
||||
|
||||
using chrome_lang_id::NNetLanguageIdentifier;
|
||||
|
||||
// Runs a neural net model for language identification.
|
||||
int main(int argc, char **argv) {
|
||||
NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
|
||||
/*max_num_bytes=*/1000);
|
||||
|
||||
const std::vector<std::string> texts{"This text is written in English.",
|
||||
"Text in deutscher Sprache verfasst."};
|
||||
for (const std::string &text : texts) {
|
||||
const NNetLanguageIdentifier::Result result = lang_id.FindLanguage(text);
|
||||
std::cout << "text: " << text << std::endl
|
||||
<< " language: " << result.language << std::endl
|
||||
<< " probability: " << result.probability << std::endl
|
||||
<< " reliable: " << result.is_reliable << std::endl
|
||||
<< " proportion: " << result.proportion << std::endl
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
const std::string &text =
|
||||
"This piece of text is in English. Този текст е на Български.";
|
||||
std::cout << "text: " << text << std::endl;
|
||||
const std::vector<NNetLanguageIdentifier::Result> results =
|
||||
lang_id.FindTopNMostFreqLangs(text, /*num_langs*/ 3);
|
||||
for (const NNetLanguageIdentifier::Result &result : results) {
|
||||
std::cout << " language: " << result.language << std::endl
|
||||
<< " probability: " << result.probability << std::endl
|
||||
<< " reliable: " << result.is_reliable << std::endl
|
||||
<< " proportion: " << result.proportion << std::endl
|
||||
<< std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
254
Telegram/ThirdParty/cld3/src/nnet_lang_id_test.cc
vendored
Normal file
254
Telegram/ThirdParty/cld3/src/nnet_lang_id_test.cc
vendored
Normal file
@@ -0,0 +1,254 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
#include "nnet_lang_id_test_data.h"
|
||||
#include "nnet_language_identifier.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace nnet_lang_id_test {
|
||||
|
||||
// Tests the model on all supported languages. Returns "true" if the test is
|
||||
// successful and "false" otherwise.
|
||||
// TODO(abakalov): Add a test for random input that should be labeled as
|
||||
// "unknown" due to low confidence.
|
||||
bool TestPredictions() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// (gold language, sample text) pairs used for testing.
|
||||
const std::vector<std::pair<std::string, std::string>> gold_lang_text = {
|
||||
{"af", NNetLangIdTestData::kTestStrAF},
|
||||
{"ar", NNetLangIdTestData::kTestStrAR},
|
||||
{"az", NNetLangIdTestData::kTestStrAZ},
|
||||
{"be", NNetLangIdTestData::kTestStrBE},
|
||||
{"bg", NNetLangIdTestData::kTestStrBG},
|
||||
{"bn", NNetLangIdTestData::kTestStrBN},
|
||||
{"bs", NNetLangIdTestData::kTestStrBS},
|
||||
{"ca", NNetLangIdTestData::kTestStrCA},
|
||||
{"ceb", NNetLangIdTestData::kTestStrCEB},
|
||||
{"cs", NNetLangIdTestData::kTestStrCS},
|
||||
{"cy", NNetLangIdTestData::kTestStrCY},
|
||||
{"da", NNetLangIdTestData::kTestStrDA},
|
||||
{"de", NNetLangIdTestData::kTestStrDE},
|
||||
{"el", NNetLangIdTestData::kTestStrEL},
|
||||
{"en", NNetLangIdTestData::kTestStrEN},
|
||||
{"eo", NNetLangIdTestData::kTestStrEO},
|
||||
{"es", NNetLangIdTestData::kTestStrES},
|
||||
{"et", NNetLangIdTestData::kTestStrET},
|
||||
{"eu", NNetLangIdTestData::kTestStrEU},
|
||||
{"fa", NNetLangIdTestData::kTestStrFA},
|
||||
{"fi", NNetLangIdTestData::kTestStrFI},
|
||||
{"fil", NNetLangIdTestData::kTestStrFIL},
|
||||
{"fr", NNetLangIdTestData::kTestStrFR},
|
||||
{"ga", NNetLangIdTestData::kTestStrGA},
|
||||
{"gl", NNetLangIdTestData::kTestStrGL},
|
||||
{"gu", NNetLangIdTestData::kTestStrGU},
|
||||
{"ha", NNetLangIdTestData::kTestStrHA},
|
||||
{"hi", NNetLangIdTestData::kTestStrHI},
|
||||
{"hmn", NNetLangIdTestData::kTestStrHMN},
|
||||
{"hr", NNetLangIdTestData::kTestStrHR},
|
||||
{"ht", NNetLangIdTestData::kTestStrHT},
|
||||
{"hu", NNetLangIdTestData::kTestStrHU},
|
||||
{"hy", NNetLangIdTestData::kTestStrHY},
|
||||
{"id", NNetLangIdTestData::kTestStrID},
|
||||
{"ig", NNetLangIdTestData::kTestStrIG},
|
||||
{"is", NNetLangIdTestData::kTestStrIS},
|
||||
{"it", NNetLangIdTestData::kTestStrIT},
|
||||
{"iw", NNetLangIdTestData::kTestStrIW},
|
||||
{"ja", NNetLangIdTestData::kTestStrJA},
|
||||
{"jv", NNetLangIdTestData::kTestStrJV},
|
||||
{"ka", NNetLangIdTestData::kTestStrKA},
|
||||
{"kk", NNetLangIdTestData::kTestStrKK},
|
||||
{"km", NNetLangIdTestData::kTestStrKM},
|
||||
{"kn", NNetLangIdTestData::kTestStrKN},
|
||||
{"ko", NNetLangIdTestData::kTestStrKO},
|
||||
{"la", NNetLangIdTestData::kTestStrLA},
|
||||
{"lo", NNetLangIdTestData::kTestStrLO},
|
||||
{"lt", NNetLangIdTestData::kTestStrLT},
|
||||
{"lv", NNetLangIdTestData::kTestStrLV},
|
||||
{"mg", NNetLangIdTestData::kTestStrMG},
|
||||
{"mi", NNetLangIdTestData::kTestStrMI},
|
||||
{"mk", NNetLangIdTestData::kTestStrMK},
|
||||
{"ml", NNetLangIdTestData::kTestStrML},
|
||||
{"mn", NNetLangIdTestData::kTestStrMN},
|
||||
{"mr", NNetLangIdTestData::kTestStrMR},
|
||||
{"ms", NNetLangIdTestData::kTestStrMS},
|
||||
{"mt", NNetLangIdTestData::kTestStrMT},
|
||||
{"my", NNetLangIdTestData::kTestStrMY},
|
||||
{"ne", NNetLangIdTestData::kTestStrNE},
|
||||
{"nl", NNetLangIdTestData::kTestStrNL},
|
||||
{"no", NNetLangIdTestData::kTestStrNO},
|
||||
{"ny", NNetLangIdTestData::kTestStrNY},
|
||||
{"pa", NNetLangIdTestData::kTestStrPA},
|
||||
{"pl", NNetLangIdTestData::kTestStrPL},
|
||||
{"pt", NNetLangIdTestData::kTestStrPT},
|
||||
{"ro", NNetLangIdTestData::kTestStrRO},
|
||||
{"ru", NNetLangIdTestData::kTestStrRU},
|
||||
{"si", NNetLangIdTestData::kTestStrSI},
|
||||
{"sk", NNetLangIdTestData::kTestStrSK},
|
||||
{"sl", NNetLangIdTestData::kTestStrSL},
|
||||
{"so", NNetLangIdTestData::kTestStrSO},
|
||||
{"sq", NNetLangIdTestData::kTestStrSQ},
|
||||
{"sr", NNetLangIdTestData::kTestStrSR},
|
||||
{"st", NNetLangIdTestData::kTestStrST},
|
||||
{"su", NNetLangIdTestData::kTestStrSU},
|
||||
{"sv", NNetLangIdTestData::kTestStrSV},
|
||||
{"sw", NNetLangIdTestData::kTestStrSW},
|
||||
{"ta", NNetLangIdTestData::kTestStrTA},
|
||||
{"te", NNetLangIdTestData::kTestStrTE},
|
||||
{"tg", NNetLangIdTestData::kTestStrTG},
|
||||
{"th", NNetLangIdTestData::kTestStrTH},
|
||||
{"tr", NNetLangIdTestData::kTestStrTR},
|
||||
{"uk", NNetLangIdTestData::kTestStrUK},
|
||||
{"ur", NNetLangIdTestData::kTestStrUR},
|
||||
{"uz", NNetLangIdTestData::kTestStrUZ},
|
||||
{"vi", NNetLangIdTestData::kTestStrVI},
|
||||
{"yi", NNetLangIdTestData::kTestStrYI},
|
||||
{"yo", NNetLangIdTestData::kTestStrYO},
|
||||
{"zh", NNetLangIdTestData::kTestStrZH},
|
||||
{"zu", NNetLangIdTestData::kTestStrZU}};
|
||||
|
||||
NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
|
||||
/*max_num_bytes=*/1000);
|
||||
|
||||
// Iterate over all the test instances, make predictions and check that they
|
||||
// are correct.
|
||||
int num_wrong = 0;
|
||||
for (const auto &test_instance : gold_lang_text) {
|
||||
const std::string &expected_lang = test_instance.first;
|
||||
const std::string &text = test_instance.second;
|
||||
|
||||
const NNetLanguageIdentifier::Result result = lang_id.FindLanguage(text);
|
||||
if (result.language != expected_lang) {
|
||||
++num_wrong;
|
||||
std::cout << " Misclassification: " << std::endl;
|
||||
std::cout << " Text: " << text << std::endl;
|
||||
std::cout << " Expected language: " << expected_lang << std::endl;
|
||||
std::cout << " Predicted language: " << result.language << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if (num_wrong == 0) {
|
||||
std::cout << " Success!" << std::endl;
|
||||
return true;
|
||||
} else {
|
||||
std::cout << " Failure: " << num_wrong << " wrong predictions"
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the model on input containing multiple languages of different scripts.
|
||||
// Returns "true" if the test is successful and "false" otherwise.
|
||||
bool TestMultipleLanguagesInInput() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// Text containing snippets in English and Bulgarian.
|
||||
const std::string text =
|
||||
"This piece of text is in English. Този текст е на Български.";
|
||||
|
||||
// Expected language spans in the input text, corresponding respectively to
|
||||
// Bulgarian and English.
|
||||
const std::string expected_bg_span = " Този текст е на Български ";
|
||||
const std::string expected_en_span = " This piece of text is in English ";
|
||||
const float expected_byte_sum =
|
||||
static_cast<float>(expected_bg_span.size() + expected_en_span.size());
|
||||
|
||||
// Number of languages to query for and the expected byte proportions.
|
||||
const int num_queried_langs = 3;
|
||||
const std::unordered_map<string, float> expected_lang_proportions{
|
||||
{"bg", expected_bg_span.size() / expected_byte_sum},
|
||||
{"en", expected_en_span.size() / expected_byte_sum},
|
||||
{NNetLanguageIdentifier::kUnknown, 0.0}};
|
||||
|
||||
NNetLanguageIdentifier lang_id(/*min_num_bytes=*/0,
|
||||
/*max_num_bytes=*/1000);
|
||||
const std::vector<NNetLanguageIdentifier::Result> results =
|
||||
lang_id.FindTopNMostFreqLangs(text, num_queried_langs);
|
||||
|
||||
if (results.size() != expected_lang_proportions.size()) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Wrong number of languages: expected "
|
||||
<< expected_lang_proportions.size() << ", obtained "
|
||||
<< results.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Iterate over the results and check that the correct proportions are
|
||||
// returned for the expected languages.
|
||||
const float epsilon = 0.00001f;
|
||||
for (const NNetLanguageIdentifier::Result &result : results) {
|
||||
if (expected_lang_proportions.count(result.language) == 0) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Incorrect language: " << result.language << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (std::abs(result.proportion -
|
||||
expected_lang_proportions.at(result.language)) > epsilon) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Language " << result.language << ": expected proportion "
|
||||
<< expected_lang_proportions.at(result.language) << ", got "
|
||||
<< result.proportion << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skip over undefined language.
|
||||
if (result.language == "und")
|
||||
continue;
|
||||
if (result.byte_ranges.size() != 1) {
|
||||
std::cout << " Should only detect one span containing " << result.language
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
// Check that specified byte ranges for language are correct.
|
||||
int start_index = result.byte_ranges[0].start_index;
|
||||
int end_index = result.byte_ranges[0].end_index;
|
||||
std::string byte_ranges_text = text.substr(start_index, end_index - start_index);
|
||||
if (result.language == "bg") {
|
||||
if (byte_ranges_text.compare("Този текст е на Български.") != 0) {
|
||||
std::cout << " Incorrect byte ranges returned for Bulgarian " << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else if (result.language == "en") {
|
||||
if (byte_ranges_text.compare("This piece of text is in English. ") != 0) {
|
||||
std::cout << " Incorrect byte ranges returned for English " << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
std::cout << " Got language other than English or Bulgarian "
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::cout << " Success!" << std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace nnet_lang_id_test
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
// Runs tests for the language identification model.
|
||||
int main(int argc, char **argv) {
|
||||
const bool tests_successful =
|
||||
chrome_lang_id::nnet_lang_id_test::TestPredictions() &&
|
||||
chrome_lang_id::nnet_lang_id_test::TestMultipleLanguagesInInput();
|
||||
return tests_successful ? 0 : 1;
|
||||
}
|
||||
529
Telegram/ThirdParty/cld3/src/nnet_lang_id_test_data.cc
vendored
Normal file
529
Telegram/ThirdParty/cld3/src/nnet_lang_id_test_data.cc
vendored
Normal file
@@ -0,0 +1,529 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "nnet_lang_id_test_data.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrAF =
|
||||
"Dit is 'n kort stukkie van die teks wat gebruik sal word vir die toets "
|
||||
"van die akkuraatheid van die nuwe benadering.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrAR = "احتيالية بيع أي حساب";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrAZ =
|
||||
" a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri "
|
||||
"keçirilib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd "
|
||||
"peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında "
|
||||
"ölüb";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrBE =
|
||||
" а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым часам "
|
||||
"нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrBG =
|
||||
" а дума попада в състояние на изпитание ключовите думи с предсказана "
|
||||
"малко под то изискване на страниците за търсене в";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrBN =
|
||||
"গ্যালারির ৩৮ বছর পূর্তিতে মূল্যছাড় অর্থনীতি বিএনপির ওয়াক আউট তপন"
|
||||
" চৌধুরী হারবাল অ্যাসোসিয়েশনের সভাপতি আন্তর্জাতিক পরামর্শক "
|
||||
"বোর্ড দিয়ে শরিয়াহ্ ইনন্ডেক্স করবে "
|
||||
"সিএসই মালিকপক্ষের কান্না, শ্রমিকের অনিশ্চয়তা মতিঝিলে সমাবেশ নিষিদ্ধ: "
|
||||
"এফবিসিসিআইয়ের ধন্যবাদ বিনোদন বিশেষ প্রতিবেদন বাংলালিংকের গ্র্যান্ডমাস্টার "
|
||||
"সিজন-৩ ব্রাজিলে বিশ্বকাপ ফুটবল আয়োজনবিরোধী বিক্ষোভ দেশের নিরাপত্তার"
|
||||
" চেয়ে অনেক বেশি সচেতন । প্রার্থীদের দক্ষতা ও যোগ্যতার"
|
||||
" পাশাপাশি তারা জাতীয় ইস্যুগুলোতে প্রাধান্য দিয়েছেন । ” পাঁচটি সিটিতে ২০"
|
||||
" লাখ ভোটারদের দিয়ে জাতীয় নির্বাচনে ৮ কোটি ভোটারদের"
|
||||
" সঙ্গে তুলনা করা যাবে কি একজন দর্শকের এমন প্রশ্নে জবাবে আব্দুল্লাহ "
|
||||
"আল নোমান বলেন , “ এই পাঁচটি সিটি কর্পোরেশন নির্বাচন দেশের পাঁচটি বড়"
|
||||
" বিভাগের প্রতিনিধিত্ব করছে । এছাড়া এখানকার ভোটার রা সবাই সচেতন । তারা";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrBS =
|
||||
"Novi predsjednik Mešihata Islamske zajednice u Srbiji (IZuS) i muftija "
|
||||
"dr. Mevlud ef. Dudić izjavio je u intervjuu za Anadolu Agency (AA) kako "
|
||||
"je uvjeren da će doći do vraćanja jedinstva među muslimanima i unutar "
|
||||
"Islamske zajednice na prostoru Sandžaka, te da je njegova ruka pružena za "
|
||||
"povratak svih u okrilje Islamske zajednice u Srbiji nakon skoro sedam "
|
||||
"godina podjela u tom dijelu Srbije. Dudić je za predsjednika Mešihata IZ "
|
||||
"u Srbiji izabran 4. januara, a zvanična inauguracija će biti obavljena u "
|
||||
"prvoj polovini februara. Kako se očekuje, prisustvovat će joj i "
|
||||
"reisu-l-ulema Islamske zajednice u Srbiji Husein ef. Kavazović koji će i "
|
||||
"zvanično promovirati Dudića u novog prvog čovjeka IZ u Srbiji. Dudić će "
|
||||
"danas boraviti u prvoj zvaničnoj posjeti reisu Kavazoviću, što je njegov "
|
||||
"privi simbolični potez nakon imenovanja. ";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrCA =
|
||||
"al final en un únic lloc nhorabona l correu electrònic està concebut com "
|
||||
"a eina de productivitat aleshores per què perdre el temps arxivant "
|
||||
"missatges per després intentar recordar on els veu desar i per què heu d "
|
||||
"eliminar missatges importants per l";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrCEB =
|
||||
"Ang Sugbo usa sa mga labing ugmad nga lalawigan sa nasod. Kini ang sentro "
|
||||
"sa komersyo, edukasyon ug industriya sa sentral ug habagatang dapit sa "
|
||||
"kapupod-an. Ang mipadayag sa Sugbo isip ikapito nga labing nindot nga "
|
||||
"pulo sa , ang nag-inusarang pulo sa Pilipinas nga napasidunggan sa maong "
|
||||
"magasin sukad pa sa tuig";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrCS =
|
||||
" a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře "
|
||||
"nelze načíst systémový profil jednotky smoot okud používáte pro určení "
|
||||
"polokoule značky z západ nebo v východ používejte nezáporné hodnoty "
|
||||
"zeměpisné délky nelze";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrCY =
|
||||
" a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi "
|
||||
"fydd yn cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu "
|
||||
"cadw mewn cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am "
|
||||
"gmail mae n gwasanaeth gwebost";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrDA =
|
||||
" a z tallene og punktummer der er tilladte log ud angiv den ønskede "
|
||||
"adgangskode igen november gem personlige oplysninger kontrolspørgsmål det "
|
||||
"sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de "
|
||||
"tegn du kan se i billedet nedenfor";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrDE =
|
||||
" abschnitt ordner aktivieren werden die ordnereinstellungen im "
|
||||
"farbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben "
|
||||
"optional n diesem schritt geben sie für jedesfeld aus dem datenset den "
|
||||
"typ an ieser schritt ist optional eldtypen";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrEL =
|
||||
" ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις "
|
||||
"κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrEN =
|
||||
" a backup credit card by visiting your billing preferences page or visit "
|
||||
"the adwords help centre for more details https adwords google com support "
|
||||
"bin answer py answer hl en we were unable to process the payment of for "
|
||||
"your outstanding google adwords";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrEO =
|
||||
" a jarcento refoje per enmetado de koncerna pastro tiam de reformita "
|
||||
"konfesio ekde refoje ekzistis luteranaj komunumanoj tamen tiuj fondis "
|
||||
"propran komunumon nur en ambaŭ apartenis ekde al la evangela eklezio en "
|
||||
"prusio resp ties rejnlanda provinceklezio en";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrES =
|
||||
" a continuación haz clic en el botón obtener ruta también puedes "
|
||||
"desplazarte hasta el final de la página para cambiar tus opciones de "
|
||||
"búsqueda gráfico y detalles ésta es una lista de los vídeos que te "
|
||||
"recomendamos nuestras recomendaciones se basan";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrET =
|
||||
" a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt "
|
||||
"heaks kiidetud on sinu kohustuseks see krediidilimiit";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrEU =
|
||||
" a den eraso bat honen kontra hortaz eragiketa bakarrik behar dituen "
|
||||
"eraso batek aes apurtuko luke nahiz eta oraingoz eraso bideraezina izan "
|
||||
"gaur egungo teknologiaren mugak direla eta oraingoz kezka hauek alde "
|
||||
"batera utzi daitezke orain arteko indar";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrFA =
|
||||
" آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل "
|
||||
"بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrFI =
|
||||
" a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia "
|
||||
"käyttötietoja kuten virheraportteja käytetään google desktopin "
|
||||
"parantamiseen etsi näyttää mukautettuja uutisia google desktop "
|
||||
"keskivaihto leikkaa voit kaksoisnapsauttaa";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrFIL =
|
||||
"Ito ay isang maikling piraso ng teksto na ito ay gagamitin para sa "
|
||||
"pagsubok ang kawastuhan ng mga bagong diskarte.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrFR =
|
||||
" a accès aux collections et aux frontaux qui lui ont été attribués il "
|
||||
"peut consulter et modifier ses collections et exporter des configurations "
|
||||
"de collection toutefois il ne peut pas créer ni supprimer des collections "
|
||||
"enfin il a accès aux fonctions";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrGA =
|
||||
" a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail "
|
||||
"breise a chur leis na cinn a cuardaíodh cheana chun an cuardach a "
|
||||
"bheachtú nó a chúngú má chuirtear focal breise isteach aimseofar fo aicme "
|
||||
"ar leith de na torthaí a fuarthas";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrGL =
|
||||
" debe ser como mínimo taranto tendas de venda polo miúdo cociñas "
|
||||
"servizos bordado canadá viaxes parques de vehículos de recreo hotel "
|
||||
"oriental habitación recibir unha postal no enderezo indicado "
|
||||
"anteriormente";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrGU =
|
||||
" આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા"
|
||||
" હાલનો જ સંદેશ વિષયની";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHA =
|
||||
" a cikin a kan sakamako daga sakwannin a kan sakamako daga sakwannin daga "
|
||||
"ranar zuwa a kan sakamako daga guda daga ranar zuwa a kan sakamako daga "
|
||||
"shafukan daga ranar zuwa a kan sakamako daga guda a cikin last hour a kan "
|
||||
"sakamako daga guda daga kafar";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHI =
|
||||
" ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने"
|
||||
" विज्ञापनों का अधिकतम लाभ";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHMN =
|
||||
"Qhov no yog ib tug luv luv daim ntawv nyeem uas yuav siv tau rau kev soj "
|
||||
"ntsuam qhov tseeb ntawm tus tshiab mus kom ze.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHR =
|
||||
"Posljednja dva vladara su Kijaksar (Κυαξαρης; 625-585 prije Krista), "
|
||||
"fraortov sin koji će proširiti teritorij Medije i Astijag. Kijaksar je "
|
||||
"imao kćer ili unuku koja se zvala Amitis a postala je ženom "
|
||||
"Nabukodonosora II. kojoj je ovaj izgradio Viseće vrtove Babilona. "
|
||||
"Kijaksar je modernizirao svoju vojsku i uništio Ninivu 612. prije Krista. "
|
||||
"Naslijedio ga je njegov sin, posljednji medijski kralj, Astijag, kojega "
|
||||
"je detronizirao (srušio sa vlasti) njegov unuk Kir Veliki. Zemljom su "
|
||||
"zavladali Perzijanci. Hrvatska je zemlja situacija u Europi. Ona ima "
|
||||
"bogatu kulturu i ukusna jela.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHT =
|
||||
" ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta "
|
||||
"fèt pou li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo "
|
||||
"marye kit yo pa marye tout manman ki fè pitit leta fèt pou ba yo konkoul "
|
||||
"menm jan tou pou timoun piti ak pou";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHU =
|
||||
" a felhasználóim a google azonosító szöveget ikor látják a felhasználóim "
|
||||
"a google azonosító szöveget felhasználók a google azonosító szöveget "
|
||||
"fogják látni minden tranzakció után ha a vásárlását regisztrációját "
|
||||
"oldalunk";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrHY =
|
||||
" ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ "
|
||||
"քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrID =
|
||||
"berdiri setelah pengurusnya yang berusia 83 tahun, Fayzrahman Satarov, "
|
||||
"mendeklarasikan diri sebagai nabi dan rumahnya sebagai negara Islam "
|
||||
"Satarov digambarkan sebagai mantan ulama Islam tahun 1970-an. "
|
||||
"Pengikutnya didorong membaca manuskripnya dan kebanyakan dilarang "
|
||||
"meninggalkan tempat persembunyian bawah tanah di dasar gedung delapan "
|
||||
"lantai mereka. Jaksa membuka penyelidikan kasus kriminal pada kelompok "
|
||||
"itu dan menyatakan akan membubarkan kelompok kalau tetap melakukan "
|
||||
"kegiatan ilegal seperti mencegah anggotanya mencari bantuan medis atau "
|
||||
"pendidikan. Sampai sekarang pihak berwajib belum melakukan penangkapan "
|
||||
"meskipun polisi mencurigai adanya tindak kekerasan pada anak. Pengadilan "
|
||||
"selanjutnya akan memutuskan apakah anak-anak diizinkan tetap tinggal "
|
||||
"dengan orang tua mereka. Kazan yang berada sekitar 800 kilometer di timur "
|
||||
"Moskow merupakan wilayah Tatarstan yang";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrIG =
|
||||
"Chineke bụ aha ọzọ ndï omenala Igbo kpọro Chukwu. Mgbe ndị bekee bịara, "
|
||||
"ha mee ya nke ndi Christian. N'echiche ndi ekpere chi Omenala Ndi Igbo, "
|
||||
"Christianity, Judaism, ma Islam, Chineke nwere ọtụtụ utu aha, ma nwee "
|
||||
"nanị otu aha. Ụzọ abụọ e si akpọ aha ahụ bụ Jehovah ma Ọ bụ Yahweh. Na "
|
||||
"ọtụtụ Akwụkwọ Nsọ, e wepụla aha Chineke ma jiri utu aha bụ Onyenwe Anyị "
|
||||
"ma ọ bụ Chineke dochie ya. Ma mgbe e dere akwụkwọ nsọ, aha ahụ bụ Jehova "
|
||||
"pụtara n’ime ya, ihe dị ka ugboro pụkụ asaa(7,000).";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrIS =
|
||||
" a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa "
|
||||
"byggja upp aðallista yfir ný leitarorð fyrir auglýsingahópana og skoða "
|
||||
"ítarleg gögn um árangur leitarorða eins og samkeppni auglýsenda og "
|
||||
"leitarmagn er krafist notkun";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrIT =
|
||||
" a causa di un intervento di manutenzione del sistema fino alle ore circa "
|
||||
"ora legale costa del pacifico del novembre le campagne esistenti "
|
||||
"continueranno a essere pubblicate come di consueto anche durante questo "
|
||||
"breve periodo di inattività ci scusiamo per";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrIW =
|
||||
" או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך "
|
||||
"ב";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrJA =
|
||||
" このペ ジでは アカウントに指定された予算の履歴を一覧にしています "
|
||||
"それぞれの項目には 予算額と特定期間のステ タスが表示されます "
|
||||
"現在または今後の予算を設定するには";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrJV =
|
||||
"Iki Piece cendhak teks sing bakal digunakake kanggo Testing akurasi "
|
||||
"pendekatan anyar.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrKA =
|
||||
" ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში "
|
||||
"გადაინაცვლებს ორი უჯრით";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrKK =
|
||||
" а билердің өзіне рұқсат берілмеген егер халық талап етсе ғана хан "
|
||||
"келісім берген өздеріңіз білесіздер қр қыл мыс тық кодексінде жазаның";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrKM =
|
||||
"នេះគឺជាបំណែកខ្លីនៃអត្ថបទដែលនឹងត្រូវបានប្រើសម្រាប់ការធ្វើតេស្តភាពត្រឹមត្រូវ"
|
||||
"នៃវិធីសាស្រ្តថ្មីនេះ។";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrKN =
|
||||
" ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ"
|
||||
" ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrKO =
|
||||
" 개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 "
|
||||
"부여사용자에게 프로필 리포트에 액세스할 수 있는 권한을 부여하시려면 가용 "
|
||||
"프로필 상자에서 프로필 이름을 선택한 다음";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrLA =
|
||||
" a deo qui enim nocendi causa mentiri solet si iam consulendi causa "
|
||||
"mentiatur multum profecit sed aliud est quod per se ipsum laudabile "
|
||||
"proponitur aliud quod in deterioris comparatione praeponitur aliter enim "
|
||||
"gratulamur cum sanus est homo aliter cum melius";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrLO =
|
||||
" ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ"
|
||||
" ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrLT =
|
||||
" a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių "
|
||||
"pasimokiau penki dalykai be kurių negaliu gyventi mano miegamajame tu "
|
||||
"surasi ideali pora išsilavinimas aukštoji mokykla koledžas universitetas "
|
||||
"pagrindinis laipsnis metai";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrLV =
|
||||
" a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas "
|
||||
"saistītas ar sezonas izpārdošanu speciālajām atlaidēm u c ir parastas un "
|
||||
"atslēgvārdi kas ir populāri noteiktos laika posmos šajā laikā saņems "
|
||||
"lielāku klikšķu";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMG =
|
||||
" amporisihin i ianao mba hijery ny dika teksta ranofotsiny an ity "
|
||||
"lahatsoratra ity tsy ilaina ny opérateur efa karohina daholo ny teny "
|
||||
"rehetra nosoratanao ampiasao anaovana dokambarotra i google telugu datin "
|
||||
"ny takelaka fikarohana sary renitakelak i";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMI =
|
||||
" haere ki te kainga o o haere ki te kainga o o haere ki te kainga o te "
|
||||
"rapunga ahua o haere ki te kainga o ka tangohia he ki to rapunga kaore au "
|
||||
"mohio te tikanga whakatiki o te ra he whakaharuru te pai rapunga a te "
|
||||
"rapunga ahua a e kainga o nga awhina o te";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMK =
|
||||
" гласовите коалицијата на вмро дпмне како партија со најмногу освоени "
|
||||
"гласови ќе добие евра а на сметката на коализијата за македонија";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrML =
|
||||
" ം അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് "
|
||||
"നിന്നു എഴുന്നേറ്റു";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMN =
|
||||
" а боловсронгуй болгох орон нутгийн ажил үйлсийг уялдуулж зохицуулах "
|
||||
"дүрэм журам боловсруулах орон нутгийн өмч хөрөнгө санхүүгийн";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMR =
|
||||
"हैदराबाद उच्चार ऐका (सहाय्य·माहिती)तेलुगू: హైదరాబాదు , उर्दू:"
|
||||
" حیدر آباد हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर"
|
||||
" आहे. हैदराबादची लोकसंख्या ७७ लाख ४० हजार ३३४ आहे. मोत्यांचे शहर"
|
||||
" अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक, सांस्कृतिक आणि "
|
||||
"स्थापत्यशास्त्रीय वारसा लाभला आहे. १९९० नंतर शिक्षण आणि माहिती तंत्रज्ञान"
|
||||
" त्याचप्रमाणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची"
|
||||
" वाढ शहरात झाली. दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे"
|
||||
" हैदराबाद हे केंद्र आहे";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMS =
|
||||
"pengampunan beramai-ramai supaya mereka pulang ke rumah masing-masing. "
|
||||
"Orang-orang besarnya enggan mengiktiraf sultan yang dilantik oleh Belanda "
|
||||
"sebagai Yang DiPertuan Selangor. Orang ramai pula tidak mahu menjalankan "
|
||||
"perniagaan bijih timah dengan Belanda, selagi raja yang berhak tidak "
|
||||
"ditabalkan. Perdagang yang lain dibekukan terus kerana untuk membalas "
|
||||
"jasa beliau yang membantu Belanda menentang Riau, Johor dan Selangor. Di "
|
||||
"antara tiga orang Sultan juga dipandang oleh rakyat sebagai seorang "
|
||||
"sultan yang paling gigih. 1 | 2 SULTAN Sebagai ganti Sultan Ibrahim "
|
||||
"ditabalkan Raja Muhammad iaitu Raja Muda. Walaupun baginda bukan anak "
|
||||
"isteri pertama bergelar Sultan Muhammad bersemayam di Kuala Selangor "
|
||||
"juga. Pentadbiran baginda yang lemah itu menyebabkan Kuala Selangor "
|
||||
"menjadi sarang ioleh Cina di Lukut tidak diambil tindakan, sedangkan "
|
||||
"baginda sendiri banyak berhutang kepada 1";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMT =
|
||||
" ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il "
|
||||
"buttuna ikteb żid numri tfittxijja tal kotba mur print home kotba minn "
|
||||
"pagni ghal pagna minn ghall ktieb ta aċċessa stieden habib iehor grazzi "
|
||||
"it tim tal gruppi google";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrMY =
|
||||
" တက္ကသုိလ္ မ္ဟ ပ္ရန္ လာ္ရပီးေနာက္ န္ဟစ္ အရ္ဝယ္ ဦးသန္ ့သည္ ပန္"
|
||||
" းတနော္ အမ္ယုိးသား ေက္ယာင္ း";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrNE =
|
||||
"अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrNL =
|
||||
" a als volgt te werk om een configuratiebestand te maken sitemap gen py "
|
||||
"ebruik filters om de s op te geven die moeten worden toegevoegd of "
|
||||
"uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s "
|
||||
"bevatten voor een bepaalde opmaaktaal dit";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrNO =
|
||||
" a er obligatorisk tidsforskyvning plassering av katalogsøk "
|
||||
"planinformasjon loggfilbane gruppenavn kontoinformasjon passord domene "
|
||||
"gruppeinformasjon alle kampanjesporing alternativ bruker grupper "
|
||||
"oppgaveplanlegger oppgavehistorikk kontosammendrag antall";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrNY =
|
||||
"Boma ndi gawo la dziko lomwe linapangidwa ndi cholinga chothandiza "
|
||||
"ntchito yolamulira. Kuŵalako kulikuunikabe mandita, Edipo nyima "
|
||||
"unalephera kugonjetsa kuŵalako.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrPA =
|
||||
" ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ"
|
||||
" ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrPL =
|
||||
" a australii będzie widział inne reklamy niż użytkownik z kanady "
|
||||
"kierowanie geograficzne sprawia że reklamy są lepiej dopasowane do "
|
||||
"użytkownika twojej strony oznacza to także że możesz nie zobaczyć "
|
||||
"wszystkich reklam które są wyświetlane na";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrPT =
|
||||
" a abit prevê que a entrada desses produtos estrangeiros no mercado "
|
||||
"têxtil e vestuário do brasil possa reduzir os preços em cerca de a partir "
|
||||
"de má notícia para os empresários que terão que lutar para garantir suas "
|
||||
"margens de lucro mas boa notícia";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrRO =
|
||||
" a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai "
|
||||
"atunci când pe site ul dvs survine o acţiune dorită site urile negative "
|
||||
"nu pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare "
|
||||
"sau constructoare să";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrRU =
|
||||
" а неправильный формат идентификатора дн назад";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSI =
|
||||
" අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි "
|
||||
"් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSK =
|
||||
" a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte "
|
||||
"prispôsobiť uložte ju ako šablónu a pokračujte v úprave vyberte si jednu "
|
||||
"z možností nižšie a kliknite na tlačidlo uložiť kampaň nastavenia kampane "
|
||||
"môžete ľubovoľne";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSL =
|
||||
" adsense stanje prijave za google adsense google adsense račun je bil "
|
||||
"začasno zamrznjen pozdravljeni hvala za vaše zanimanje v google adsense "
|
||||
"po pregledu vaše prijavnice so naši strokovnjaki ugotovili da spletna "
|
||||
"stran ki je trenutno povezana z vašim";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSO =
|
||||
" a oo maanta bogga koobaad ugu qoran yahey beesha caalamka laakiin si "
|
||||
"kata oo beesha caalamku ula guntato soomaaliya waxa aan shaki ku jirin in "
|
||||
"aakhirataanka dadka soomaalida oo kaliya ay yihiin ku soomaaliya ka saari "
|
||||
"kara dhibka ay ku jirto";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSQ =
|
||||
" a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës zoti thaçi "
|
||||
"prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyrisht se do "
|
||||
"të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur "
|
||||
"dhe sovran të";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSR =
|
||||
"балчак балчак на мапи србије уреди демографија у насељу балчак живи "
|
||||
"пунолетна становника а просечна старост становништва износи година";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrST =
|
||||
" bang ba nang le thahasello matshwao a sehlooho thuto e thehilweng hodima "
|
||||
"diphetho ke tsela ya ho ruta le ho ithuta e totobatsang hantle seo "
|
||||
"baithuti ba lokelang ho se fihlella ntlhatheo eo e sebetsang ka yona ke "
|
||||
"ya hore titjhere o hlakisa pele seo";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSU =
|
||||
"Nu ngatur kahirupan warga, keur kapentingan pamarentahan diatur ku RT, RW "
|
||||
"jeung Kepala Dusun, sedengkeun urusan adat dipupuhuan ku Kuncen jeung "
|
||||
"kepala adat. Sanajan Kampung Kuta teu pati anggang jeung lembur sejenna "
|
||||
"nu aya di wewengkon Desa Pasir Angin, tapi boh wangunan imah atawa "
|
||||
"tradisi kahirupan masarakatna nenggang ti nu lian.";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSV =
|
||||
" a bort objekt från google desktop post äldst meny öretag dress etaljer "
|
||||
"alternativ för vad är inne yaste google skrivbord plugin program för "
|
||||
"nyheter google visa nyheter som är anpassade efter de artiklar som du "
|
||||
"läser om du till exempel läser";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrSW =
|
||||
" a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na "
|
||||
"kushiriki mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi "
|
||||
"lugha hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka "
|
||||
"udhibitisho wa neno kwa haraka fikia";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrTA =
|
||||
" அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும்"
|
||||
" உள்ளது தொகு";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrTE =
|
||||
" ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు "
|
||||
"విశ్వదాభిరామ వినర వేమ";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrTG =
|
||||
" адолат ва инсондӯстиро бар фашизм нажодпарастӣ ва адоват тарҷеҳ додааст "
|
||||
"чоп кунед ба дигарон фиристед чоп кунед ба дигарон фиристед";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrTH =
|
||||
" กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา "
|
||||
"ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrTR =
|
||||
" a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için "
|
||||
"günlük bütçenizi gözden geçirebileceğiniz yeri arıyorsanız kampanya "
|
||||
"yönetimi ne gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi "
|
||||
"tıklayın sunumu";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrUK =
|
||||
" а більший бюджет щоб забезпечити собі максимум прибутків від переходів "
|
||||
"відстежуйте свої об яви за датою географічним розташуванням";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrUR =
|
||||
" آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں اگر "
|
||||
"آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrUZ =
|
||||
" abadiylashtirildi aqsh ayol prezidentga tayyormi markaziy osiyo afg "
|
||||
"onistonga qanday yordam berishi mumkin ukrainada o zbekistonlik "
|
||||
"muhojirlar tazyiqdan shikoyat qilmoqda gruziya va ukraina hozircha natoga "
|
||||
"qabul qilinmaydi afg oniston o zbekistonni g";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrVI =
|
||||
" adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín"
|
||||
" dụng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt"
|
||||
" hàng dạng cấp dữ liệu ác minh trang web của bạn để xem";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrYI =
|
||||
"אן פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט "
|
||||
"אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrYO =
|
||||
" abinibi han ikawe alantakun le ni opolopo ede abinibi ti a to lesese bi "
|
||||
"eniyan to fe lo se fe lati se atunse jowo mo pe awon oju iwe itakunagbaye "
|
||||
"miran ti ako ni oniruru ede abinibi le faragba nipa atunse ninu se iwadi "
|
||||
"blogs ni ori itakun agbaye ti e ba";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrZH =
|
||||
"产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 "
|
||||
"对于要提交的图书 我确认 我是版权所有者或已得到版权所有者的授权 "
|
||||
"要更改您的国家 地区 请在此表的最上端更改您的";
|
||||
|
||||
const char *const NNetLangIdTestData::kTestStrZU =
|
||||
" ana engu uma inkinga iqhubeka siza ubike kwi isexwayiso ngenxa yephutha "
|
||||
"lomlekeleli sikwazi ukubuyisela emuva kuphela imiphumela engaqediwe "
|
||||
"ukuthola imiphumela eqediwe zama ukulayisha kabusha leli khasi emizuzwini "
|
||||
"engu uma inkinga iqhubeka siza uthumele";
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
117
Telegram/ThirdParty/cld3/src/nnet_lang_id_test_data.h
vendored
Normal file
117
Telegram/ThirdParty/cld3/src/nnet_lang_id_test_data.h
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef NNET_LANG_ID_TEST_DATA_H_
|
||||
#define NNET_LANG_ID_TEST_DATA_H_
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Holder for sample texts used by the language identifier tests. Only the
// declarations live here; every constant is defined out of line.
class NNetLangIdTestData {
 public:
  // Pieces of text in different languages. The suffix after "kTestStr"
  // presumably names the language code of the sample.
  static const char *const kTestStrAF;
  static const char *const kTestStrAR;
  static const char *const kTestStrAZ;

  static const char *const kTestStrBE;
  static const char *const kTestStrBG;
  static const char *const kTestStrBN;
  static const char *const kTestStrBS;

  static const char *const kTestStrCA;
  static const char *const kTestStrCEB;
  static const char *const kTestStrCS;
  static const char *const kTestStrCY;

  static const char *const kTestStrDA;
  static const char *const kTestStrDE;

  static const char *const kTestStrEL;
  static const char *const kTestStrEN;
  static const char *const kTestStrEO;
  static const char *const kTestStrES;
  static const char *const kTestStrET;
  static const char *const kTestStrEU;

  static const char *const kTestStrFA;
  static const char *const kTestStrFI;
  static const char *const kTestStrFIL;
  static const char *const kTestStrFR;

  static const char *const kTestStrGA;
  static const char *const kTestStrGL;
  static const char *const kTestStrGU;

  static const char *const kTestStrHA;
  static const char *const kTestStrHI;
  static const char *const kTestStrHMN;
  static const char *const kTestStrHR;
  static const char *const kTestStrHT;
  static const char *const kTestStrHU;
  static const char *const kTestStrHY;

  static const char *const kTestStrID;
  static const char *const kTestStrIG;
  static const char *const kTestStrIS;
  static const char *const kTestStrIT;
  static const char *const kTestStrIW;

  static const char *const kTestStrJA;
  static const char *const kTestStrJV;

  static const char *const kTestStrKA;
  static const char *const kTestStrKK;
  static const char *const kTestStrKM;
  static const char *const kTestStrKN;
  static const char *const kTestStrKO;

  static const char *const kTestStrLA;
  static const char *const kTestStrLO;
  static const char *const kTestStrLT;
  static const char *const kTestStrLV;

  static const char *const kTestStrMG;
  static const char *const kTestStrMI;
  static const char *const kTestStrMK;
  static const char *const kTestStrML;
  static const char *const kTestStrMN;
  static const char *const kTestStrMR;
  static const char *const kTestStrMS;
  static const char *const kTestStrMT;
  static const char *const kTestStrMY;

  static const char *const kTestStrNE;
  static const char *const kTestStrNL;
  static const char *const kTestStrNO;
  static const char *const kTestStrNY;

  static const char *const kTestStrPA;
  static const char *const kTestStrPL;
  static const char *const kTestStrPT;

  static const char *const kTestStrRO;
  static const char *const kTestStrRU;

  static const char *const kTestStrSI;
  static const char *const kTestStrSK;
  static const char *const kTestStrSL;
  static const char *const kTestStrSO;
  static const char *const kTestStrSQ;
  static const char *const kTestStrSR;
  static const char *const kTestStrST;
  static const char *const kTestStrSU;
  static const char *const kTestStrSV;
  static const char *const kTestStrSW;

  static const char *const kTestStrTA;
  static const char *const kTestStrTE;
  static const char *const kTestStrTG;
  static const char *const kTestStrTH;
  static const char *const kTestStrTR;

  static const char *const kTestStrUK;
  static const char *const kTestStrUR;
  static const char *const kTestStrUZ;

  static const char *const kTestStrVI;

  static const char *const kTestStrYI;
  static const char *const kTestStrYO;

  static const char *const kTestStrZH;
  static const char *const kTestStrZU;
};
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // NNET_LANG_ID_TEST_DATA_H_
|
||||
386
Telegram/ThirdParty/cld3/src/nnet_language_identifier.cc
vendored
Normal file
386
Telegram/ThirdParty/cld3/src/nnet_language_identifier.cc
vendored
Normal file
@@ -0,0 +1,386 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "nnet_language_identifier.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "embedding_network.h"
|
||||
#include "registry.h"
|
||||
#include "relevant_script_feature.h"
|
||||
#include "script_span/generated_ulscript.h"
|
||||
#include "script_span/getonescriptspan.h"
|
||||
#include "script_span/text_processing.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace {
|
||||
|
||||
// Struct for accumulating stats for a language as text subsequences of the same
|
||||
// script are processed.
|
||||
struct LangChunksStats {
|
||||
// Sum of probabilities across subsequences.
|
||||
float prob_sum = 0.0;
|
||||
|
||||
// Total number of bytes corresponding to the language.
|
||||
int byte_sum = 0;
|
||||
|
||||
// Number chunks corresponding to the language.
|
||||
int num_chunks = 0;
|
||||
|
||||
// Specifies the byte ranges that language applies to.
|
||||
std::vector<NNetLanguageIdentifier::SpanInfo> byte_ranges;
|
||||
};
|
||||
|
||||
// Sort predicate: larger values come first; pairs with equal values are
// ordered by ascending key so the result is deterministic.
bool OrderBySecondDescending(const std::pair<string, float> &x,
                             const std::pair<string, float> &y) {
  if (x.second != y.second) {
    return x.second > y.second;
  }
  return x.first < y.first;
}
|
||||
|
||||
// Returns "true" if the languge prediction is reliable based on the
|
||||
// probability, and "false" otherwise.
|
||||
bool ResultIsReliable(const string &language, float probability) {
|
||||
if (language == "hr" || language == "bs") {
|
||||
return (probability >= NNetLanguageIdentifier::kReliabilityHrBsThreshold);
|
||||
} else {
|
||||
return (probability >= NNetLanguageIdentifier::kReliabilityThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
// Finds the number of interchange-valid bytes to process.
|
||||
int FindNumValidBytesToProcess(const string &text) {
|
||||
// Check if the size of the input text can fit into an int. If not, focus on
|
||||
// the first std::numeric_limits<int>::max() bytes.
|
||||
const int doc_text_size =
|
||||
(text.size() < static_cast<size_t>(std::numeric_limits<int>::max()))
|
||||
? static_cast<int>(text.size())
|
||||
: std::numeric_limits<int>::max();
|
||||
|
||||
// Truncate the input text if it is too long and find the span containing
|
||||
// interchange-valid UTF8.
|
||||
const int num_valid_bytes = CLD2::SpanInterchangeValid(
|
||||
text.c_str(),
|
||||
std::min(NNetLanguageIdentifier::kMaxNumInputBytesToConsider,
|
||||
doc_text_size));
|
||||
|
||||
return num_valid_bytes;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Byte limits applied by the default constructor: inputs below the minimum
// are not predicted, and at most the maximum is used for a prediction.
const int NNetLanguageIdentifier::kMinNumBytesToConsider = 140;
const int NNetLanguageIdentifier::kMaxNumBytesToConsider = 700;

// Upper bound on how many input bytes are examined at all.
const int NNetLanguageIdentifier::kMaxNumInputBytesToConsider = 10000;

// Number of snippets sampled from inputs that exceed the byte budget.
const int NNetLanguageIdentifier::kNumSnippets = 5;

// Language string reported when no prediction is made.
const char NNetLanguageIdentifier::kUnknown[] = "und";

// Minimum probability for a reliable prediction; "hr" and "bs" use the
// dedicated (lower) threshold.
const float NNetLanguageIdentifier::kReliabilityThreshold = 0.7f;
const float NNetLanguageIdentifier::kReliabilityHrBsThreshold = 0.5f;
|
||||
|
||||
// Returns the fixed prefix "language_identifier".
// NOTE(review): presumably the base extractor uses it to name its
// parameters -- confirm against EmbeddingFeatureExtractor.
const string LanguageIdEmbeddingFeatureExtractor::ArgPrefix() const {
  static const char kPrefix[] = "language_identifier";
  return kPrefix;
}
|
||||
|
||||
// Default construction delegates to the two-argument constructor with the
// class-wide default byte limits.
NNetLanguageIdentifier::NNetLanguageIdentifier()
    : NNetLanguageIdentifier(kMinNumBytesToConsider,
                             kMaxNumBytesToConsider) {
}
|
||||
|
||||
// Factory functions handed to the WholeSentenceFeature registrars. Each one
// returns a newly allocated feature object; the caller takes the raw pointer.

// Creates a ContinuousBagOfNgramsFunction.
static WholeSentenceFeature *cbog_factory() {
  return new ContinuousBagOfNgramsFunction;
}

// Creates a RelevantScriptFeature.
static WholeSentenceFeature *rsf_factory() {
  return new RelevantScriptFeature;
}

// Creates a ScriptFeature.
static WholeSentenceFeature *sf_factory() {
  return new ScriptFeature;
}
|
||||
|
||||
// Builds an identifier that makes no prediction for text shorter than
// |min_num_bytes| and uses at most |max_num_bytes| bytes per prediction.
// Requires 0 <= min_num_bytes < max_num_bytes. Also registers the
// WholeSentenceFeature implementations and initializes the model.
NNetLanguageIdentifier::NNetLanguageIdentifier(int min_num_bytes,
                                               int max_num_bytes)
    : num_languages_(TaskContextParams::GetNumLanguages()),
      network_(&nn_params_),
      min_num_bytes_(min_num_bytes),
      max_num_bytes_(max_num_bytes) {
  // Validate the byte limits.
  CLD3_CHECK(max_num_bytes_ > 0);
  CLD3_CHECK(min_num_bytes_ >= 0);
  CLD3_CHECK(min_num_bytes_ < max_num_bytes_);

  // A budget of at most kNumSnippets bytes is sampled as one snippet;
  // otherwise the budget is split evenly across kNumSnippets snippets.
  if (max_num_bytes_ <= kNumSnippets) {
    num_snippets_ = 1;
  } else {
    num_snippets_ = kNumSnippets;
  }
  snippet_size_ = max_num_bytes_ / num_snippets_;

  // Create the registry for our WholeSentenceFeature(s) if nobody has yet.
  if (WholeSentenceFeature::registry() == nullptr) {
    RegisterableClass<WholeSentenceFeature>::CreateRegistry(
        "sentence feature function", "WholeSentenceFeature", __FILE__,
        __LINE__);
  }

  // Register our WholeSentenceFeature(s). The registrars are function-local
  // statics, so each registration happens once no matter how many identifiers
  // are constructed.

  // ContinuousBagOfNgramsFunction feature function.
  static WholeSentenceFeature::Registry::Registrar cbog_registrar(
      WholeSentenceFeature::registry(), "continuous-bag-of-ngrams",
      "ContinuousBagOfNgramsFunction", __FILE__, __LINE__, cbog_factory);

  // RelevantScriptFeature feature function.
  static WholeSentenceFeature::Registry::Registrar rsf_registrar(
      WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
      "RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);

  // ScriptFeature feature function.
  static WholeSentenceFeature::Registry::Registrar sf_registrar(
      WholeSentenceFeature::registry(), "script", "ScriptFeature", __FILE__,
      __LINE__, sf_factory);

  // Get the model parameters, then set up and initialize the model.
  TaskContext context;
  TaskContextParams::ToTaskContext(&context);
  Setup(&context);
  Init(&context);
}
|
||||
|
||||
// Nothing to release explicitly; members clean up after themselves.
NNetLanguageIdentifier::~NNetLanguageIdentifier() = default;
|
||||
|
||||
// Forwards |context| to the feature extractor's Setup step.
void NNetLanguageIdentifier::Setup(TaskContext *context) {
  feature_extractor_.Setup(context);
}
|
||||
|
||||
// Initializes the feature extractor from |context| and then lets it request
// its workspaces from this identifier's workspace registry.
void NNetLanguageIdentifier::Init(TaskContext *context) {
  feature_extractor_.Init(context);
  feature_extractor_.RequestWorkspaces(&workspace_registry_);
}
|
||||
|
||||
// Extracts the features of |sentence| into |features|. A fresh workspace set
// is created for every call.
void NNetLanguageIdentifier::GetFeatures(
    Sentence *sentence, std::vector<FeatureVector> *features) const {
  WorkspaceSet workspaces;
  workspaces.Reset(workspace_registry_);

  // Preprocess first, then extract.
  feature_extractor_.Preprocess(&workspaces, sentence);
  feature_extractor_.ExtractFeatures(workspaces, *sentence, features);
}
|
||||
|
||||
// Returns the language name corresponding to the given id.
|
||||
string NNetLanguageIdentifier::GetLanguageName(int language_id) const {
|
||||
CLD3_CHECK(language_id >= 0);
|
||||
CLD3_CHECK(language_id < num_languages_);
|
||||
return TaskContextParams::language_names(language_id);
|
||||
}
|
||||
|
||||
// Finds the most likely language of |text|.
//
// The text is limited to its interchange-valid UTF8 prefix, cleaned with
// ScriptScanner, squeezed to drop repetitive or mostly-space chunks, and then
// classified. Returns a default-constructed Result (language kUnknown) when
// fewer than min_num_bytes_ bytes remain after cleaning or after squeezing.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguage(
    const string &text) {
  const int num_valid_bytes = FindNumValidBytesToProcess(text);

  // Iterate over the input with ScriptScanner to clean up the text (e.g.,
  // removing digits, punctuation, brackets).
  // TODO(abakalov): Extract the code that does the clean-up out of
  // ScriptScanner.
  CLD2::ScriptScanner ss(text.c_str(), num_valid_bytes, /*is_plain_text=*/true);
  CLD2::LangSpan script_span;
  string cleaned;
  while (ss.GetOneScriptSpanLower(&script_span)) {
    // script_span has spaces at the beginning and the end, so there is no need
    // for a delimiter.
    cleaned.append(script_span.text, script_span.text_bytes);
  }

  if (static_cast<int>(cleaned.size()) < min_num_bytes_) {
    return Result();
  }

  // CheapSqueezeInplace needs a non-const char*, so work on a scratch copy.
  // Copy the bytes together with the terminating '\0' in a single range
  // construction instead of pushing them back one at a time.
  const char *cleaned_begin = cleaned.c_str();
  std::vector<char> text_to_process(cleaned_begin,
                                    cleaned_begin + cleaned.size() + 1);

  // Remove repetitive chunks or ones containing mostly spaces.
  const int chunk_size = 0;  // Use the default.
  char *text_begin = &text_to_process[0];
  const int text_length = static_cast<int>(text_to_process.size() - 1);
  const int new_length =
      CLD2::CheapSqueezeInplace(text_begin, text_length, chunk_size);
  if (new_length < min_num_bytes_) {
    return Result();
  }

  const string squeezed_text_to_process =
      SelectTextGivenBeginAndSize(text_begin, new_length);
  return FindLanguageOfValidUTF8(squeezed_text_to_process);
}
|
||||
|
||||
// Predicts the language of |text|: runs the network on the extracted
// features, picks the highest-scoring language and converts its score into a
// probability. The returned Result always has proportion 1.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguageOfValidUTF8(
    const string &text) {
  // Wrap the input text in a Sentence for the feature extractor.
  Sentence sentence;
  sentence.set_text(text);

  // TODO(salcianu): reuse vector<FeatureVector>.
  std::vector<FeatureVector> features(feature_extractor_.NumEmbeddings());
  GetFeatures(&sentence, &features);

  EmbeddingNetwork::Vector scores;
  network_.ComputeFinalScores(features, &scores);

  // Arg-max over the scores; on ties the lowest index wins.
  int best_id = -1;
  float best_score = -std::numeric_limits<float>::infinity();
  for (size_t id = 0; id < scores.size(); ++id) {
    if (scores[id] > best_score) {
      best_score = scores[id];
      best_id = id;
    }
  }

  // Probability of the winner, computed via log-sum-exp relative to the best
  // score.
  float exp_sum = 0.0;
  for (size_t id = 0; id < scores.size(); ++id) {
    exp_sum += exp(scores[id] - best_score);
  }
  const float log_sum_exp = best_score + log(exp_sum);

  Result result;
  result.probability = exp(best_score - log_sum_exp);
  result.language = GetLanguageName(best_id);
  result.is_reliable = ResultIsReliable(result.language, result.probability);
  result.proportion = 1.0;
  return result;
}
|
||||
|
||||
std::vector<NNetLanguageIdentifier::Result>
|
||||
NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
|
||||
int num_langs) {
|
||||
std::vector<Result> results;
|
||||
|
||||
// Truncate the input text if it is too long and find the span containing
|
||||
// interchange-valid UTF8.
|
||||
const int num_valid_bytes = FindNumValidBytesToProcess(text);
|
||||
if (num_valid_bytes == 0) {
|
||||
while (num_langs-- > 0) {
|
||||
results.emplace_back();
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
// Process each subsequence of the same script.
|
||||
CLD2::ScriptScanner ss(text.c_str(), num_valid_bytes, /*is_plain_text=*/true);
|
||||
CLD2::LangSpan script_span;
|
||||
std::unordered_map<string, LangChunksStats> lang_stats;
|
||||
int total_num_bytes = 0;
|
||||
int chunk_size = 0; // Use the default.
|
||||
while (ss.GetOneScriptSpanLower(&script_span)) {
|
||||
const int num_original_span_bytes = script_span.text_bytes;
|
||||
|
||||
// Remove repetitive chunks or ones containing mostly spaces.
|
||||
const int new_length = CLD2::CheapSqueezeInplace(
|
||||
script_span.text, script_span.text_bytes, chunk_size);
|
||||
script_span.text_bytes = new_length;
|
||||
|
||||
if (script_span.text_bytes < min_num_bytes_) {
|
||||
continue;
|
||||
}
|
||||
total_num_bytes += num_original_span_bytes;
|
||||
|
||||
const string selected_text = SelectTextGivenScriptSpan(script_span);
|
||||
|
||||
Result result = FindLanguageOfValidUTF8(selected_text);
|
||||
string language = result.language;
|
||||
lang_stats[language].byte_sum += num_original_span_bytes;
|
||||
lang_stats[language].prob_sum +=
|
||||
result.probability * num_original_span_bytes;
|
||||
lang_stats[language].num_chunks++;
|
||||
// Add SpanInfo. Start and end indices are relative to original input.
|
||||
lang_stats[language].byte_ranges.push_back(SpanInfo(
|
||||
ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability));
|
||||
}
|
||||
|
||||
// Sort the languages based on the number of bytes associated with them.
|
||||
// TODO(abakalov): Consider alternative possibly more efficient portable
|
||||
// approaches for finding the top N languages. Given that on average, there
|
||||
// aren't that many languages in the input, it's likely that the benefits will
|
||||
// be negligible (if any).
|
||||
std::vector<std::pair<string, float>> langs_and_byte_counts;
|
||||
for (const auto &entry : lang_stats) {
|
||||
langs_and_byte_counts.emplace_back(entry.first, entry.second.byte_sum);
|
||||
}
|
||||
std::sort(langs_and_byte_counts.begin(), langs_and_byte_counts.end(),
|
||||
OrderBySecondDescending);
|
||||
|
||||
const float byte_sum = static_cast<float>(total_num_bytes);
|
||||
const int num_langs_to_save =
|
||||
std::min(num_langs, static_cast<int>(langs_and_byte_counts.size()));
|
||||
for (int indx = 0; indx < num_langs_to_save; ++indx) {
|
||||
Result result;
|
||||
const string &language = langs_and_byte_counts.at(indx).first;
|
||||
const LangChunksStats &stats = lang_stats.at(language);
|
||||
result.language = language;
|
||||
result.probability = stats.prob_sum / stats.byte_sum;
|
||||
result.proportion = stats.byte_sum / byte_sum;
|
||||
result.is_reliable = ResultIsReliable(language, result.probability);
|
||||
result.byte_ranges = stats.byte_ranges;
|
||||
results.push_back(result);
|
||||
}
|
||||
|
||||
int padding_size = num_langs - langs_and_byte_counts.size();
|
||||
while (padding_size-- > 0) {
|
||||
results.emplace_back();
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
// Convenience overload: selects the text to classify from the buffer and byte
// count held by |script_span|.
string NNetLanguageIdentifier::SelectTextGivenScriptSpan(
    const CLD2::LangSpan &script_span) {
  const char *span_begin = script_span.text;
  const int span_bytes = script_span.text_bytes;
  return SelectTextGivenBeginAndSize(span_begin, span_bytes);
}
|
||||
|
||||
// Returns the text a prediction is based on. Input of at most max_num_bytes_
// bytes is returned unchanged. Longer input is reduced to num_snippets_
// snippets of up to snippet_size_ bytes each, spread evenly through the input
// and each followed by a single space.
string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
    const char *text_begin, int text_size) {
  // Short input: use all of it.
  if (text_size <= max_num_bytes_) {
    return string(text_begin, text_size);
  }

  // Number of bytes skipped before each snippet.
  const int gap_bytes = (text_size - max_num_bytes_) / (num_snippets_ + 1);

  string sampled;
  const char *cursor = text_begin;
  for (int snippet = 0; snippet < num_snippets_; ++snippet) {
    // SpanInterchangeValid is used for both offsets so that no character is
    // split in two.
    const char *snippet_start =
        cursor + CLD2::SpanInterchangeValid(cursor, gap_bytes);
    const int snippet_bytes =
        CLD2::SpanInterchangeValid(snippet_start, snippet_size_);

    sampled.append(snippet_start, snippet_bytes);
    sampled.append(" ");
    cursor = snippet_start + snippet_bytes;
  }
  return sampled;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
191
Telegram/ThirdParty/cld3/src/nnet_language_identifier.h
vendored
Normal file
191
Telegram/ThirdParty/cld3/src/nnet_language_identifier.h
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef NNET_LANGUAGE_IDENTIFIER_H_
|
||||
#define NNET_LANGUAGE_IDENTIFIER_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "embedding_feature_extractor.h"
|
||||
#include "embedding_network.h"
|
||||
#include "lang_id_nn_params.h"
|
||||
#include "language_identifier_features.h"
|
||||
#include "script_span/getonescriptspan.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "task_context_params.h"
|
||||
#include "cld_3/protos/task_spec.pb.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Specialization of the EmbeddingFeatureExtractor for extracting from
|
||||
// (Sentence, int).
|
||||
class LanguageIdEmbeddingFeatureExtractor
|
||||
: public EmbeddingFeatureExtractor<WholeSentenceExtractor, Sentence> {
|
||||
public:
|
||||
const string ArgPrefix() const override;
|
||||
};
|
||||
|
||||
// Class for detecting the language of a document.
|
||||
class NNetLanguageIdentifier {
|
||||
public:
|
||||
// Holds probability that Span, specified by start/end indices, is a given
|
||||
// language. The language is not stored here; it can be found in Result, which
|
||||
// holds a vector of SpanInfo.
|
||||
struct SpanInfo {
|
||||
SpanInfo(int start_index_val, int end_index_val, float probability_val)
|
||||
: start_index(start_index_val),
|
||||
end_index(end_index_val),
|
||||
probability(probability_val) {}
|
||||
int start_index = -1;
|
||||
int end_index = -1;
|
||||
float probability = 0.0;
|
||||
};
|
||||
|
||||
// Information about a predicted language.
|
||||
struct Result {
|
||||
string language = kUnknown;
|
||||
float probability = 0.0; // Language probability.
|
||||
bool is_reliable = false; // Whether the prediction is reliable.
|
||||
|
||||
// Proportion of bytes associated with the language. If FindLanguage is
|
||||
// called, this variable is set to 1.
|
||||
float proportion = 0.0;
|
||||
|
||||
// Specifies the byte ranges that |language| applies to.
|
||||
std::vector<SpanInfo> byte_ranges;
|
||||
};
|
||||
|
||||
NNetLanguageIdentifier();
|
||||
NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes);
|
||||
~NNetLanguageIdentifier();
|
||||
|
||||
// Finds the most likely language for the given text, along with additional
|
||||
// information (e.g., probability). The prediction is based on the first N
|
||||
// bytes where N is the minumum between the number of interchange valid UTF8
|
||||
// bytes and max_num_bytes_. If N is less than min_num_bytes_ long, then this
|
||||
// function returns kUnknown.
|
||||
Result FindLanguage(const string &text);
|
||||
|
||||
// Splits the input text (up to the first byte, if any, that is not
|
||||
// interchange valid UTF8) into spans based on the script, predicts a language
|
||||
// for each span, and returns a vector storing the top num_langs most frequent
|
||||
// languages along with additional information (e.g., proportions). The number
|
||||
// of bytes considered for each span is the minimum between the size of the
|
||||
// span and max_num_bytes_. If more languages are requested than what is
|
||||
// available in the input, then for those cases kUnknown is returned. Also, if
|
||||
// the size of the span is less than min_num_bytes_ long, then the span is
|
||||
// skipped. If the input text is too long, only the first
|
||||
// kMaxNumInputBytesToConsider bytes are processed.
|
||||
std::vector<Result> FindTopNMostFreqLangs(const string &text, int num_langs);
|
||||
|
||||
// String returned when a language is unknown or prediction cannot be made.
|
||||
static const char kUnknown[];
|
||||
|
||||
// Min number of bytes needed to make a prediction if the default constructor
|
||||
// is called.
|
||||
static const int kMinNumBytesToConsider;
|
||||
|
||||
// Max number of bytes to consider to make a prediction if the default
|
||||
// constructor is called.
|
||||
static const int kMaxNumBytesToConsider;
|
||||
|
||||
// Max number of input bytes to process.
|
||||
static const int kMaxNumInputBytesToConsider;
|
||||
|
||||
// Predictions with probability greater than or equal to this threshold are
|
||||
// marked as reliable. This threshold was optimized on a set of text segments
|
||||
// extracted from wikipedia, and results in an overall precision, recall,
|
||||
// and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
|
||||
static const float kReliabilityThreshold;
|
||||
|
||||
// Reliability threshold for the languages hr and bs.
|
||||
static const float kReliabilityHrBsThreshold;
|
||||
|
||||
private:
|
||||
// Sets up and initializes the model.
|
||||
void Setup(TaskContext *context);
|
||||
void Init(TaskContext *context);
|
||||
|
||||
// Extract features from sentence. On return, FeatureVector features[i]
|
||||
// contains the features for the embedding space #i.
|
||||
void GetFeatures(Sentence *sentence,
|
||||
std::vector<FeatureVector> *features) const;
|
||||
|
||||
// Finds the most likely language for the given text. Assumes that the text is
|
||||
// interchange valid UTF8.
|
||||
Result FindLanguageOfValidUTF8(const string &text);
|
||||
|
||||
// Returns the language name corresponding to the given id.
|
||||
string GetLanguageName(int language_id) const;
|
||||
|
||||
// Concatenates snippets of text equally spread out throughout the input if
|
||||
// the size of the input is greater than the maximum number of bytes needed to
|
||||
// make a prediction. The resulting string is used for language
|
||||
// identification.
|
||||
string SelectTextGivenScriptSpan(const CLD2::LangSpan &script_span);
|
||||
string SelectTextGivenBeginAndSize(const char *text_begin, int text_size);
|
||||
|
||||
// Number of languages.
|
||||
const int num_languages_;
|
||||
|
||||
// Typed feature extractor for embeddings.
|
||||
LanguageIdEmbeddingFeatureExtractor feature_extractor_;
|
||||
|
||||
// The registry of shared workspaces in the feature extractor.
|
||||
WorkspaceRegistry workspace_registry_;
|
||||
|
||||
// Parameters for the neural networks.
|
||||
LangIdNNParams nn_params_;
|
||||
|
||||
// Neural network to use for scoring.
|
||||
EmbeddingNetwork network_;
|
||||
|
||||
// This feature function is not relevant to this class. Adding this variable
|
||||
// ensures that the features are linked.
|
||||
ContinuousBagOfNgramsFunction ngram_function_;
|
||||
|
||||
// Minimum number of bytes needed to make a prediction. If the default
|
||||
// constructor is called, this variable is equal to kMinNumBytesToConsider.
|
||||
int min_num_bytes_;
|
||||
|
||||
// Maximum number of bytes to use to make a prediction. If the default
|
||||
// constructor is called, this variable is equal to kMaxNumBytesToConsider.
|
||||
int max_num_bytes_;
|
||||
|
||||
// Number of snippets to concatenate to produce the string used for language
|
||||
// identification. If max_num_bytes_ <= kNumSnippets (i.e., the maximum number
|
||||
// of bytes needed to make a prediction is smaller or equal to the number of
|
||||
// default snippets), then this variable is equal to 1. Otherwise, it is set
|
||||
// to kNumSnippets.
|
||||
int num_snippets_;
|
||||
|
||||
// The string used to make a prediction is created by concatenating
|
||||
// num_snippets_ snippets of size snippet_size_ = (max_num_bytes_ /
|
||||
// num_snippets_) that are equaly spread out throughout the input.
|
||||
int snippet_size_;
|
||||
|
||||
// Default number of snippets to concatenate to produce the string used for
|
||||
// language identification. For the actual number of snippets, see
|
||||
// num_snippets_.
|
||||
static const int kNumSnippets;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // NNET_LANGUAGE_IDENTIFIER_H_
|
||||
28
Telegram/ThirdParty/cld3/src/registry.cc
vendored
Normal file
28
Telegram/ThirdParty/cld3/src/registry.cc
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "registry.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Global list of all component registries.
|
||||
RegistryMetadata *global_registry_list = NULL;
|
||||
|
||||
// Prepends |registry| to the global registry list: the new entry is linked to
// the current head and then becomes the head itself.
void RegistryMetadata::Register(RegistryMetadata *registry) {
  RegistryMetadata *const previous_head = global_registry_list;
  registry->set_link(previous_head);
  global_registry_list = registry;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
242
Telegram/ThirdParty/cld3/src/registry.h
vendored
Normal file
242
Telegram/ThirdParty/cld3/src/registry.h
vendored
Normal file
@@ -0,0 +1,242 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Registry for component registration. These classes can be used for creating
|
||||
// registries of components conforming to the same interface. This is useful for
|
||||
// making a component-based architecture where the specific implementation
|
||||
// classes can be selected at runtime. There is support for both class-based and
|
||||
// instance-based registries.
|
||||
//
|
||||
// Example:
|
||||
// function.h:
|
||||
//
|
||||
// class Function : public RegisterableInstance<Function> {
|
||||
// public:
|
||||
// virtual double Evaluate(double x) = 0;
|
||||
// };
|
||||
//
|
||||
// #define REGISTER_FUNCTION(type, component)
|
||||
// REGISTER_INSTANCE_COMPONENT(Function, type, component);
|
||||
//
|
||||
// function.cc:
|
||||
//
|
||||
// REGISTER_INSTANCE_REGISTRY("function", Function);
|
||||
//
|
||||
// class Cos : public Function {
|
||||
// public:
|
||||
// double Evaluate(double x) { return cos(x); }
|
||||
// };
|
||||
//
|
||||
// class Exp : public Function {
|
||||
// public:
|
||||
// double Evaluate(double x) { return exp(x); }
|
||||
// };
|
||||
//
|
||||
// REGISTER_FUNCTION("cos", Cos);
|
||||
// REGISTER_FUNCTION("exp", Exp);
|
||||
//
|
||||
// Function *f = Function::Lookup("cos");
|
||||
// double result = f->Evaluate(arg);
|
||||
|
||||
#ifndef REGISTRY_H_
|
||||
#define REGISTRY_H_
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Metadata describing a component: its name, the name of its class, and the
// code location (file and line) where it was registered. Metadata objects can
// be chained into a singly linked list through link() / set_link().
class ComponentMetadata {
 public:
  // Stores the given pointers as they are (no string is copied) and starts
  // with no successor in the list.
  ComponentMetadata(const char *name, const char *class_name, const char *file,
                    int line)
      : name_(name),
        class_name_(class_name),
        file_(file),
        line_(line),
        link_(nullptr) {}

  // Accessors for the values passed to the constructor.
  const char *name() const { return name_; }
  const char *class_name() const { return class_name_; }
  const char *file() const { return file_; }
  int line() const { return line_; }

  // Next metadata object in the list, or null when there is none.
  ComponentMetadata *link() const { return link_; }

  // Makes |link| the next metadata object in the list.
  void set_link(ComponentMetadata *link) { link_ = link; }

 private:
  // Name of the component.
  const char *name_;

  // Name of the class implementing the component.
  const char *class_name_;

  // Code file and line where the component was registered.
  const char *file_;
  int line_;

  // Next metadata object in the list.
  ComponentMetadata *link_;
};
|
||||
|
||||
// The master registry contains all registered component registries. A registry
|
||||
// is not registered in the master registry until the first component of that
|
||||
// type is registered.
|
||||
class RegistryMetadata : public ComponentMetadata {
|
||||
public:
|
||||
RegistryMetadata(const char *name, const char *class_name, const char *file,
|
||||
int line)
|
||||
: ComponentMetadata(name, class_name, file, line) {}
|
||||
|
||||
// Registers a component registry in the master registry.
|
||||
static void Register(RegistryMetadata *registry);
|
||||
};
|
||||
|
||||
// Registry for components. An object can be registered with a type name in the
|
||||
// registry. The named instances in the registry can be returned using the
|
||||
// Lookup() method. The components in the registry are put into a linked list
|
||||
// of components. It is important that the component registry can be statically
|
||||
// initialized in order not to depend on initialization order.
|
||||
template <class T>
|
||||
struct ComponentRegistry {
|
||||
typedef ComponentRegistry<T> Self;
|
||||
|
||||
// Component registration class.
|
||||
class Registrar : public ComponentMetadata {
|
||||
public:
|
||||
// Registers new component by linking itself into the component list of
|
||||
// the registry.
|
||||
Registrar(Self *registry, const char *type, const char *class_name,
|
||||
const char *file, int line, T *object)
|
||||
: ComponentMetadata(type, class_name, file, line), object_(object) {
|
||||
// Register registry in master registry if this is the first registered
|
||||
// component of this type.
|
||||
if (registry->components == NULL) {
|
||||
RegistryMetadata::Register(
|
||||
new RegistryMetadata(registry->name, registry->class_name,
|
||||
registry->file, registry->line));
|
||||
}
|
||||
|
||||
// Register component in registry.
|
||||
set_link(registry->components);
|
||||
registry->components = this;
|
||||
}
|
||||
|
||||
// Returns component type.
|
||||
const char *type() const { return name(); }
|
||||
|
||||
// Returns component object.
|
||||
T *object() const { return object_; }
|
||||
|
||||
// Returns the next component in the component list.
|
||||
Registrar *next() const { return static_cast<Registrar *>(link()); }
|
||||
|
||||
private:
|
||||
// Component object.
|
||||
T *object_;
|
||||
};
|
||||
|
||||
// Finds registrar for named component in registry.
|
||||
const Registrar *GetComponent(const char *type) const {
|
||||
Registrar *r = components;
|
||||
while (r != NULL && strcmp(type, r->type()) != 0) r = r->next();
|
||||
CLD3_DCHECK(r != nullptr);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Finds a named component in the registry.
|
||||
T *Lookup(const char *type) const { return GetComponent(type)->object(); }
|
||||
T *Lookup(const string &type) const { return Lookup(type.c_str()); }
|
||||
|
||||
// Textual description of the kind of components in the registry.
|
||||
const char *name;
|
||||
|
||||
// Base class name of component type.
|
||||
const char *class_name;
|
||||
|
||||
// File and line where the registry is defined.
|
||||
const char *file;
|
||||
int line;
|
||||
|
||||
// Linked list of registered components.
|
||||
Registrar *components;
|
||||
};
|
||||
|
||||
// Base class for registerable class-based components.
|
||||
template <class T>
|
||||
class RegisterableClass {
|
||||
public:
|
||||
// Factory function type.
|
||||
typedef T *(Factory)();
|
||||
|
||||
// Registry type.
|
||||
typedef ComponentRegistry<Factory> Registry;
|
||||
|
||||
// Should be called before any call to Create() or registry(), i.e., before
|
||||
// using the registration mechanism to register and or instantiate subclasses
|
||||
// of T.
|
||||
static void CreateRegistry(
|
||||
const char *name,
|
||||
const char *class_name,
|
||||
const char *file,
|
||||
int line) {
|
||||
registry_ = new Registry();
|
||||
registry_->name = name;
|
||||
registry_->class_name = class_name;
|
||||
registry_->file = file;
|
||||
registry_->line = line;
|
||||
registry_->components = nullptr;
|
||||
}
|
||||
|
||||
// Should be called when one is done using the registration mechanism for
|
||||
// class T.
|
||||
static void DeleteRegistry() {
|
||||
delete registry_;
|
||||
registry_ = nullptr;
|
||||
}
|
||||
|
||||
// Creates a new component instance.
|
||||
static T *Create(const string &type) { return registry()->Lookup(type)(); }
|
||||
|
||||
// Returns registry for class.
|
||||
static Registry *registry() { return registry_; }
|
||||
|
||||
private:
|
||||
// Registry for class.
|
||||
static Registry *registry_;
|
||||
};
|
||||
|
||||
// Base class for registerable instance-based components.
|
||||
template <class T>
|
||||
class RegisterableInstance {
|
||||
public:
|
||||
// Registry type.
|
||||
typedef ComponentRegistry<T> Registry;
|
||||
|
||||
private:
|
||||
// Registry for class.
|
||||
static Registry registry_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // REGISTRY_H_
|
||||
89
Telegram/ThirdParty/cld3/src/relevant_script_feature.cc
vendored
Normal file
89
Telegram/ThirdParty/cld3/src/relevant_script_feature.cc
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "relevant_script_feature.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "feature_types.h"
|
||||
#include "language_identifier_features.h"
|
||||
#include "script_detector.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "utils.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
void RelevantScriptFeature::Setup(TaskContext *context) {
|
||||
// Nothing.
|
||||
}
|
||||
|
||||
void RelevantScriptFeature::Init(TaskContext *context) {
|
||||
set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts));
|
||||
}
|
||||
|
||||
void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
|
||||
const Sentence &sentence,
|
||||
FeatureVector *result) const {
|
||||
const string &text = sentence.text();
|
||||
|
||||
// We expect kNumRelevantScripts to be small, so we stack-allocate the array
|
||||
// of counts. Still, if that changes, we want to find out.
|
||||
static_assert(
|
||||
kNumRelevantScripts < 25,
|
||||
"switch counts to vector<int>: too big for stack-allocated int[]");
|
||||
|
||||
// counts[s] is the number of characters with script s.
|
||||
// Note: {} "value-initializes" the array to zero.
|
||||
int counts[kNumRelevantScripts]{};
|
||||
int total_count = 0;
|
||||
const char *const text_end = text.data() + text.size();
|
||||
for (const char *curr = text.data(); curr < text_end;
|
||||
curr += utils::OneCharLen(curr)) {
|
||||
const int num_bytes = utils::OneCharLen(curr);
|
||||
|
||||
// If a partial UTF-8 character is encountered, break out of the loop.
|
||||
if (curr + num_bytes > text_end) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Skip spaces, numbers, punctuation, and all other non-alpha ASCII
|
||||
// characters: these characters are used in so many languages, they do not
|
||||
// communicate language-related information.
|
||||
if ((num_bytes == 1) && !isalpha(*curr)) {
|
||||
continue;
|
||||
}
|
||||
Script script = GetScript(curr, num_bytes);
|
||||
CLD3_DCHECK(script >= 0);
|
||||
CLD3_DCHECK(script < kNumRelevantScripts);
|
||||
counts[static_cast<int>(script)]++;
|
||||
total_count++;
|
||||
}
|
||||
|
||||
for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
|
||||
int count = counts[script_id];
|
||||
if (count > 0) {
|
||||
const float weight = static_cast<float>(count) / total_count;
|
||||
FloatFeatureValue value(script_id, weight);
|
||||
result->add(feature_type(), value.discrete_value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
49
Telegram/ThirdParty/cld3/src/relevant_script_feature.h
vendored
Normal file
49
Telegram/ThirdParty/cld3/src/relevant_script_feature.h
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef RELEVANT_SCRIPT_FEATURE_H_
|
||||
#define RELEVANT_SCRIPT_FEATURE_H_
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
|
||||
// script (see below): each such feature indicates the script and the ratio of
|
||||
// UTF8 characters in that script, in the given sentence.
|
||||
//
|
||||
// What is a relevant script? Recognizing all 100+ Unicode scripts would
|
||||
// require too much code size and runtime. Instead, we focus only on a few
|
||||
// scripts that communicate a lot of language information: e.g., the use of
|
||||
// Hiragana characters almost always indicates Japanese, so Hiragana is a
|
||||
// "relevant" script for us. The Latin script is used by dozens of language, so
|
||||
// Latin is not relevant in this context.
|
||||
class RelevantScriptFeature : public WholeSentenceFeature {
|
||||
public:
|
||||
void Setup(TaskContext *context) override;
|
||||
void Init(TaskContext *context) override;
|
||||
|
||||
// Appends the features computed from the sentence to the feature vector.
|
||||
void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
|
||||
FeatureVector *result) const override;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // RELEVANT_SCRIPT_FEATURE_H_
|
||||
259
Telegram/ThirdParty/cld3/src/relevant_script_feature_test.cc
vendored
Normal file
259
Telegram/ThirdParty/cld3/src/relevant_script_feature_test.cc
vendored
Normal file
@@ -0,0 +1,259 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "feature_types.h"
|
||||
#include "relevant_script_feature.h"
|
||||
#include "script_detector.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
#include "sentence_features.h"
|
||||
#include "task_context.h"
|
||||
#include "utils.h"
|
||||
#include "workspace.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace relevant_script_feature_test {
|
||||
namespace {
|
||||
// Returns true if the expected and the actual float feature values differ by
// less than 0.0001.
bool FeatureValuesNear(float expected_value, float actual_value) {
  const float difference = expected_value - actual_value;
  return std::abs(difference) < 0.0001;
}
|
||||
|
||||
// Checks whether two sets of feature values are within an acceptable amount of
|
||||
// each other.
|
||||
bool FeaturesNear(const string &test_input,
|
||||
const std::map<int, float> &expected_features,
|
||||
const std::map<int, float> &actual_features) {
|
||||
if (expected_features.size() != actual_features.size()) {
|
||||
std::cout << " Failure for input: " << test_input << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto &id_and_value : expected_features) {
|
||||
const int id = id_and_value.first;
|
||||
if (actual_features.count(id) == 0 ||
|
||||
!FeatureValuesNear(expected_features.at(id), actual_features.at(id))) {
|
||||
std::cout << " Failure for input: " << test_input << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::cout << " Success for input: " << test_input << std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if |actual_features| is empty. Prints one success or failure
// line for |input| to stdout.
bool CheckFeaturesEmpty(const string &input,
                        const std::map<int, float> &actual_features) {
  const bool no_features = actual_features.empty();
  if (no_features) {
    std::cout << " Success for input: " << input << std::endl;
  } else {
    std::cout << " Failure for input: " << input << std::endl;
  }
  return no_features;
}
|
||||
} // namespace
|
||||
|
||||
// Factory function registered for RelevantScriptFeature: returns a newly
// allocated instance.
static WholeSentenceFeature *rsf_factory() {
  return new RelevantScriptFeature;
}
|
||||
|
||||
class RelevantScriptFeatureExtractor {
|
||||
public:
|
||||
RelevantScriptFeatureExtractor() {
|
||||
if (WholeSentenceFeature::registry() == nullptr) {
|
||||
// Create registry for our WholeSentenceFeature(s).
|
||||
RegisterableClass<WholeSentenceFeature>::CreateRegistry(
|
||||
"sentence feature function", "WholeSentenceFeature", __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
|
||||
// Register our WholeSentenceFeature(s).
|
||||
// Register RelevantScriptFeature feature function.
|
||||
static WholeSentenceFeature::Registry::Registrar rsf_registrar(
|
||||
WholeSentenceFeature::registry(), "continuous-bag-of-relevant-scripts",
|
||||
"RelevantScriptFeature", __FILE__, __LINE__, rsf_factory);
|
||||
|
||||
feature_extractor_.Parse("continuous-bag-of-relevant-scripts");
|
||||
TaskContext context;
|
||||
feature_extractor_.Setup(&context);
|
||||
feature_extractor_.Init(&context);
|
||||
feature_extractor_.RequestWorkspaces(&workspace_registry_);
|
||||
}
|
||||
|
||||
// Returns "true" if feature extraction is successful, and "false" otherwise.
|
||||
bool Extract(const string &text, std::map<int, float> *float_features) {
|
||||
float_features->clear();
|
||||
if (text.empty()) {
|
||||
return true;
|
||||
}
|
||||
Sentence sentence;
|
||||
sentence.set_text(text);
|
||||
workspace_.Reset(workspace_registry_);
|
||||
feature_extractor_.Preprocess(&workspace_, &sentence);
|
||||
FeatureVector feature_vector;
|
||||
feature_extractor_.ExtractFeatures(workspace_, sentence, &feature_vector);
|
||||
|
||||
for (int index = 0; index < feature_vector.size(); ++index) {
|
||||
const FloatFeatureValue value =
|
||||
FloatFeatureValue(feature_vector.value(index));
|
||||
if (float_features->count(value.value.id) != 0) {
|
||||
std::cout << " Failure: duplicate feature" << std::endl;
|
||||
return false;
|
||||
}
|
||||
float_features->emplace(value.value.id, value.value.weight);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
WorkspaceSet workspace_;
|
||||
WholeSentenceExtractor feature_extractor_;
|
||||
|
||||
// The registry of shared workspaces in the feature extractor.
|
||||
WorkspaceRegistry workspace_registry_;
|
||||
};
|
||||
|
||||
bool TestCommonCases() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
RelevantScriptFeatureExtractor extractor;
|
||||
std::map<int, float> float_features;
|
||||
bool test_successful = true;
|
||||
|
||||
string input = "just some plain text";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
input = "ヸヂ゠ヂ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptKatakana, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// 4 Latin letters mixed with 4 Katakana letters.
|
||||
input = "ヸtヂe゠xtヂ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.5},
|
||||
{chrome_lang_id::kScriptKatakana, 0.5}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
input = "just some 121212%^^( ヸヂ゠ヂ text";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptOtherUtf8OneByte, 0.75},
|
||||
{chrome_lang_id::kScriptKatakana, 0.25}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
return test_successful;
|
||||
}
|
||||
|
||||
bool TestCornerCases() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
RelevantScriptFeatureExtractor extractor;
|
||||
std::map<int, float> float_features;
|
||||
bool test_successful = true;
|
||||
|
||||
// Empty string.
|
||||
string input = "";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!CheckFeaturesEmpty(input, float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Only whitespaces.
|
||||
input = " ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!CheckFeaturesEmpty(input, float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Only numbers and punctuation.
|
||||
input = "12----)(";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!CheckFeaturesEmpty(input, float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Only numbers, punctuation, and spaces.
|
||||
input = "12--- - ) ( ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!CheckFeaturesEmpty(input, float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// One UTF8 character by itself.
|
||||
input = "ゟ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
input = "ה";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptHebrew, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// One UTF8 character with some numbers / punctuation / spaces: character at
|
||||
// one extremity or in the middle.
|
||||
input = "1234ゟ";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
input = "ゟ12-(";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
input = "8*1ゟ12----";
|
||||
if (!extractor.Extract(input, &float_features) ||
|
||||
!FeaturesNear(input, {{chrome_lang_id::kScriptHiragana, 1.00}},
|
||||
float_features)) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
return test_successful;
|
||||
}
|
||||
|
||||
} // namespace relevant_script_feature_test
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
// Runs the feature extraction tests.
|
||||
int main(int argc, char **argv) {
|
||||
const bool tests_successful =
|
||||
chrome_lang_id::relevant_script_feature_test::TestCommonCases() &&
|
||||
chrome_lang_id::relevant_script_feature_test::TestCornerCases();
|
||||
return tests_successful ? 0 : 1;
|
||||
}
|
||||
156
Telegram/ThirdParty/cld3/src/script_detector.h
vendored
Normal file
156
Telegram/ThirdParty/cld3/src/script_detector.h
vendored
Normal file
@@ -0,0 +1,156 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef SCRIPT_DETECTOR_H_
|
||||
#define SCRIPT_DETECTOR_H_
|
||||
|
||||
namespace chrome_lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we can
  // easily iterate over the scripts.
  kNumRelevantScripts,
};

// Returns true iff value lies in the closed interval [low, hi].
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
//
// Parameters:
//   p          pointer to the first byte of the character.
//   num_bytes  length in bytes of the UTF8 encoding of that character.
//
// Returns one of the detected scripts when the decoded code point falls in
// its range, the kScriptOtherUtf8* value matching num_bytes otherwise, and
// kScriptError when num_bytes is not 1, 2, 3 or 4.
//
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
      // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
      // it's enough.  It's also usually the fastest int type on the current
      // CPU, so it's better to use than int32.
      static const unsigned int kGreekStart = 0x370;

      // Not declared (unused in the code): kGreekEnd = 0x3FF;
      static const unsigned int kCyrillicStart = 0x400;
      static const unsigned int kCyrillicEnd = 0x4FF;
      static const unsigned int kHebrewStart = 0x590;

      // Not declared (unused in the code): kHebrewEnd = 0x5FF;
      static const unsigned int kArabicStart = 0x600;
      static const unsigned int kArabicEnd = 0x6FF;

      // Payload bits: low 5 bits of the lead byte, low 6 bits of the
      // continuation byte.
      const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
      if (codepoint > kCyrillicEnd) {
        if (codepoint >= kArabicStart) {
          if (codepoint <= kArabicEnd) {
            return kScriptArabic;
          }
        } else {
          // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
          // codepoint <= kHebrewEnd.
          if (codepoint >= kHebrewStart) {
            return kScriptHebrew;
          }
        }
      } else {
        if (codepoint >= kCyrillicStart) {
          return kScriptCyrillic;
        } else {
          // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
          // codepoint <= kGreekEnd.
          if (codepoint >= kGreekStart) {
            return kScriptGreek;
          }
        }
      }
      return kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
      // at least 16 bits.
      static const unsigned int kHangulJamoStart = 0x1100;
      static const unsigned int kHangulJamoEnd = 0x11FF;
      static const unsigned int kHiraganaStart = 0x3041;
      static const unsigned int kHiraganaEnd = 0x309F;

      // Not declared (unused in the code): kKatakanaStart = 0x30A0;
      static const unsigned int kKatakanaEnd = 0x30FF;

      // Payload bits: low 4 bits of the lead byte, low 6 bits of each of the
      // two continuation bytes.
      const unsigned int codepoint =
          ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
      if (codepoint > kHiraganaEnd) {
        // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
        // codepoint >= kKatakanaStart.
        if (codepoint <= kKatakanaEnd) {
          return kScriptKatakana;
        }
      } else {
        if (codepoint >= kHiraganaStart) {
          return kScriptHiragana;
        } else {
          if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
            return kScriptHangulJamo;
          }
        }
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Returns Script for the UTF8 character that starts at address p.  Similar to
// the previous version of GetScript, except for "char" vs "unsigned char".
// Most code works with "char *" pointers, but whether plain char is signed or
// unsigned is implementation-defined and differs between platforms.  This
// overload reinterprets the bytes as unsigned char so that the bit
// manipulation in the other overload always sees unsigned values.
inline Script GetScript(const char *p, int num_bytes) {
  return GetScript(reinterpret_cast<const unsigned char *>(p), num_bytes);
}

}  // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_DETECTOR_H_
|
||||
161
Telegram/ThirdParty/cld3/src/script_detector_test.cc
vendored
Normal file
161
Telegram/ThirdParty/cld3/src/script_detector_test.cc
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "script_detector.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace script_detector_test {
|
||||
|
||||
Script GetScript(const char *p) {
|
||||
const int num_bytes = utils::OneCharLen(p);
|
||||
return chrome_lang_id::GetScript(p, num_bytes);
|
||||
}
|
||||
|
||||
bool PrintAndReturnStatus(bool status) {
|
||||
if (status) {
|
||||
std::cout << " Success" << std::endl;
|
||||
return true;
|
||||
} else {
|
||||
std::cout << " Failure" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool TestGreekScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// The first two conditions check first / last character from the Greek and
|
||||
// Coptic script. The last two ones are negative tests.
|
||||
return PrintAndReturnStatus(
|
||||
kScriptGreek == GetScript("Ͱ") && kScriptGreek == GetScript("Ͽ") &&
|
||||
kScriptGreek == GetScript("δ") && kScriptGreek == GetScript("Θ") &&
|
||||
kScriptGreek == GetScript("Δ") && kScriptGreek != GetScript("a") &&
|
||||
kScriptGreek != GetScript("0"));
|
||||
}
|
||||
|
||||
bool TestCyrillicScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(
|
||||
kScriptCyrillic == GetScript("Ѐ") && kScriptCyrillic == GetScript("ӿ") &&
|
||||
kScriptCyrillic == GetScript("ш") && kScriptCyrillic == GetScript("Б") &&
|
||||
kScriptCyrillic == GetScript("Ӱ"));
|
||||
}
|
||||
|
||||
bool TestHebrewScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(
|
||||
kScriptHebrew == GetScript("֑") && kScriptHebrew == GetScript("״") &&
|
||||
kScriptHebrew == GetScript("ד") && kScriptHebrew == GetScript("ה") &&
|
||||
kScriptHebrew == GetScript("צ"));
|
||||
}
|
||||
|
||||
bool TestArabicScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(kScriptArabic == GetScript("م") &&
|
||||
kScriptArabic == GetScript("خ"));
|
||||
}
|
||||
|
||||
bool TestHangulJamoScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(kScriptHangulJamo == GetScript("ᄀ") &&
|
||||
kScriptHangulJamo == GetScript("ᇿ") &&
|
||||
kScriptHangulJamo == GetScript("ᄡ") &&
|
||||
kScriptHangulJamo == GetScript("ᆅ") &&
|
||||
kScriptHangulJamo == GetScript("ᅘ"));
|
||||
}
|
||||
|
||||
bool TestHiraganaScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(kScriptHiragana == GetScript("ぁ") &&
|
||||
kScriptHiragana == GetScript("ゟ") &&
|
||||
kScriptHiragana == GetScript("こ") &&
|
||||
kScriptHiragana == GetScript("や") &&
|
||||
kScriptHiragana == GetScript("ぜ"));
|
||||
}
|
||||
|
||||
bool TestKatakanaScript() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
return PrintAndReturnStatus(kScriptKatakana == GetScript("゠") &&
|
||||
kScriptKatakana == GetScript("ヿ") &&
|
||||
kScriptKatakana == GetScript("ヂ") &&
|
||||
kScriptKatakana == GetScript("ザ") &&
|
||||
kScriptKatakana == GetScript("ヸ"));
|
||||
}
|
||||
|
||||
bool TestOtherScripts() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
bool test_successful = true;
|
||||
|
||||
if (kScriptOtherUtf8OneByte != GetScript("^") ||
|
||||
kScriptOtherUtf8OneByte != GetScript("$")) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Unrecognized 2-byte scripts. For info on the scripts mentioned below, see
|
||||
// http://www.unicode.org/charts/#scripts Note: the scripts below are uniquely
|
||||
// associated with a language. Still, the number of queries in those
|
||||
// languages is small and we didn't want to increase the code size and
|
||||
// latency, so (at least for now) we do not treat them specially.
|
||||
// The following three tests are, respectively, for Armenian, Syriac and
|
||||
// Thaana.
|
||||
if (kScriptOtherUtf8TwoBytes != GetScript("Ձ") ||
|
||||
kScriptOtherUtf8TwoBytes != GetScript("ܔ") ||
|
||||
kScriptOtherUtf8TwoBytes != GetScript("ށ")) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Unrecognized 3-byte script: CJK Unified Ideographs: not uniquely associated
|
||||
// with a language.
|
||||
if (kScriptOtherUtf8ThreeBytes != GetScript("万") ||
|
||||
kScriptOtherUtf8ThreeBytes != GetScript("両")) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Unrecognized 4-byte script: CJK Unified Ideographs Extension C. Note:
|
||||
// there is a nice UTF-8 encoder / decoder at https://mothereff.in/utf-8
|
||||
if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAA\x9C\x94")) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
// Unrecognized 4-byte script: CJK Unified Ideographs Extension E
|
||||
if (kScriptOtherUtf8FourBytes != GetScript("\xF0\xAB\xA0\xB5") ||
|
||||
kScriptOtherUtf8FourBytes != GetScript("\xF0\xAC\xBA\xA1")) {
|
||||
test_successful = false;
|
||||
}
|
||||
|
||||
return PrintAndReturnStatus(test_successful);
|
||||
}
|
||||
|
||||
} // namespace script_detector_test
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
// Runs the feature extraction tests.
|
||||
int main(int argc, char **argv) {
|
||||
const bool tests_successful =
|
||||
chrome_lang_id::script_detector_test::TestGreekScript() &&
|
||||
chrome_lang_id::script_detector_test::TestCyrillicScript() &&
|
||||
chrome_lang_id::script_detector_test::TestHebrewScript() &&
|
||||
chrome_lang_id::script_detector_test::TestArabicScript() &&
|
||||
chrome_lang_id::script_detector_test::TestHangulJamoScript() &&
|
||||
chrome_lang_id::script_detector_test::TestHiraganaScript() &&
|
||||
chrome_lang_id::script_detector_test::TestKatakanaScript() &&
|
||||
chrome_lang_id::script_detector_test::TestOtherScripts();
|
||||
|
||||
return tests_successful ? 0 : 1;
|
||||
}
|
||||
11
Telegram/ThirdParty/cld3/src/script_span/README.md
vendored
Normal file
11
Telegram/ThirdParty/cld3/src/script_span/README.md
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
The code in this directory identifies the scripts present in a given piece of
text, along with the corresponding spans. The code was copied from
[CLD2](https://github.com/CLD2Owners/cld2) and slightly refactored. It can
be further simplified and cleaned up.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
55
Telegram/ThirdParty/cld3/src/script_span/fixunicodevalue.cc
vendored
Normal file
55
Telegram/ThirdParty/cld3/src/script_span/fixunicodevalue.cc
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Routine that maps a Unicode code point to an interchange-valid one
|
||||
//
|
||||
|
||||
#include "fixunicodevalue.h"
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Guarantees that the resulting output value is interchange valid
|
||||
// 00-FF; map to spaces or MS CP1252
|
||||
// D800-DFFF; surrogates
|
||||
// FDD0-FDEF; non-characters
|
||||
// xxFFFE-xxFFFF; non-characters
|
||||
char32 FixUnicodeValue(char32 uv) {
|
||||
uint32 uuv = static_cast<uint32>(uv);
|
||||
if (uuv < 0x0100) {
|
||||
return kMapFullMicrosoft1252OrSpace[uuv];
|
||||
}
|
||||
if (uuv < 0xD800) {
|
||||
return uv;
|
||||
}
|
||||
if ((uuv & ~0x0F) == 0xFDD0) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((uuv & ~0x0F) == 0xFDE0) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((uuv & 0x00FFFE) == 0xFFFE) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((0xE000 <= uuv) && (uuv <= 0x10FFFF)) {
|
||||
return uv;
|
||||
}
|
||||
// surrogates and negative and > 0x10FFFF all land here
|
||||
return 0xFFFD;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
69
Telegram/ThirdParty/cld3/src/script_span/fixunicodevalue.h
vendored
Normal file
69
Telegram/ThirdParty/cld3/src/script_span/fixunicodevalue.h
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Routine that maps a Unicode code point to an interchange-valid one
|
||||
//
|
||||
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
||||
// code points. C0 and C1 control codes that are not interchange-valid
|
||||
// are mapped to spaces.
|
||||
|
||||
|
||||
#ifndef SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
||||
#define SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
||||
|
||||
#include "integral_types.h" // for char32
|
||||
#include "port.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Map byte value 0000-00FF to char32
|
||||
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
||||
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
||||
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
||||
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
||||
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
||||
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
||||
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
||||
|
||||
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
||||
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
||||
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
||||
|
||||
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
||||
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
||||
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
||||
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
||||
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
||||
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
||||
|
||||
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
||||
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
||||
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
||||
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
||||
};
|
||||
|
||||
// Guarantees that the resulting output value is interchange valid
|
||||
// 00-FF; map to spaces or MS CP1252
|
||||
// D800-DFFF; surrogates
|
||||
// FDD0-FDEF; non-characters
|
||||
// xxFFFE-xxFFFF; non-characters
|
||||
char32 FixUnicodeValue(char32 uv);
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_FIXUNICODEVALUE_H_
|
||||
296
Telegram/ThirdParty/cld3/src/script_span/generated_entities.cc
vendored
Normal file
296
Telegram/ThirdParty/cld3/src/script_span/generated_entities.cc
vendored
Normal file
@@ -0,0 +1,296 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_entities.cc
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for HTML entities recognized by CLD2
|
||||
//
|
||||
#include "generated_ulscript.h" // for CharIntPair
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToEntitySize = 265;
|
||||
extern const CharIntPair kNameToEntity[kNameToEntitySize] = {
|
||||
{"AElig", 198},
|
||||
{"AMP", 38},
|
||||
{"Aacute", 193},
|
||||
{"Acirc", 194},
|
||||
{"Agrave", 192},
|
||||
{"Alpha", 913},
|
||||
{"Aring", 197},
|
||||
{"Atilde", 195},
|
||||
{"Auml", 196},
|
||||
{"Beta", 914},
|
||||
{"Ccaron", 268},
|
||||
{"Ccedil", 199},
|
||||
{"Chi", 935},
|
||||
{"Dagger", 8225},
|
||||
{"Delta", 916},
|
||||
{"ETH", 208},
|
||||
{"Eacute", 201},
|
||||
{"Ecaron", 282},
|
||||
{"Ecirc", 202},
|
||||
{"Egrave", 200},
|
||||
{"Epsilon", 917},
|
||||
{"Eta", 919},
|
||||
{"Euml", 203},
|
||||
{"GT", 62},
|
||||
{"Gamma", 915},
|
||||
{"Iacute", 205},
|
||||
{"Icirc", 206},
|
||||
{"Igrave", 204},
|
||||
{"Iota", 921},
|
||||
{"Iuml", 207},
|
||||
{"Kappa", 922},
|
||||
{"LT", 60},
|
||||
{"Lambda", 923},
|
||||
{"Mu", 924},
|
||||
{"Ntilde", 209},
|
||||
{"Nu", 925},
|
||||
{"OElig", 338},
|
||||
{"Oacute", 211},
|
||||
{"Ocirc", 212},
|
||||
{"Ograve", 210},
|
||||
{"Omega", 937},
|
||||
{"Omicron", 927},
|
||||
{"Oslash", 216},
|
||||
{"Otilde", 213},
|
||||
{"Ouml", 214},
|
||||
{"Phi", 934},
|
||||
{"Pi", 928},
|
||||
{"Prime", 8243},
|
||||
{"Psi", 936},
|
||||
{"QUOT", 34},
|
||||
{"Rcaron", 344},
|
||||
{"Rho", 929},
|
||||
{"Scaron", 352},
|
||||
{"Sigma", 931},
|
||||
{"THORN", 222},
|
||||
{"Tau", 932},
|
||||
{"Theta", 920},
|
||||
{"Uacute", 218},
|
||||
{"Ucirc", 219},
|
||||
{"Ugrave", 217},
|
||||
{"Upsilon", 933},
|
||||
{"Uuml", 220},
|
||||
{"Xi", 926},
|
||||
{"Yacute", 221},
|
||||
{"Yuml", 376},
|
||||
{"Zeta", 918},
|
||||
{"aacute", 225},
|
||||
{"acirc", 226},
|
||||
{"acute", 180},
|
||||
{"aelig", 230},
|
||||
{"agrave", 224},
|
||||
{"alefsym", 8501},
|
||||
{"alpha", 945},
|
||||
{"amp", 38},
|
||||
{"and", 8743},
|
||||
{"ang", 8736},
|
||||
{"apos", 39},
|
||||
{"aring", 229},
|
||||
{"asymp", 8776},
|
||||
{"atilde", 227},
|
||||
{"auml", 228},
|
||||
{"bdquo", 8222},
|
||||
{"beta", 946},
|
||||
{"brvbar", 166},
|
||||
{"bull", 8226},
|
||||
{"cap", 8745},
|
||||
{"ccaron", 269},
|
||||
{"ccedil", 231},
|
||||
{"cedil", 184},
|
||||
{"cent", 162},
|
||||
{"chi", 967},
|
||||
{"circ", 710},
|
||||
{"clubs", 9827},
|
||||
{"cong", 8773},
|
||||
{"copy", 169},
|
||||
{"crarr", 8629},
|
||||
{"cup", 8746},
|
||||
{"curren", 164},
|
||||
{"dArr", 8659},
|
||||
{"dagger", 8224},
|
||||
{"darr", 8595},
|
||||
{"deg", 176},
|
||||
{"delta", 948},
|
||||
{"diams", 9830},
|
||||
{"divide", 247},
|
||||
{"eacute", 233},
|
||||
{"ecaron", 283},
|
||||
{"ecirc", 234},
|
||||
{"egrave", 232},
|
||||
{"emdash", 8212},
|
||||
{"empty", 8709},
|
||||
{"emsp", 8195},
|
||||
{"endash", 8211},
|
||||
{"ensp", 8194},
|
||||
{"epsilon", 949},
|
||||
{"equiv", 8801},
|
||||
{"eta", 951},
|
||||
{"eth", 240},
|
||||
{"euml", 235},
|
||||
{"euro", 8364},
|
||||
{"exist", 8707},
|
||||
{"fnof", 402},
|
||||
{"forall", 8704},
|
||||
{"frac12", 189},
|
||||
{"frac14", 188},
|
||||
{"frac34", 190},
|
||||
{"frasl", 8260},
|
||||
{"gamma", 947},
|
||||
{"ge", 8805},
|
||||
{"gt", 62},
|
||||
{"hArr", 8660},
|
||||
{"harr", 8596},
|
||||
{"hearts", 9829},
|
||||
{"hellip", 8230},
|
||||
{"iacute", 237},
|
||||
{"icirc", 238},
|
||||
{"iexcl", 161},
|
||||
{"igrave", 236},
|
||||
{"image", 8465},
|
||||
{"infin", 8734},
|
||||
{"int", 8747},
|
||||
{"iota", 953},
|
||||
{"iquest", 191},
|
||||
{"isin", 8712},
|
||||
{"iuml", 239},
|
||||
{"kappa", 954},
|
||||
{"lArr", 8656},
|
||||
{"lambda", 955},
|
||||
{"lang", 9001},
|
||||
{"laquo", 171},
|
||||
{"larr", 8592},
|
||||
{"lceil", 8968},
|
||||
{"ldquo", 8220},
|
||||
{"le", 8804},
|
||||
{"lfloor", 8970},
|
||||
{"lowast", 8727},
|
||||
{"loz", 9674},
|
||||
{"lrm", 8206},
|
||||
{"lsaquo", 8249},
|
||||
{"lsquo", 8216},
|
||||
{"lt", 60},
|
||||
{"macr", 175},
|
||||
{"mdash", 8212},
|
||||
{"micro", 181},
|
||||
{"middot", 183},
|
||||
{"minus", 8722},
|
||||
{"mu", 956},
|
||||
{"nabla", 8711},
|
||||
{"nbsp", 160},
|
||||
{"ndash", 8211},
|
||||
{"ne", 8800},
|
||||
{"ni", 8715},
|
||||
{"not", 172},
|
||||
{"notin", 8713},
|
||||
{"nsub", 8836},
|
||||
{"ntilde", 241},
|
||||
{"nu", 957},
|
||||
{"oacute", 243},
|
||||
{"ocirc", 244},
|
||||
{"oelig", 339},
|
||||
{"ograve", 242},
|
||||
{"oline", 8254},
|
||||
{"omega", 969},
|
||||
{"omicron", 959},
|
||||
{"oplus", 8853},
|
||||
{"or", 8744},
|
||||
{"ordf", 170},
|
||||
{"ordm", 186},
|
||||
{"oslash", 248},
|
||||
{"otilde", 245},
|
||||
{"otimes", 8855},
|
||||
{"ouml", 246},
|
||||
{"para", 182},
|
||||
{"part", 8706},
|
||||
{"permil", 8240},
|
||||
{"perp", 8869},
|
||||
{"phi", 966},
|
||||
{"pi", 960},
|
||||
{"piv", 982},
|
||||
{"plusmn", 177},
|
||||
{"pound", 163},
|
||||
{"prime", 8242},
|
||||
{"prod", 8719},
|
||||
{"prop", 8733},
|
||||
{"psi", 968},
|
||||
{"quot", 34},
|
||||
{"rArr", 8658},
|
||||
{"radic", 8730},
|
||||
{"rang", 9002},
|
||||
{"raquo", 187},
|
||||
{"rarr", 8594},
|
||||
{"rcaron", 345},
|
||||
{"rceil", 8969},
|
||||
{"rdquo", 8221},
|
||||
{"real", 8476},
|
||||
{"reg", 174},
|
||||
{"rfloor", 8971},
|
||||
{"rho", 961},
|
||||
{"rlm", 8207},
|
||||
{"rsaquo", 8250},
|
||||
{"rsquo", 8217},
|
||||
{"sbquo", 8218},
|
||||
{"scaron", 353},
|
||||
{"sdot", 8901},
|
||||
{"sect", 167},
|
||||
{"shy", 173},
|
||||
{"sigma", 963},
|
||||
{"sigmaf", 962},
|
||||
{"sim", 8764},
|
||||
{"spades", 9824},
|
||||
{"sub", 8834},
|
||||
{"sube", 8838},
|
||||
{"sum", 8721},
|
||||
{"sup", 8835},
|
||||
{"sup1", 185},
|
||||
{"sup2", 178},
|
||||
{"sup3", 179},
|
||||
{"supe", 8839},
|
||||
{"szlig", 223},
|
||||
{"tau", 964},
|
||||
{"there4", 8756},
|
||||
{"theta", 952},
|
||||
{"thetasym", 977},
|
||||
{"thinsp", 8201},
|
||||
{"thorn", 254},
|
||||
{"tilde", 732},
|
||||
{"times", 215},
|
||||
{"trade", 8482},
|
||||
{"uArr", 8657},
|
||||
{"uacute", 250},
|
||||
{"uarr", 8593},
|
||||
{"ucirc", 251},
|
||||
{"ugrave", 249},
|
||||
{"uml", 168},
|
||||
{"upsih", 978},
|
||||
{"upsilon", 965},
|
||||
{"uuml", 252},
|
||||
{"weierp", 8472},
|
||||
{"xi", 958},
|
||||
{"yacute", 253},
|
||||
{"yen", 165},
|
||||
{"yuml", 255},
|
||||
{"zeta", 950},
|
||||
{"zwj", 8205},
|
||||
{"zwnj", 8204},
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
678
Telegram/ThirdParty/cld3/src/script_span/generated_ulscript.cc
vendored
Normal file
678
Telegram/ThirdParty/cld3/src/script_span/generated_ulscript.cc
vendored
Normal file
@@ -0,0 +1,678 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_ulscript.cc
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for scripts recognized by CLD2
|
||||
//
|
||||
|
||||
#include "generated_ulscript.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Subscripted by enum ULScript
|
||||
extern const int kULScriptToNameSize = 102;
|
||||
extern const char* const kULScriptToName[kULScriptToNameSize] = {
|
||||
"Common", // 0 Zyyy
|
||||
"Latin", // 1 Latn
|
||||
"Greek", // 2 Grek
|
||||
"Cyrillic", // 3 Cyrl
|
||||
"Armenian", // 4 Armn
|
||||
"Hebrew", // 5 Hebr
|
||||
"Arabic", // 6 Arab
|
||||
"Syriac", // 7 Syrc
|
||||
"Thaana", // 8 Thaa
|
||||
"Devanagari", // 9 Deva
|
||||
"Bengali", // 10 Beng
|
||||
"Gurmukhi", // 11 Guru
|
||||
"Gujarati", // 12 Gujr
|
||||
"Oriya", // 13 Orya
|
||||
"Tamil", // 14 Taml
|
||||
"Telugu", // 15 Telu
|
||||
"Kannada", // 16 Knda
|
||||
"Malayalam", // 17 Mlym
|
||||
"Sinhala", // 18 Sinh
|
||||
"Thai", // 19 Thai
|
||||
"Lao", // 20 Laoo
|
||||
"Tibetan", // 21 Tibt
|
||||
"Myanmar", // 22 Mymr
|
||||
"Georgian", // 23 Geor
|
||||
"Hani", // 24 Hani
|
||||
"Ethiopic", // 25 Ethi
|
||||
"Cherokee", // 26 Cher
|
||||
"Canadian_Aboriginal", // 27 Cans
|
||||
"Ogham", // 28 Ogam
|
||||
"Runic", // 29 Runr
|
||||
"Khmer", // 30 Khmr
|
||||
"Mongolian", // 31 Mong
|
||||
"", // 32
|
||||
"", // 33
|
||||
"Bopomofo", // 34 Bopo
|
||||
"", // 35
|
||||
"Yi", // 36 Yiii
|
||||
"Old_Italic", // 37 Ital
|
||||
"Gothic", // 38 Goth
|
||||
"Deseret", // 39 Dsrt
|
||||
"Inherited", // 40 Zinh
|
||||
"Tagalog", // 41 Tglg
|
||||
"Hanunoo", // 42 Hano
|
||||
"Buhid", // 43 Buhd
|
||||
"Tagbanwa", // 44 Tagb
|
||||
"Limbu", // 45 Limb
|
||||
"Tai_Le", // 46 Tale
|
||||
"Linear_B", // 47 Linb
|
||||
"Ugaritic", // 48 Ugar
|
||||
"Shavian", // 49 Shaw
|
||||
"Osmanya", // 50 Osma
|
||||
"Cypriot", // 51 Cprt
|
||||
"Braille", // 52 Brai
|
||||
"Buginese", // 53 Bugi
|
||||
"Coptic", // 54 Copt
|
||||
"New_Tai_Lue", // 55 Talu
|
||||
"Glagolitic", // 56 Glag
|
||||
"Tifinagh", // 57 Tfng
|
||||
"Syloti_Nagri", // 58 Sylo
|
||||
"Old_Persian", // 59 Xpeo
|
||||
"Kharoshthi", // 60 Khar
|
||||
"Balinese", // 61 Bali
|
||||
"Cuneiform", // 62 Xsux
|
||||
"Phoenician", // 63 Phnx
|
||||
"Phags_Pa", // 64 Phag
|
||||
"Nko", // 65 Nkoo
|
||||
"Sundanese", // 66 Sund
|
||||
"Lepcha", // 67 Lepc
|
||||
"Ol_Chiki", // 68 Olck
|
||||
"Vai", // 69 Vaii
|
||||
"Saurashtra", // 70 Saur
|
||||
"Kayah_Li", // 71 Kali
|
||||
"Rejang", // 72 Rjng
|
||||
"Lycian", // 73 Lyci
|
||||
"Carian", // 74 Cari
|
||||
"Lydian", // 75 Lydi
|
||||
"Cham", // 76 Cham
|
||||
"Tai_Tham", // 77 Lana
|
||||
"Tai_Viet", // 78 Tavt
|
||||
"Avestan", // 79 Avst
|
||||
"Egyptian_Hieroglyphs", // 80 Egyp
|
||||
"Samaritan", // 81 Samr
|
||||
"Lisu", // 82 Lisu
|
||||
"Bamum", // 83 Bamu
|
||||
"Javanese", // 84 Java
|
||||
"Meetei_Mayek", // 85 Mtei
|
||||
"Imperial_Aramaic", // 86 Armi
|
||||
"Old_South_Arabian", // 87 Sarb
|
||||
"Inscriptional_Parthian", // 88 Prti
|
||||
"Inscriptional_Pahlavi", // 89 Phli
|
||||
"Old_Turkic", // 90 Orkh
|
||||
"Kaithi", // 91 Kthi
|
||||
"Batak", // 92 Batk
|
||||
"Brahmi", // 93 Brah
|
||||
"Mandaic", // 94 Mand
|
||||
"Chakma", // 95 Cakm
|
||||
"Meroitic_Cursive", // 96 Merc
|
||||
"Meroitic_Hieroglyphs", // 97 Mero
|
||||
"Miao", // 98 Plrd
|
||||
"Sharada", // 99 Shrd
|
||||
"Sora_Sompeng", // 100 Sora
|
||||
"Takri", // 101 Takr
|
||||
};
|
||||
|
||||
// Four-letter script code for each script, subscripted by enum ULScript.
// The trailing comment on every row gives the index and the script name.
// Indices 32, 33 and 35 have no script assigned and hold empty strings.
extern const int kULScriptToCodeSize = 102;
extern const char* const kULScriptToCode[kULScriptToCodeSize] = {
  "Zyyy",  // 0 Common
  "Latn",  // 1 Latin
  "Grek",  // 2 Greek
  "Cyrl",  // 3 Cyrillic
  "Armn",  // 4 Armenian
  "Hebr",  // 5 Hebrew
  "Arab",  // 6 Arabic
  "Syrc",  // 7 Syriac
  "Thaa",  // 8 Thaana
  "Deva",  // 9 Devanagari
  "Beng",  // 10 Bengali
  "Guru",  // 11 Gurmukhi
  "Gujr",  // 12 Gujarati
  "Orya",  // 13 Oriya
  "Taml",  // 14 Tamil
  "Telu",  // 15 Telugu
  "Knda",  // 16 Kannada
  "Mlym",  // 17 Malayalam
  "Sinh",  // 18 Sinhala
  "Thai",  // 19 Thai
  "Laoo",  // 20 Lao
  "Tibt",  // 21 Tibetan
  "Mymr",  // 22 Myanmar
  "Geor",  // 23 Georgian
  "Hani",  // 24 Hani
  "Ethi",  // 25 Ethiopic
  "Cher",  // 26 Cherokee
  "Cans",  // 27 Canadian_Aboriginal
  "Ogam",  // 28 Ogham
  "Runr",  // 29 Runic
  "Khmr",  // 30 Khmer
  "Mong",  // 31 Mongolian
  "",      // 32
  "",      // 33
  "Bopo",  // 34 Bopomofo
  "",      // 35
  "Yiii",  // 36 Yi
  "Ital",  // 37 Old_Italic
  "Goth",  // 38 Gothic
  "Dsrt",  // 39 Deseret
  "Zinh",  // 40 Inherited
  "Tglg",  // 41 Tagalog
  "Hano",  // 42 Hanunoo
  "Buhd",  // 43 Buhid
  "Tagb",  // 44 Tagbanwa
  "Limb",  // 45 Limbu
  "Tale",  // 46 Tai_Le
  "Linb",  // 47 Linear_B
  "Ugar",  // 48 Ugaritic
  "Shaw",  // 49 Shavian
  "Osma",  // 50 Osmanya
  "Cprt",  // 51 Cypriot
  "Brai",  // 52 Braille
  "Bugi",  // 53 Buginese
  "Copt",  // 54 Coptic
  "Talu",  // 55 New_Tai_Lue
  "Glag",  // 56 Glagolitic
  "Tfng",  // 57 Tifinagh
  "Sylo",  // 58 Syloti_Nagri
  "Xpeo",  // 59 Old_Persian
  "Khar",  // 60 Kharoshthi
  "Bali",  // 61 Balinese
  "Xsux",  // 62 Cuneiform
  "Phnx",  // 63 Phoenician
  "Phag",  // 64 Phags_Pa
  "Nkoo",  // 65 Nko
  "Sund",  // 66 Sundanese
  "Lepc",  // 67 Lepcha
  "Olck",  // 68 Ol_Chiki
  "Vaii",  // 69 Vai
  "Saur",  // 70 Saurashtra
  "Kali",  // 71 Kayah_Li
  "Rjng",  // 72 Rejang
  "Lyci",  // 73 Lycian
  "Cari",  // 74 Carian
  "Lydi",  // 75 Lydian
  "Cham",  // 76 Cham
  "Lana",  // 77 Tai_Tham
  "Tavt",  // 78 Tai_Viet
  "Avst",  // 79 Avestan
  "Egyp",  // 80 Egyptian_Hieroglyphs
  "Samr",  // 81 Samaritan
  "Lisu",  // 82 Lisu
  "Bamu",  // 83 Bamum
  "Java",  // 84 Javanese
  "Mtei",  // 85 Meetei_Mayek
  "Armi",  // 86 Imperial_Aramaic
  "Sarb",  // 87 Old_South_Arabian
  "Prti",  // 88 Inscriptional_Parthian
  "Phli",  // 89 Inscriptional_Pahlavi
  "Orkh",  // 90 Old_Turkic
  "Kthi",  // 91 Kaithi
  "Batk",  // 92 Batak
  "Brah",  // 93 Brahmi
  "Mand",  // 94 Mandaic
  "Cakm",  // 95 Chakma
  "Merc",  // 96 Meroitic_Cursive
  "Mero",  // 97 Meroitic_Hieroglyphs
  "Plrd",  // 98 Miao
  "Shrd",  // 99 Sharada
  "Sora",  // 100 Sora_Sompeng
  "Takr",  // 101 Takri
};
|
||||
|
||||
// Spelling of each ULScript enumerator as a C identifier, subscripted by
// enum ULScript. The trailing comment on every row gives the index and the
// four-letter code. Unassigned indices 32, 33 and 35 use "ULScript_<index>".
extern const int kULScriptToCNameSize = 102;
extern const char* const kULScriptToCName[kULScriptToCNameSize] = {
  "ULScript_Common",                  // 0 Zyyy
  "ULScript_Latin",                   // 1 Latn
  "ULScript_Greek",                   // 2 Grek
  "ULScript_Cyrillic",                // 3 Cyrl
  "ULScript_Armenian",                // 4 Armn
  "ULScript_Hebrew",                  // 5 Hebr
  "ULScript_Arabic",                  // 6 Arab
  "ULScript_Syriac",                  // 7 Syrc
  "ULScript_Thaana",                  // 8 Thaa
  "ULScript_Devanagari",              // 9 Deva
  "ULScript_Bengali",                 // 10 Beng
  "ULScript_Gurmukhi",                // 11 Guru
  "ULScript_Gujarati",                // 12 Gujr
  "ULScript_Oriya",                   // 13 Orya
  "ULScript_Tamil",                   // 14 Taml
  "ULScript_Telugu",                  // 15 Telu
  "ULScript_Kannada",                 // 16 Knda
  "ULScript_Malayalam",               // 17 Mlym
  "ULScript_Sinhala",                 // 18 Sinh
  "ULScript_Thai",                    // 19 Thai
  "ULScript_Lao",                     // 20 Laoo
  "ULScript_Tibetan",                 // 21 Tibt
  "ULScript_Myanmar",                 // 22 Mymr
  "ULScript_Georgian",                // 23 Geor
  "ULScript_Hani",                    // 24 Hani
  "ULScript_Ethiopic",                // 25 Ethi
  "ULScript_Cherokee",                // 26 Cher
  "ULScript_Canadian_Aboriginal",     // 27 Cans
  "ULScript_Ogham",                   // 28 Ogam
  "ULScript_Runic",                   // 29 Runr
  "ULScript_Khmer",                   // 30 Khmr
  "ULScript_Mongolian",               // 31 Mong
  "ULScript_32",                      // 32
  "ULScript_33",                      // 33
  "ULScript_Bopomofo",                // 34 Bopo
  "ULScript_35",                      // 35
  "ULScript_Yi",                      // 36 Yiii
  "ULScript_Old_Italic",              // 37 Ital
  "ULScript_Gothic",                  // 38 Goth
  "ULScript_Deseret",                 // 39 Dsrt
  "ULScript_Inherited",               // 40 Zinh
  "ULScript_Tagalog",                 // 41 Tglg
  "ULScript_Hanunoo",                 // 42 Hano
  "ULScript_Buhid",                   // 43 Buhd
  "ULScript_Tagbanwa",                // 44 Tagb
  "ULScript_Limbu",                   // 45 Limb
  "ULScript_Tai_Le",                  // 46 Tale
  "ULScript_Linear_B",                // 47 Linb
  "ULScript_Ugaritic",                // 48 Ugar
  "ULScript_Shavian",                 // 49 Shaw
  "ULScript_Osmanya",                 // 50 Osma
  "ULScript_Cypriot",                 // 51 Cprt
  "ULScript_Braille",                 // 52 Brai
  "ULScript_Buginese",                // 53 Bugi
  "ULScript_Coptic",                  // 54 Copt
  "ULScript_New_Tai_Lue",             // 55 Talu
  "ULScript_Glagolitic",              // 56 Glag
  "ULScript_Tifinagh",                // 57 Tfng
  "ULScript_Syloti_Nagri",            // 58 Sylo
  "ULScript_Old_Persian",             // 59 Xpeo
  "ULScript_Kharoshthi",              // 60 Khar
  "ULScript_Balinese",                // 61 Bali
  "ULScript_Cuneiform",               // 62 Xsux
  "ULScript_Phoenician",              // 63 Phnx
  "ULScript_Phags_Pa",                // 64 Phag
  "ULScript_Nko",                     // 65 Nkoo
  "ULScript_Sundanese",               // 66 Sund
  "ULScript_Lepcha",                  // 67 Lepc
  "ULScript_Ol_Chiki",                // 68 Olck
  "ULScript_Vai",                     // 69 Vaii
  "ULScript_Saurashtra",              // 70 Saur
  "ULScript_Kayah_Li",                // 71 Kali
  "ULScript_Rejang",                  // 72 Rjng
  "ULScript_Lycian",                  // 73 Lyci
  "ULScript_Carian",                  // 74 Cari
  "ULScript_Lydian",                  // 75 Lydi
  "ULScript_Cham",                    // 76 Cham
  "ULScript_Tai_Tham",                // 77 Lana
  "ULScript_Tai_Viet",                // 78 Tavt
  "ULScript_Avestan",                 // 79 Avst
  "ULScript_Egyptian_Hieroglyphs",    // 80 Egyp
  "ULScript_Samaritan",               // 81 Samr
  "ULScript_Lisu",                    // 82 Lisu
  "ULScript_Bamum",                   // 83 Bamu
  "ULScript_Javanese",                // 84 Java
  "ULScript_Meetei_Mayek",            // 85 Mtei
  "ULScript_Imperial_Aramaic",        // 86 Armi
  "ULScript_Old_South_Arabian",       // 87 Sarb
  "ULScript_Inscriptional_Parthian",  // 88 Prti
  "ULScript_Inscriptional_Pahlavi",   // 89 Phli
  "ULScript_Old_Turkic",              // 90 Orkh
  "ULScript_Kaithi",                  // 91 Kthi
  "ULScript_Batak",                   // 92 Batk
  "ULScript_Brahmi",                  // 93 Brah
  "ULScript_Mandaic",                 // 94 Mand
  "ULScript_Chakma",                  // 95 Cakm
  "ULScript_Meroitic_Cursive",        // 96 Merc
  "ULScript_Meroitic_Hieroglyphs",    // 97 Mero
  "ULScript_Miao",                    // 98 Plrd
  "ULScript_Sharada",                 // 99 Shrd
  "ULScript_Sora_Sompeng",            // 100 Sora
  "ULScript_Takri",                   // 101 Takr
};
|
||||
|
||||
// Subscripted by enum ULScript
|
||||
extern const int kULScriptToRtypeSize = 102;
|
||||
extern const ULScriptRType kULScriptToRtype[kULScriptToRtypeSize] = {
|
||||
RTypeNone, // 0 Zyyy
|
||||
RTypeMany, // 1 Latn
|
||||
RTypeOne, // 2 Grek
|
||||
RTypeMany, // 3 Cyrl
|
||||
RTypeOne, // 4 Armn
|
||||
RTypeMany, // 5 Hebr
|
||||
RTypeMany, // 6 Arab
|
||||
RTypeOne, // 7 Syrc
|
||||
RTypeOne, // 8 Thaa
|
||||
RTypeMany, // 9 Deva
|
||||
RTypeMany, // 10 Beng
|
||||
RTypeOne, // 11 Guru
|
||||
RTypeOne, // 12 Gujr
|
||||
RTypeOne, // 13 Orya
|
||||
RTypeOne, // 14 Taml
|
||||
RTypeOne, // 15 Telu
|
||||
RTypeOne, // 16 Knda
|
||||
RTypeOne, // 17 Mlym
|
||||
RTypeOne, // 18 Sinh
|
||||
RTypeOne, // 19 Thai
|
||||
RTypeOne, // 20 Laoo
|
||||
RTypeMany, // 21 Tibt
|
||||
RTypeOne, // 22 Mymr
|
||||
RTypeOne, // 23 Geor
|
||||
RTypeCJK, // 24 Hani
|
||||
RTypeMany, // 25 Ethi
|
||||
RTypeOne, // 26 Cher
|
||||
RTypeOne, // 27 Cans
|
||||
RTypeNone, // 28 Ogam
|
||||
RTypeNone, // 29 Runr
|
||||
RTypeOne, // 30 Khmr
|
||||
RTypeOne, // 31 Mong
|
||||
RTypeNone, // 32
|
||||
RTypeNone, // 33
|
||||
RTypeNone, // 34 Bopo
|
||||
RTypeNone, // 35
|
||||
RTypeNone, // 36 Yiii
|
||||
RTypeNone, // 37 Ital
|
||||
RTypeNone, // 38 Goth
|
||||
RTypeNone, // 39 Dsrt
|
||||
RTypeNone, // 40 Zinh
|
||||
RTypeOne, // 41 Tglg
|
||||
RTypeNone, // 42 Hano
|
||||
RTypeNone, // 43 Buhd
|
||||
RTypeNone, // 44 Tagb
|
||||
RTypeOne, // 45 Limb
|
||||
RTypeNone, // 46 Tale
|
||||
RTypeNone, // 47 Linb
|
||||
RTypeNone, // 48 Ugar
|
||||
RTypeNone, // 49 Shaw
|
||||
RTypeNone, // 50 Osma
|
||||
RTypeNone, // 51 Cprt
|
||||
RTypeNone, // 52 Brai
|
||||
RTypeNone, // 53 Bugi
|
||||
RTypeNone, // 54 Copt
|
||||
RTypeNone, // 55 Talu
|
||||
RTypeNone, // 56 Glag
|
||||
RTypeNone, // 57 Tfng
|
||||
RTypeNone, // 58 Sylo
|
||||
RTypeNone, // 59 Xpeo
|
||||
RTypeNone, // 60 Khar
|
||||
RTypeNone, // 61 Bali
|
||||
RTypeNone, // 62 Xsux
|
||||
RTypeNone, // 63 Phnx
|
||||
RTypeNone, // 64 Phag
|
||||
RTypeNone, // 65 Nkoo
|
||||
RTypeNone, // 66 Sund
|
||||
RTypeNone, // 67 Lepc
|
||||
RTypeNone, // 68 Olck
|
||||
RTypeNone, // 69 Vaii
|
||||
RTypeNone, // 70 Saur
|
||||
RTypeNone, // 71 Kali
|
||||
RTypeNone, // 72 Rjng
|
||||
RTypeNone, // 73 Lyci
|
||||
RTypeNone, // 74 Cari
|
||||
RTypeNone, // 75 Lydi
|
||||
RTypeNone, // 76 Cham
|
||||
RTypeNone, // 77 Lana
|
||||
RTypeNone, // 78 Tavt
|
||||
RTypeNone, // 79 Avst
|
||||
RTypeNone, // 80 Egyp
|
||||
RTypeNone, // 81 Samr
|
||||
RTypeNone, // 82 Lisu
|
||||
RTypeNone, // 83 Bamu
|
||||
RTypeNone, // 84 Java
|
||||
RTypeNone, // 85 Mtei
|
||||
RTypeNone, // 86 Armi
|
||||
RTypeNone, // 87 Sarb
|
||||
RTypeNone, // 88 Prti
|
||||
RTypeNone, // 89 Phli
|
||||
RTypeNone, // 90 Orkh
|
||||
RTypeNone, // 91 Kthi
|
||||
RTypeNone, // 92 Batk
|
||||
RTypeNone, // 93 Brah
|
||||
RTypeNone, // 94 Mand
|
||||
RTypeNone, // 95 Cakm
|
||||
RTypeNone, // 96 Merc
|
||||
RTypeNone, // 97 Mero
|
||||
RTypeNone, // 98 Plrd
|
||||
RTypeNone, // 99 Shrd
|
||||
RTypeNone, // 100 Sora
|
||||
RTypeNone, // 101 Takr
|
||||
};
|
||||
|
||||
// Entry count for a default-language table subscripted by enum ULScript.
// NOTE(review): only the size is defined here; no kULScriptToDefaultLang
// array accompanies it in this file.
extern const int kULScriptToDefaultLangSize = 102;
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToULScriptSize = 105;
|
||||
extern const CharIntPair kNameToULScript[kNameToULScriptSize] = {
|
||||
{"Arabic", 6}, // Arab
|
||||
{"Armenian", 4}, // Armn
|
||||
{"Avestan", 79}, // Avst
|
||||
{"Balinese", 61}, // Bali
|
||||
{"Bamum", 83}, // Bamu
|
||||
{"Batak", 92}, // Batk
|
||||
{"Bengali", 10}, // Beng
|
||||
{"Bopomofo", 34}, // Bopo
|
||||
{"Brahmi", 93}, // Brah
|
||||
{"Braille", 52}, // Brai
|
||||
{"Buginese", 53}, // Bugi
|
||||
{"Buhid", 43}, // Buhd
|
||||
{"Canadian_Aboriginal", 27}, // Cans
|
||||
{"Carian", 74}, // Cari
|
||||
{"Chakma", 95}, // Cakm
|
||||
{"Cham", 76}, // Cham
|
||||
{"Cherokee", 26}, // Cher
|
||||
{"Common", 0}, // Zyyy
|
||||
{"Coptic", 54}, // Copt
|
||||
{"Cuneiform", 62}, // Xsux
|
||||
{"Cypriot", 51}, // Cprt
|
||||
{"Cyrillic", 3}, // Cyrl
|
||||
{"Deseret", 39}, // Dsrt
|
||||
{"Devanagari", 9}, // Deva
|
||||
{"Egyptian_Hieroglyphs", 80}, // Egyp
|
||||
{"Ethiopic", 25}, // Ethi
|
||||
{"Georgian", 23}, // Geor
|
||||
{"Glagolitic", 56}, // Glag
|
||||
{"Gothic", 38}, // Goth
|
||||
{"Greek", 2}, // Grek
|
||||
{"Gujarati", 12}, // Gujr
|
||||
{"Gurmukhi", 11}, // Guru
|
||||
{"Han", 24}, // Hant
|
||||
{"Han", 24}, // Hans
|
||||
{"Han", 24}, // Hani
|
||||
{"Hangul", 24}, // Hang
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hanunoo", 42}, // Hano
|
||||
{"Hebrew", 5}, // Hebr
|
||||
{"Hiragana", 24}, // Hira
|
||||
{"Imperial_Aramaic", 86}, // Armi
|
||||
{"Inherited", 40}, // Zinh
|
||||
{"Inscriptional_Pahlavi", 89}, // Phli
|
||||
{"Inscriptional_Parthian", 88}, // Prti
|
||||
{"Javanese", 84}, // Java
|
||||
{"Kaithi", 91}, // Kthi
|
||||
{"Kannada", 16}, // Knda
|
||||
{"Katakana", 24}, // Kana
|
||||
{"Kayah_Li", 71}, // Kali
|
||||
{"Kharoshthi", 60}, // Khar
|
||||
{"Khmer", 30}, // Khmr
|
||||
{"Lao", 20}, // Laoo
|
||||
{"Latin", 1}, // Latn
|
||||
{"Lepcha", 67}, // Lepc
|
||||
{"Limbu", 45}, // Limb
|
||||
{"Linear_B", 47}, // Linb
|
||||
{"Lisu", 82}, // Lisu
|
||||
{"Lycian", 73}, // Lyci
|
||||
{"Lydian", 75}, // Lydi
|
||||
{"Malayalam", 17}, // Mlym
|
||||
{"Mandaic", 94}, // Mand
|
||||
{"Meetei_Mayek", 85}, // Mtei
|
||||
{"Meroitic_Cursive", 96}, // Merc
|
||||
{"Meroitic_Hieroglyphs", 97}, // Mero
|
||||
{"Miao", 98}, // Plrd
|
||||
{"Mongolian", 31}, // Mong
|
||||
{"Myanmar", 22}, // Mymr
|
||||
{"New_Tai_Lue", 55}, // Talu
|
||||
{"Nko", 65}, // Nkoo
|
||||
{"Ogham", 28}, // Ogam
|
||||
{"Ol_Chiki", 68}, // Olck
|
||||
{"Old_Italic", 37}, // Ital
|
||||
{"Old_Persian", 59}, // Xpeo
|
||||
{"Old_South_Arabian", 87}, // Sarb
|
||||
{"Old_Turkic", 90}, // Orkh
|
||||
{"Oriya", 13}, // Orya
|
||||
{"Osmanya", 50}, // Osma
|
||||
{"Phags_Pa", 64}, // Phag
|
||||
{"Phoenician", 63}, // Phnx
|
||||
{"Rejang", 72}, // Rjng
|
||||
{"Runic", 29}, // Runr
|
||||
{"Samaritan", 81}, // Samr
|
||||
{"Saurashtra", 70}, // Saur
|
||||
{"Sharada", 99}, // Shrd
|
||||
{"Shavian", 49}, // Shaw
|
||||
{"Sinhala", 18}, // Sinh
|
||||
{"Sora_Sompeng", 100}, // Sora
|
||||
{"Sundanese", 66}, // Sund
|
||||
{"Syloti_Nagri", 58}, // Sylo
|
||||
{"Syriac", 7}, // Syrc
|
||||
{"Tagalog", 41}, // Tglg
|
||||
{"Tagbanwa", 44}, // Tagb
|
||||
{"Tai_Le", 46}, // Tale
|
||||
{"Tai_Tham", 77}, // Lana
|
||||
{"Tai_Viet", 78}, // Tavt
|
||||
{"Takri", 101}, // Takr
|
||||
{"Tamil", 14}, // Taml
|
||||
{"Telugu", 15}, // Telu
|
||||
{"Thaana", 8}, // Thaa
|
||||
{"Thai", 19}, // Thai
|
||||
{"Tibetan", 21}, // Tibt
|
||||
{"Tifinagh", 57}, // Tfng
|
||||
{"Ugaritic", 48}, // Ugar
|
||||
{"Vai", 69}, // Vaii
|
||||
{"Yi", 36}, // Yiii
|
||||
};
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kCodeToULScriptSize = 105;
|
||||
extern const CharIntPair kCodeToULScript[kNameToULScriptSize] = {
|
||||
{"Arab", 6}, // Arab
|
||||
{"Armi", 86}, // Armi
|
||||
{"Armn", 4}, // Armn
|
||||
{"Avst", 79}, // Avst
|
||||
{"Bali", 61}, // Bali
|
||||
{"Bamu", 83}, // Bamu
|
||||
{"Batk", 92}, // Batk
|
||||
{"Beng", 10}, // Beng
|
||||
{"Bopo", 34}, // Bopo
|
||||
{"Brah", 93}, // Brah
|
||||
{"Brai", 52}, // Brai
|
||||
{"Bugi", 53}, // Bugi
|
||||
{"Buhd", 43}, // Buhd
|
||||
{"Cakm", 95}, // Cakm
|
||||
{"Cans", 27}, // Cans
|
||||
{"Cari", 74}, // Cari
|
||||
{"Cham", 76}, // Cham
|
||||
{"Cher", 26}, // Cher
|
||||
{"Copt", 54}, // Copt
|
||||
{"Cprt", 51}, // Cprt
|
||||
{"Cyrl", 3}, // Cyrl
|
||||
{"Deva", 9}, // Deva
|
||||
{"Dsrt", 39}, // Dsrt
|
||||
{"Egyp", 80}, // Egyp
|
||||
{"Ethi", 25}, // Ethi
|
||||
{"Geor", 23}, // Geor
|
||||
{"Glag", 56}, // Glag
|
||||
{"Goth", 38}, // Goth
|
||||
{"Grek", 2}, // Grek
|
||||
{"Gujr", 12}, // Gujr
|
||||
{"Guru", 11}, // Guru
|
||||
{"Hang", 24}, // Hang
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hano", 42}, // Hano
|
||||
{"Hans", 24}, // Hans
|
||||
{"Hant", 24}, // Hant
|
||||
{"Hebr", 5}, // Hebr
|
||||
{"Hira", 24}, // Hira
|
||||
{"Ital", 37}, // Ital
|
||||
{"Java", 84}, // Java
|
||||
{"Kali", 71}, // Kali
|
||||
{"Kana", 24}, // Kana
|
||||
{"Khar", 60}, // Khar
|
||||
{"Khmr", 30}, // Khmr
|
||||
{"Knda", 16}, // Knda
|
||||
{"Kthi", 91}, // Kthi
|
||||
{"Lana", 77}, // Lana
|
||||
{"Laoo", 20}, // Laoo
|
||||
{"Latn", 1}, // Latn
|
||||
{"Lepc", 67}, // Lepc
|
||||
{"Limb", 45}, // Limb
|
||||
{"Linb", 47}, // Linb
|
||||
{"Lisu", 82}, // Lisu
|
||||
{"Lyci", 73}, // Lyci
|
||||
{"Lydi", 75}, // Lydi
|
||||
{"Mand", 94}, // Mand
|
||||
{"Merc", 96}, // Merc
|
||||
{"Mero", 97}, // Mero
|
||||
{"Mlym", 17}, // Mlym
|
||||
{"Mong", 31}, // Mong
|
||||
{"Mtei", 85}, // Mtei
|
||||
{"Mymr", 22}, // Mymr
|
||||
{"Nkoo", 65}, // Nkoo
|
||||
{"Ogam", 28}, // Ogam
|
||||
{"Olck", 68}, // Olck
|
||||
{"Orkh", 90}, // Orkh
|
||||
{"Orya", 13}, // Orya
|
||||
{"Osma", 50}, // Osma
|
||||
{"Phag", 64}, // Phag
|
||||
{"Phli", 89}, // Phli
|
||||
{"Phnx", 63}, // Phnx
|
||||
{"Plrd", 98}, // Plrd
|
||||
{"Prti", 88}, // Prti
|
||||
{"Rjng", 72}, // Rjng
|
||||
{"Runr", 29}, // Runr
|
||||
{"Samr", 81}, // Samr
|
||||
{"Sarb", 87}, // Sarb
|
||||
{"Saur", 70}, // Saur
|
||||
{"Shaw", 49}, // Shaw
|
||||
{"Shrd", 99}, // Shrd
|
||||
{"Sinh", 18}, // Sinh
|
||||
{"Sora", 100}, // Sora
|
||||
{"Sund", 66}, // Sund
|
||||
{"Sylo", 58}, // Sylo
|
||||
{"Syrc", 7}, // Syrc
|
||||
{"Tagb", 44}, // Tagb
|
||||
{"Takr", 101}, // Takr
|
||||
{"Tale", 46}, // Tale
|
||||
{"Talu", 55}, // Talu
|
||||
{"Taml", 14}, // Taml
|
||||
{"Tavt", 78}, // Tavt
|
||||
{"Telu", 15}, // Telu
|
||||
{"Tfng", 57}, // Tfng
|
||||
{"Tglg", 41}, // Tglg
|
||||
{"Thaa", 8}, // Thaa
|
||||
{"Thai", 19}, // Thai
|
||||
{"Tibt", 21}, // Tibt
|
||||
{"Ugar", 48}, // Ugar
|
||||
{"Vaii", 69}, // Vaii
|
||||
{"Xpeo", 59}, // Xpeo
|
||||
{"Xsux", 62}, // Xsux
|
||||
{"Yiii", 36}, // Yiii
|
||||
{"Zinh", 40}, // Zinh
|
||||
{"Zyyy", 0}, // Zyyy
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
142
Telegram/ThirdParty/cld3/src/script_span/generated_ulscript.h
vendored
Normal file
142
Telegram/ThirdParty/cld3/src/script_span/generated_ulscript.h
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_ulscript.h
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for scripts recognized by CLD2
|
||||
//
|
||||
|
||||
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
||||
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Category assigned to each script by the kULScriptToRtype table.
// Enumerators number upward from RTypeNone = 0.
enum ULScriptRType {
  RTypeNone = 0,
  RTypeOne,
  RTypeMany,
  RTypeCJK
};
|
||||
|
||||
// One row of a string-keyed lookup table (see kNameToULScript and
// kCodeToULScript): a key string and the integer it maps to.
struct CharIntPair {
  const char* s;  // key
  int i;          // value associated with the key
};
|
||||
|
||||
// Script numbers. The value of each enumerator is the subscript used by the
// kULScriptTo* tables, and the trailing comment is the four-letter code.
// 32, 33 and 35 are placeholders with no code. NUM_ULSCRIPTS follows the last
// script and therefore equals the number of entries (102).
typedef enum {
  ULScript_Common = 0,                   // Zyyy
  ULScript_Latin = 1,                    // Latn
  ULScript_Greek = 2,                    // Grek
  ULScript_Cyrillic = 3,                 // Cyrl
  ULScript_Armenian = 4,                 // Armn
  ULScript_Hebrew = 5,                   // Hebr
  ULScript_Arabic = 6,                   // Arab
  ULScript_Syriac = 7,                   // Syrc
  ULScript_Thaana = 8,                   // Thaa
  ULScript_Devanagari = 9,               // Deva
  ULScript_Bengali = 10,                 // Beng
  ULScript_Gurmukhi = 11,                // Guru
  ULScript_Gujarati = 12,                // Gujr
  ULScript_Oriya = 13,                   // Orya
  ULScript_Tamil = 14,                   // Taml
  ULScript_Telugu = 15,                  // Telu
  ULScript_Kannada = 16,                 // Knda
  ULScript_Malayalam = 17,               // Mlym
  ULScript_Sinhala = 18,                 // Sinh
  ULScript_Thai = 19,                    // Thai
  ULScript_Lao = 20,                     // Laoo
  ULScript_Tibetan = 21,                 // Tibt
  ULScript_Myanmar = 22,                 // Mymr
  ULScript_Georgian = 23,                // Geor
  ULScript_Hani = 24,                    // Hani
  ULScript_Ethiopic = 25,                // Ethi
  ULScript_Cherokee = 26,                // Cher
  ULScript_Canadian_Aboriginal = 27,     // Cans
  ULScript_Ogham = 28,                   // Ogam
  ULScript_Runic = 29,                   // Runr
  ULScript_Khmer = 30,                   // Khmr
  ULScript_Mongolian = 31,               // Mong
  ULScript_32 = 32,                      //
  ULScript_33 = 33,                      //
  ULScript_Bopomofo = 34,                // Bopo
  ULScript_35 = 35,                      //
  ULScript_Yi = 36,                      // Yiii
  ULScript_Old_Italic = 37,              // Ital
  ULScript_Gothic = 38,                  // Goth
  ULScript_Deseret = 39,                 // Dsrt
  ULScript_Inherited = 40,               // Zinh
  ULScript_Tagalog = 41,                 // Tglg
  ULScript_Hanunoo = 42,                 // Hano
  ULScript_Buhid = 43,                   // Buhd
  ULScript_Tagbanwa = 44,                // Tagb
  ULScript_Limbu = 45,                   // Limb
  ULScript_Tai_Le = 46,                  // Tale
  ULScript_Linear_B = 47,                // Linb
  ULScript_Ugaritic = 48,                // Ugar
  ULScript_Shavian = 49,                 // Shaw
  ULScript_Osmanya = 50,                 // Osma
  ULScript_Cypriot = 51,                 // Cprt
  ULScript_Braille = 52,                 // Brai
  ULScript_Buginese = 53,                // Bugi
  ULScript_Coptic = 54,                  // Copt
  ULScript_New_Tai_Lue = 55,             // Talu
  ULScript_Glagolitic = 56,              // Glag
  ULScript_Tifinagh = 57,                // Tfng
  ULScript_Syloti_Nagri = 58,            // Sylo
  ULScript_Old_Persian = 59,             // Xpeo
  ULScript_Kharoshthi = 60,              // Khar
  ULScript_Balinese = 61,                // Bali
  ULScript_Cuneiform = 62,               // Xsux
  ULScript_Phoenician = 63,              // Phnx
  ULScript_Phags_Pa = 64,                // Phag
  ULScript_Nko = 65,                     // Nkoo
  ULScript_Sundanese = 66,               // Sund
  ULScript_Lepcha = 67,                  // Lepc
  ULScript_Ol_Chiki = 68,                // Olck
  ULScript_Vai = 69,                     // Vaii
  ULScript_Saurashtra = 70,              // Saur
  ULScript_Kayah_Li = 71,                // Kali
  ULScript_Rejang = 72,                  // Rjng
  ULScript_Lycian = 73,                  // Lyci
  ULScript_Carian = 74,                  // Cari
  ULScript_Lydian = 75,                  // Lydi
  ULScript_Cham = 76,                    // Cham
  ULScript_Tai_Tham = 77,                // Lana
  ULScript_Tai_Viet = 78,                // Tavt
  ULScript_Avestan = 79,                 // Avst
  ULScript_Egyptian_Hieroglyphs = 80,    // Egyp
  ULScript_Samaritan = 81,               // Samr
  ULScript_Lisu = 82,                    // Lisu
  ULScript_Bamum = 83,                   // Bamu
  ULScript_Javanese = 84,                // Java
  ULScript_Meetei_Mayek = 85,            // Mtei
  ULScript_Imperial_Aramaic = 86,        // Armi
  ULScript_Old_South_Arabian = 87,       // Sarb
  ULScript_Inscriptional_Parthian = 88,  // Prti
  ULScript_Inscriptional_Pahlavi = 89,   // Phli
  ULScript_Old_Turkic = 90,              // Orkh
  ULScript_Kaithi = 91,                  // Kthi
  ULScript_Batak = 92,                   // Batk
  ULScript_Brahmi = 93,                  // Brah
  ULScript_Mandaic = 94,                 // Mand
  ULScript_Chakma = 95,                  // Cakm
  ULScript_Meroitic_Cursive = 96,        // Merc
  ULScript_Meroitic_Hieroglyphs = 97,    // Mero
  ULScript_Miao = 98,                    // Plrd
  ULScript_Sharada = 99,                 // Shrd
  ULScript_Sora_Sompeng = 100,           // Sora
  ULScript_Takri = 101,                  // Takr
  NUM_ULSCRIPTS
} ULScript;
|
||||
|
||||
// Alias for ULScript_Common (value 0).
#define UNKNOWN_ULSCRIPT ULScript_Common
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
||||
1107
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan.cc
vendored
Normal file
1107
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
124
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan.h
vendored
Normal file
124
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan.h
vendored
Normal file
@@ -0,0 +1,124 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
|
||||
#ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
||||
#define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
||||
|
||||
#include "generated_ulscript.h"
|
||||
#include "integral_types.h"
|
||||
#include "offsetmap.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Sizing constants for the script-span buffers, in bytes.
// NOTE(review): presumably these size ScriptScanner's script_buffer_ and
// script_buffer_lower_; the allocations are not visible in this header.
static const int kMaxScriptBuffer = 40960;
// 1.5x the plain buffer.
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
// Leave some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;
// Stop at word space in last N bytes of script buffer.
static const int kWithinScriptTail = 32;
|
||||
|
||||
struct LangSpan {
|
||||
char* text = nullptr; // Pointer to the span, somewhere
|
||||
int text_bytes = 0; // Number of bytes of text in the span
|
||||
int offset = 0; // Offset of start of span in original input buffer
|
||||
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
||||
bool truncated = false; // true if buffer filled up before a
|
||||
// different script or EOF was found
|
||||
};
|
||||
|
||||
// Returns true when c is a UTF-8 continuation byte (0x80..0xBF).
// Viewed as a signed char those bytes are -128..-65, so one comparison against
// -64 covers the whole range and excludes both ASCII and lead bytes.
static inline bool IsContinuationByte(char c) {
  return static_cast<signed char>(c) < -64;
}
|
||||
|
||||
// Gets lscript number for letters; always returns
|
||||
// 0 (common script) for non-letters
|
||||
int GetUTF8LetterScriptNum(const char* src);
|
||||
|
||||
// Update src pointer to point to next quadgram, +2..+5
|
||||
// Looks at src[0..4]
|
||||
const char* AdvanceQuad(const char* src);
|
||||
|
||||
// Utility routine to search alphabetical tables
|
||||
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
|
||||
|
||||
// Returns the length in bytes of the prefix of src that is all
|
||||
// interchange valid UTF-8
|
||||
int SpanInterchangeValid(const char* src, int byte_length);
|
||||
|
||||
class ScriptScanner {
|
||||
public:
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
|
||||
bool any_text, bool any_script);
|
||||
~ScriptScanner();
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
bool GetOneScriptSpan(LangSpan* span);
|
||||
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
void LowerScriptSpan(LangSpan* span);
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
bool GetOneScriptSpanLower(LangSpan* span);
|
||||
|
||||
// Copy next run of non-tag characters to buffer [NUL terminated]
|
||||
// This just removes tags and removes entities
|
||||
// Buffer has leading space
|
||||
bool GetOneTextSpan(LangSpan* span);
|
||||
|
||||
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
||||
// span->text [0..text_bytes] into an additional byte offset from
|
||||
// span->offset, to get back to corresponding text in the original
|
||||
// input buffer.
|
||||
// text_offset must be the first byte
|
||||
// of a UTF-8 character, or just beyond the last character. Normally this
|
||||
// routine is called with the first byte of an interesting range and
|
||||
// again with the first byte of the following range.
|
||||
int MapBack(int text_offset);
|
||||
|
||||
const char* GetBufferStart() {return start_byte_;}
|
||||
|
||||
private:
|
||||
// Skip over tags and non-letters
|
||||
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
||||
|
||||
const char* start_byte_; // Starting byte of buffer to scan
|
||||
const char* next_byte_; // First unscanned byte
|
||||
int byte_length_; // Bytes left
|
||||
|
||||
bool is_plain_text_; // true fo text, false for HTML
|
||||
char* script_buffer_; // Holds text with expanded entities
|
||||
char* script_buffer_lower_; // Holds lowercased text
|
||||
bool letters_marks_only_; // To distinguish scriptspan of one
|
||||
// letters/marks vs. any mixture of text
|
||||
bool one_script_only_; // To distinguish scriptspan of one
|
||||
// script vs. any mixture of scripts
|
||||
int exit_state_; // For tag parser kTagParseTbl_0, based
|
||||
// on letters_marks_only_
|
||||
public :
|
||||
// Expose for debugging
|
||||
OffsetMap map2original_; // map from script_buffer_ to buffer
|
||||
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
||||
135
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan_test.cc
vendored
Normal file
135
Telegram/ThirdParty/cld3/src/script_span/getonescriptspan_test.cc
vendored
Normal file
@@ -0,0 +1,135 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "getonescriptspan.h"

#include <iostream>
#include <string>
#include <vector>
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
namespace getonescriptspan_test {
|
||||
|
||||
// Tests invalid and interchange-invalid input. Returns "true" if the test is
|
||||
// successful and "false" otherwise.
|
||||
bool TestInvalidUTF8Input() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
const std::vector<std::string> invalid_strings{"\xC0\xA9",
|
||||
"\377\377\377\377"};
|
||||
const std::string gold_valid_prefix = "Some valid bytes followed by ";
|
||||
|
||||
// Iterates over the invalid strings, inserts each of them in the middle of a
|
||||
// piece of text, and checks whether these strings are correctly identified.
|
||||
bool test_successful = true;
|
||||
for (size_t i = 0; i < invalid_strings.size(); ++i) {
|
||||
const std::string text = "Some valid bytes followed by " +
|
||||
invalid_strings.at(i) +
|
||||
" and then valid ones again.";
|
||||
|
||||
const int num_valid_bytes = SpanInterchangeValid(text.c_str(), text.size());
|
||||
const std::string detected_valid_prefix(text.c_str(), num_valid_bytes);
|
||||
std::cout << " Testing input string at position " << i << std::endl;
|
||||
if (detected_valid_prefix == gold_valid_prefix) {
|
||||
std::cout << " Success!" << std::endl;
|
||||
} else {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Gold: " << gold_valid_prefix << std::endl;
|
||||
std::cout << " Detected: " << detected_valid_prefix << std::endl;
|
||||
test_successful = false;
|
||||
}
|
||||
}
|
||||
return test_successful;
|
||||
}
|
||||
|
||||
// Tests whether different scripts are correctly detected. Returns "true" if the
|
||||
// test is successful and "false" otherwise.
|
||||
bool TestScriptDetection() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// Text containing a snippet in English, a snippet in Bulgarian, and a snippet
|
||||
// in English again.
|
||||
const std::string text =
|
||||
"Text in English. Текст на Български. Also text in English.";
|
||||
const std::vector<std::string> gold_script_spans{
|
||||
" Text in English ", " Текст на Български ", " Also text in English "};
|
||||
|
||||
std::vector<std::string> detected_script_spans;
|
||||
ScriptScanner ss(text.c_str(), text.size(), /*is_plain_text=*/true);
|
||||
LangSpan script_span;
|
||||
while (ss.GetOneScriptSpan(&script_span)) {
|
||||
detected_script_spans.emplace_back(script_span.text,
|
||||
script_span.text_bytes);
|
||||
}
|
||||
|
||||
if (detected_script_spans.size() != gold_script_spans.size()) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Number of gold spans " << gold_script_spans.size()
|
||||
<< std::endl;
|
||||
std::cout << " Number of detected spans " << detected_script_spans.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < detected_script_spans.size(); ++i) {
|
||||
if (detected_script_spans.at(i) != gold_script_spans.at(i)) {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Gold span: " << gold_script_spans.at(i) << std::endl;
|
||||
std::cout << " Detected span: " << detected_script_spans.at(i)
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::cout << " Success!" << std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Tests the case when the input string is truncated in such a way that a
|
||||
// character is split in two pieces. Returns "true" if the test is successful
|
||||
// and "false" otherwise.
|
||||
bool TestStringCut() {
|
||||
std::cout << "Running " << __FUNCTION__ << std::endl;
|
||||
|
||||
// Text in Bulgarian (Cyrillic script).
|
||||
const std::string text = "Текст на Български";
|
||||
|
||||
// The size of the first two words ("Текст на ") is 16, and size of the first
|
||||
// two words plus the first char of the third word ("Текст на Б") is 18, so a
|
||||
// threshold of 17 results in slicing the first char of the third word.
|
||||
const int first_two_words_size = 16;
|
||||
const int span_size = 17;
|
||||
const int num_valid_bytes = SpanInterchangeValid(text.c_str(), span_size);
|
||||
if (num_valid_bytes == first_two_words_size) {
|
||||
std::cout << " Success!" << std::endl;
|
||||
return true;
|
||||
} else {
|
||||
std::cout << " Failure" << std::endl;
|
||||
std::cout << " Size of gold interchange-valid span: "
|
||||
<< first_two_words_size << std::endl;
|
||||
std::cout << " Size of detected span: " << num_valid_bytes << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace getonescriptspan_test
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
// Runs the functions above.
|
||||
int main(int argc, char **argv) {
|
||||
const bool tests_successful =
|
||||
chrome_lang_id::CLD2::getonescriptspan_test::TestInvalidUTF8Input() &&
|
||||
chrome_lang_id::CLD2::getonescriptspan_test::TestScriptDetection() &&
|
||||
chrome_lang_id::CLD2::getonescriptspan_test::TestStringCut();
|
||||
return tests_successful ? 0 : 1;
|
||||
}
|
||||
37
Telegram/ThirdParty/cld3/src/script_span/integral_types.h
vendored
Normal file
37
Telegram/ThirdParty/cld3/src/script_span/integral_types.h
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
||||
#define SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
||||
|
||||
// Cheap version
|
||||
namespace chrome_lang_id {
namespace CLD2 {

// Unsigned integer aliases, named after their intended width in bits. The
// widths are whatever the underlying built-in types provide on the target.
using uint8 = unsigned char;
using uint16 = unsigned short;
using uint32 = unsigned int;
using uint64 = unsigned long long int;

// Signed counterparts of the aliases above.
using int8 = signed char;
using int16 = signed short;
using int32 = signed int;
using int64 = signed long long int;

// char32 shares its representation with int32.
using char32 = int32;

}  // End namespace CLD2
}  // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
||||
478
Telegram/ThirdParty/cld3/src/script_span/offsetmap.cc
vendored
Normal file
478
Telegram/ThirdParty/cld3/src/script_span/offsetmap.cc
vendored
Normal file
@@ -0,0 +1,478 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
//
|
||||
|
||||
#include "offsetmap.h"
|
||||
|
||||
#include <string.h> // for strcmp
|
||||
#include <algorithm> // for min
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Constructor, destructor
|
||||
OffsetMap::OffsetMap() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
OffsetMap::~OffsetMap() {
|
||||
}
|
||||
|
||||
// Clear the map
|
||||
// After:
|
||||
// next_diff_sub_ is 0
|
||||
// Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1]
|
||||
// which is a fake range of width 0 mapping 0=>0
|
||||
void OffsetMap::Clear() {
|
||||
diffs_.clear();
|
||||
pending_op_ = COPY_OP;
|
||||
pending_length_ = 0;
|
||||
next_diff_sub_ = 0;
|
||||
current_lo_aoffset_ = 0;
|
||||
current_hi_aoffset_ = 0;
|
||||
current_lo_aprimeoffset_ = 0;
|
||||
current_hi_aprimeoffset_ = 0;
|
||||
current_diff_ = 0;
|
||||
max_aoffset_ = 0; // Largest seen so far
|
||||
max_aprimeoffset_ = 0; // Largest seen so far
|
||||
}
|
||||
|
||||
// Returns the two high-order bits (bits 6 and 7) of c as a value in 0..3.
static inline char OpPart(const char c) {
  const int shifted = c >> 6;
  const int op_bits = shifted & 3;  // Drops any sign-extension bits
  return op_bits;
}
|
||||
// Returns the six low-order bits of c as a value in 0..63.
static inline char LenPart(const char c) {
  const int length_bits = c & 0x3f;
  return length_bits;
}
|
||||
|
||||
// Reset to offset 0
|
||||
void OffsetMap::Reset() {
|
||||
MaybeFlushAll();
|
||||
|
||||
next_diff_sub_ = 0;
|
||||
current_lo_aoffset_ = 0;
|
||||
current_hi_aoffset_ = 0;
|
||||
current_lo_aprimeoffset_ = 0;
|
||||
current_hi_aprimeoffset_ = 0;
|
||||
current_diff_ = 0;
|
||||
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// identical in A and A'
|
||||
void OffsetMap::Copy(int bytes) {
|
||||
if (bytes == 0) {return;}
|
||||
max_aoffset_ += bytes; // Largest seen so far
|
||||
max_aprimeoffset_ += bytes; // Largest seen so far
|
||||
if (pending_op_ == COPY_OP) {
|
||||
pending_length_ += bytes;
|
||||
} else {
|
||||
Flush();
|
||||
pending_op_ = COPY_OP;
|
||||
pending_length_ = bytes;
|
||||
}
|
||||
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// inserted in A' while not advancing in A at all
|
||||
void OffsetMap::Insert(int bytes){
|
||||
if (bytes == 0) {return;}
|
||||
max_aprimeoffset_ += bytes; // Largest seen so far
|
||||
if (pending_op_ == INSERT_OP) {
|
||||
pending_length_ += bytes;
|
||||
} else if ((bytes == 1) &&
|
||||
(pending_op_ == DELETE_OP) && (pending_length_ == 1)) {
|
||||
// Special-case exactly delete(1) insert(1) +> copy(1);
|
||||
// all others backmap inserts to after deletes
|
||||
pending_op_ = COPY_OP;
|
||||
} else {
|
||||
Flush();
|
||||
pending_op_ = INSERT_OP;
|
||||
pending_length_ = bytes;
|
||||
}
|
||||
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// deleted from A while not advancing in A' at all
|
||||
void OffsetMap::Delete(int bytes){
|
||||
if (bytes == 0) {return;}
|
||||
max_aoffset_ += bytes; // Largest seen so far
|
||||
if (pending_op_ == DELETE_OP) {
|
||||
pending_length_ += bytes;
|
||||
} else if ((bytes == 1) &&
|
||||
(pending_op_ == INSERT_OP) && (pending_length_ == 1)) {
|
||||
// Special-case exactly insert(1) delete(1) => copy(1);
|
||||
// all others backmap deletes to after insertss
|
||||
pending_op_ = COPY_OP;
|
||||
} else {
|
||||
Flush();
|
||||
pending_op_ = DELETE_OP;
|
||||
pending_length_ = bytes;
|
||||
}
|
||||
}
|
||||
|
||||
void OffsetMap::Flush() {
|
||||
if (pending_length_ == 0) {
|
||||
return;
|
||||
}
|
||||
// We may be emitting a copy op just after a copy op because +1 -1 cancelled
|
||||
// inbetween. If the lengths don't need a prefix byte, combine them
|
||||
if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
|
||||
char c = diffs_[diffs_.size() - 1];
|
||||
MapOp prior_op = static_cast<MapOp>(OpPart(c));
|
||||
int prior_len = LenPart(c);
|
||||
if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
|
||||
diffs_[diffs_.size() - 1] += pending_length_;
|
||||
pending_length_ = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (pending_length_ > 0x3f) {
|
||||
bool non_zero_emitted = false;
|
||||
for (int shift = 30; shift > 0; shift -= 6) {
|
||||
int prefix = (pending_length_ >> shift) & 0x3f;
|
||||
if ((prefix > 0) || non_zero_emitted) {
|
||||
Emit(PREFIX_OP, prefix);
|
||||
non_zero_emitted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
Emit(pending_op_, pending_length_ & 0x3f);
|
||||
pending_length_ = 0;
|
||||
}
|
||||
|
||||
|
||||
// Add one more entry to copy one byte off the end, then flush
|
||||
void OffsetMap::FlushAll() {
|
||||
Copy(1);
|
||||
Flush();
|
||||
}
|
||||
|
||||
// Flush all if necessary
|
||||
void OffsetMap::MaybeFlushAll() {
|
||||
if ((0 < pending_length_) || diffs_.empty()) {
|
||||
FlushAll();
|
||||
}
|
||||
}
|
||||
|
||||
// Len may be 0, for example as the low piece of length=64
|
||||
void OffsetMap::Emit(MapOp op, int len) {
|
||||
char c = (static_cast<char>(op) << 6) | (len & 0x3f);
|
||||
diffs_.push_back(c);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// The guts of the 2013 design //
|
||||
// If there are three ranges a b c in diffs_, we can be in one of five //
|
||||
// states: LEFT of a, in ranges a b c, or RIGHT of c //
|
||||
// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
|
||||
// position next_diff_sub_ //
|
||||
// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
|
||||
// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
|
||||
// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
|
||||
// next_diff_sub_=diffs_.size() //
|
||||
// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
|
||||
// correspond to each other. If range i is active, next_diff_sub_ is at //
|
||||
// the first byte of range i+1. Because of the length-prefix operator, //
|
||||
// an individual range item in diffs_ may be multiple bytes //
|
||||
// In all cases aprimeoffset = aoffset + current_diff_ //
|
||||
// i.e. current_diff_ = aprimeoffset - aoffset //
|
||||
// //
|
||||
// In the degenerate case of diffs_.empty(), there are only two states //
|
||||
// LEFT and RIGHT and the mapping is the identity mapping. //
|
||||
// The initial state is LEFT. //
|
||||
// It is an error to move left into LEFT or right into RIGHT, but the code //
|
||||
// below is robust in these cases. //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Puts the map in the LEFT state: both windows are the empty range [0, 0),
// the diff is 0, and parsing restarts at the first byte of diffs_.
void OffsetMap::SetLeft() {
  next_diff_sub_ = 0;
  current_lo_aoffset_ = current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = 0;
  current_diff_ = 0;
}
|
||||
|
||||
// Puts the map in the RIGHT state: both windows are empty ranges located at
// the largest offsets seen so far, and the diff is the difference between
// those two offsets.
void OffsetMap::SetRight() {
  current_lo_aoffset_ = current_hi_aoffset_ = max_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = max_aprimeoffset_;
  current_diff_ = max_aprimeoffset_ - max_aoffset_;
  // NOTE(review): the design comment in this file describes RIGHT as having
  // next_diff_sub_ == diffs_.size(), but the subscript is reset to 0 here.
  // Confirm which one is intended before relying on either.
  next_diff_sub_ = 0;
}
|
||||
|
||||
// Back up over previous range, 1..5 bytes
|
||||
// Return subscript at the beginning of that. Pins at 0
|
||||
int OffsetMap::Backup(int sub) {
|
||||
if (sub <= 0) {return 0;}
|
||||
--sub;
|
||||
while ((0 < sub) &&
|
||||
(static_cast<MapOp>(OpPart(diffs_[sub - 1]) == PREFIX_OP))) {
|
||||
--sub;
|
||||
}
|
||||
return sub;
|
||||
}
|
||||
|
||||
// Parse next range, 1..5 bytes
|
||||
// Return subscript just off the end of that
|
||||
int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
|
||||
*op = PREFIX_OP;
|
||||
*length = 0;
|
||||
char c;
|
||||
while ((sub < static_cast<int>(diffs_.size())) && (*op == PREFIX_OP)) {
|
||||
c = diffs_[sub++];
|
||||
*op = static_cast<MapOp>(OpPart(c));
|
||||
int len = LenPart(c);
|
||||
*length = (*length << 6) + len;
|
||||
}
|
||||
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
||||
// Mal-formed can include a trailing prefix byte with no following op
|
||||
return sub;
|
||||
}
|
||||
|
||||
// Parse previous range, 1..5 bytes
|
||||
// Return current subscript
|
||||
int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
|
||||
sub = Backup(sub);
|
||||
return ParseNext(sub, op, length);
|
||||
}
|
||||
|
||||
// Move active window one range to the right
|
||||
// Return true if move was OK
|
||||
bool OffsetMap::MoveRight() {
|
||||
// If at last range or RIGHT, set to RIGHT, return error
|
||||
if (next_diff_sub_ >= static_cast<int>(diffs_.size())) {
|
||||
SetRight();
|
||||
return false;
|
||||
}
|
||||
// Actually OK to move right
|
||||
MapOp op;
|
||||
int length;
|
||||
bool retval = true;
|
||||
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
||||
next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);
|
||||
|
||||
current_lo_aoffset_ = current_hi_aoffset_;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_;
|
||||
if (op == COPY_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
||||
} else if (op == INSERT_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + 0;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
||||
} else if (op == DELETE_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + 0;
|
||||
} else {
|
||||
SetRight();
|
||||
retval = false;
|
||||
}
|
||||
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Move active window one range to the left
|
||||
// Return true if move was OK
|
||||
bool OffsetMap::MoveLeft() {
|
||||
// If at first range or LEFT, set to LEFT, return error
|
||||
if (next_diff_sub_ <= 0) {
|
||||
SetLeft();
|
||||
return false;
|
||||
}
|
||||
// Back up over current active window
|
||||
next_diff_sub_ = Backup(next_diff_sub_);
|
||||
if (next_diff_sub_ <= 0) {
|
||||
SetLeft();
|
||||
return false;
|
||||
}
|
||||
// Actually OK to move left
|
||||
MapOp op;
|
||||
int length;
|
||||
|
||||
// TODO(abakalov): 'retval' below is set but not used, which is suspicious.
|
||||
// Did the authors mean to return this variable, analogously to MoveRight()?
|
||||
// bool retval = true;
|
||||
// If mal-formed or in LEFT, this will return with op = PREFIX_OP
|
||||
next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);
|
||||
|
||||
current_hi_aoffset_ = current_lo_aoffset_;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
|
||||
if (op == COPY_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
||||
} else if (op == INSERT_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - 0;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
||||
} else if (op == DELETE_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
|
||||
} else {
|
||||
SetLeft();
|
||||
// retval = false;
|
||||
}
|
||||
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Map an offset in A' to the corresponding offset in A
|
||||
int OffsetMap::MapBack(int aprimeoffset){
|
||||
MaybeFlushAll();
|
||||
if (aprimeoffset < 0) {return 0;}
|
||||
if (max_aprimeoffset_ <= aprimeoffset) {
|
||||
return (aprimeoffset - max_aprimeoffset_) + max_aoffset_;
|
||||
}
|
||||
|
||||
// If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_,
|
||||
// use current mapping, else move window left/right
|
||||
bool ok = true;
|
||||
while (ok && (aprimeoffset < current_lo_aprimeoffset_)) {
|
||||
ok = MoveLeft();
|
||||
}
|
||||
while (ok && (current_hi_aprimeoffset_ <= aprimeoffset)) {
|
||||
ok = MoveRight();
|
||||
}
|
||||
// So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_
|
||||
|
||||
int aoffset = aprimeoffset - current_diff_;
|
||||
if (aoffset >= current_hi_aoffset_) {
|
||||
// A' is in an insert region, all bytes of which backmap to A=hi_aoffset_
|
||||
aoffset = current_hi_aoffset_;
|
||||
}
|
||||
return aoffset;
|
||||
}
|
||||
|
||||
// Map an offset in A to the corresponding offset in A'
|
||||
int OffsetMap::MapForward(int aoffset){
|
||||
MaybeFlushAll();
|
||||
if (aoffset < 0) {return 0;}
|
||||
if (max_aoffset_ <= aoffset) {
|
||||
return (aoffset - max_aoffset_) + max_aprimeoffset_;
|
||||
}
|
||||
|
||||
// If current_lo_aoffset_ <= aoffset < current_hi_aoffset_,
|
||||
// use current mapping, else move window left/right
|
||||
bool ok = true;
|
||||
while (ok && (aoffset < current_lo_aoffset_)) {
|
||||
ok = MoveLeft();
|
||||
}
|
||||
while (ok && (current_hi_aoffset_ <= aoffset)) {
|
||||
ok = MoveRight();
|
||||
}
|
||||
|
||||
int aprimeoffset = aoffset + current_diff_;
|
||||
if (aprimeoffset >= current_hi_aprimeoffset_) {
|
||||
// A is in a delete region, all bytes of which map to A'=hi_aprimeoffset_
|
||||
aprimeoffset = current_hi_aprimeoffset_;
|
||||
}
|
||||
return aprimeoffset;
|
||||
}
|
||||
|
||||
|
||||
// static
|
||||
bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
|
||||
bool ok = true;
|
||||
while (ok && (source->next_diff_sub_ !=
|
||||
static_cast<int>(source->diffs_.size()))) {
|
||||
ok = source->MoveRight();
|
||||
if (source->current_lo_aoffset_ != source->current_hi_aoffset_) {
|
||||
return false;
|
||||
}
|
||||
dest->Insert(
|
||||
source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// static
|
||||
bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
|
||||
bool ok = true;
|
||||
while (ok && (source->next_diff_sub_ !=
|
||||
static_cast<int>(source->diffs_.size()))) {
|
||||
ok = source->MoveRight();
|
||||
if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) {
|
||||
return false;
|
||||
}
|
||||
dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// static
|
||||
void OffsetMap::ComposeOffsetMap(
|
||||
OffsetMap* g, OffsetMap* f, OffsetMap* h) {
|
||||
h->Clear();
|
||||
f->Reset();
|
||||
g->Reset();
|
||||
|
||||
int lo = 0;
|
||||
for (;;) {
|
||||
// Consume delete operations in f. This moves A without moving
|
||||
// A' and A''.
|
||||
if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
|
||||
if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
|
||||
// fprintf(stderr,
|
||||
// "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
|
||||
}
|
||||
|
||||
// FlushAll(), called by Reset(), MapForward() or MapBack(), has
|
||||
// added an extra COPY_OP to f and g, so this function has
|
||||
// composed an extra COPY_OP in h from those. To avoid
|
||||
// FlushAll() adds one more extra COPY_OP to h later, dispatch
|
||||
// Flush() right now.
|
||||
h->Flush();
|
||||
return;
|
||||
}
|
||||
|
||||
// Consume insert operations in g. This moves A'' without moving A
|
||||
// and A'.
|
||||
if (lo >= f->current_hi_aprimeoffset_) {
|
||||
if (!CopyDeletes(f, h)) {
|
||||
// fprintf(stderr,
|
||||
// "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Compose one operation which moves A' from lo to hi.
|
||||
int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
|
||||
if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
|
||||
g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
||||
h->Copy(hi - lo);
|
||||
} else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
|
||||
h->Delete(hi - lo);
|
||||
} else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
||||
h->Insert(hi - lo);
|
||||
}
|
||||
|
||||
lo = hi;
|
||||
}
|
||||
}
|
||||
|
||||
// For testing only -- force a mapping
|
||||
void OffsetMap::StuffIt(const std::string& diffs,
|
||||
int max_aoffset, int max_aprimeoffset) {
|
||||
Clear();
|
||||
diffs_ = diffs;
|
||||
max_aoffset_ = max_aoffset;
|
||||
max_aprimeoffset_ = max_aprimeoffset;
|
||||
}
|
||||
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
168
Telegram/ThirdParty/cld3/src/script_span/offsetmap.h
vendored
Normal file
168
Telegram/ThirdParty/cld3/src/script_span/offsetmap.h
vendored
Normal file
@@ -0,0 +1,168 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef SCRIPT_SPAN_OFFSETMAP_H_
|
||||
#define SCRIPT_SPAN_OFFSETMAP_H_
|
||||
|
||||
#include <string> // for string
|
||||
|
||||
#include "integral_types.h" // for uint32
|
||||
|
||||
// ***************************** OffsetMap **************************
|
||||
//
|
||||
// An OffsetMap object is a container for a mapping from offsets in one text
|
||||
// buffer A' to offsets in another text buffer A. It is most useful when A' is
|
||||
// built from A via substitutions that occasionally do not preserve byte length.
|
||||
//
|
||||
// A series of operators are used to build the correspondence map, then
|
||||
// calls can be made to map an offset in A' to an offset in A, or vice versa.
|
||||
// The map starts with offset 0 in A corresponding to offset 0 in A'.
|
||||
// The mapping is then built sequentially, adding on byte ranges that are
|
||||
// identical in A and A', byte ranges that are inserted in A', and byte ranges
|
||||
// that are deleted from A. All bytes beyond those specified when building the
|
||||
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
|
||||
// end of the map.
|
||||
//
|
||||
// The internal data structure records positions at which bytes are added or
|
||||
// deleted. Using the map is O(1) when increasing the A' or A offset
|
||||
// monotonically, and O(n) when accessing random offsets, where n is the
|
||||
// number of differences.
|
||||
//
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
class OffsetMap {
|
||||
public:
|
||||
// Constructor, destructor
|
||||
OffsetMap();
|
||||
~OffsetMap();
|
||||
|
||||
// Clear the map
|
||||
void Clear();
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes correspond
|
||||
// in A and A'
|
||||
void Copy(int bytes);
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// inserted in A' while not advancing in A at all
|
||||
void Insert(int bytes);
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// deleted from A while not advancing in A' at all
|
||||
void Delete(int bytes);
|
||||
|
||||
// [Finish building map,] Re-position to offset 0
|
||||
// This call is optional; MapForward and MapBack finish building the map
|
||||
// if necessary
|
||||
void Reset();
|
||||
|
||||
// Map an offset in A' to the corresponding offset in A
|
||||
int MapBack(int aprimeoffset);
|
||||
|
||||
// Map an offset in A to the corresponding offset in A'
|
||||
int MapForward(int aoffset);
|
||||
|
||||
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
|
||||
// from A' to A'' and h is from A to A''.
|
||||
//
|
||||
// Note that g->MoveForward(f->MoveForward(aoffset)) always equals
|
||||
// to h->MoveForward(aoffset), while
|
||||
// f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
|
||||
// to h->MoveBack(aprimeprimeoffset). This happens when deletion in
|
||||
// f and insertion in g are at the same place. For example,
|
||||
//
|
||||
// A 1 2 3 4
|
||||
// ^ | ^ ^
|
||||
// | | / | f
|
||||
// v vv v
|
||||
// A' 1' 2' 3'
|
||||
// ^ ^^ ^
|
||||
// | | \ | g
|
||||
// v | v v
|
||||
// A'' 1'' 2'' 3'' 4''
|
||||
//
|
||||
// results in:
|
||||
//
|
||||
// A 1 2 3 4
|
||||
// ^ ^\ ^ ^
|
||||
// | | \ | | h
|
||||
// v | vv v
|
||||
// A'' 1'' 2'' 3'' 4''
|
||||
//
|
||||
// 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
|
||||
// the latter figure.
|
||||
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
|
||||
|
||||
// For testing only -- force a mapping
|
||||
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
|
||||
|
||||
private:
|
||||
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
|
||||
|
||||
void Flush();
|
||||
void FlushAll();
|
||||
void MaybeFlushAll();
|
||||
void Emit(MapOp op, int len);
|
||||
|
||||
void SetLeft();
|
||||
void SetRight();
|
||||
|
||||
// Back up over previous range, 1..5 bytes
|
||||
// Return subscript at the beginning of that. Pins at 0
|
||||
int Backup(int sub);
|
||||
|
||||
// Parse next range, 1..5 bytes
|
||||
// Return subscript just off the end of that
|
||||
int ParseNext(int sub, MapOp* op, int* length);
|
||||
|
||||
// Parse previous range, 1..5 bytes
|
||||
// Return current subscript
|
||||
int ParsePrevious(int sub, MapOp* op, int* length);
|
||||
|
||||
bool MoveRight(); // Returns true if OK
|
||||
bool MoveLeft(); // Returns true if OK
|
||||
|
||||
// Copies insert operations from source to dest. Returns true if no
|
||||
// other operations are found.
|
||||
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
|
||||
|
||||
// Copies delete operations from source to dest. Returns true if no other
|
||||
// operations are found.
|
||||
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
|
||||
|
||||
std::string diffs_;
|
||||
MapOp pending_op_;
|
||||
uint32 pending_length_;
|
||||
|
||||
// Offsets in the ranges below correspond to each other, with A' = A + diff
|
||||
int next_diff_sub_;
|
||||
int current_lo_aoffset_;
|
||||
int current_hi_aoffset_;
|
||||
int current_lo_aprimeoffset_;
|
||||
int current_hi_aprimeoffset_;
|
||||
int current_diff_;
|
||||
int max_aoffset_;
|
||||
int max_aprimeoffset_;
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_OFFSETMAP_H_
|
||||
143
Telegram/ThirdParty/cld3/src/script_span/port.h
vendored
Normal file
143
Telegram/ThirdParty/cld3/src/script_span/port.h
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// These are weird things we need to do to get this compiling on
|
||||
// random systems [subset].
|
||||
|
||||
#ifndef SCRIPT_SPAN_PORT_H_
|
||||
#define SCRIPT_SPAN_PORT_H_
|
||||
|
||||
#include <string.h> // for memcpy()
|
||||
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Portable handling of unaligned loads, stores, and copies.
|
||||
// On some platforms, like ARM, the copy functions can be more efficient
|
||||
// than a load and a store.
|
||||
|
||||
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
|
||||
|
||||
// x86 and x86-64 can perform unaligned loads/stores directly;
|
||||
// modern PowerPC hardware can also do unaligned integer loads and stores;
|
||||
// but note: the FPU still sends unaligned loads and stores to a trap handler!
|
||||
|
||||
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
||||
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
||||
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
|
||||
|
||||
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
|
||||
|
||||
#elif defined(__arm__) && \
|
||||
!defined(__ARM_ARCH_5__) && \
|
||||
!defined(__ARM_ARCH_5T__) && \
|
||||
!defined(__ARM_ARCH_5TE__) && \
|
||||
!defined(__ARM_ARCH_5TEJ__) && \
|
||||
!defined(__ARM_ARCH_6__) && \
|
||||
!defined(__ARM_ARCH_6J__) && \
|
||||
!defined(__ARM_ARCH_6K__) && \
|
||||
!defined(__ARM_ARCH_6Z__) && \
|
||||
!defined(__ARM_ARCH_6ZK__) && \
|
||||
!defined(__ARM_ARCH_6T2__) && \
|
||||
!defined(__ARM_ARCH_7__) && \
|
||||
!defined(__ARM_ARCH_7A__) && \
|
||||
!defined(__ARM_ARCH_7M__) && \
|
||||
!defined(__ARM_ARCH_7R__) && \
|
||||
!defined(__ARM_ARCH_8__) && \
|
||||
!defined(__ARM_ARCH_8A__)
|
||||
|
||||
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
|
||||
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
|
||||
// do an unaligned read and rotate the words around a bit, or do the reads very
|
||||
// slowly (trip through kernel mode). There's no simple #define that says just
|
||||
// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
|
||||
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_ALIGNED #define,
|
||||
// so in time, maybe we can move on to that.
|
||||
//
|
||||
// Note that even if a chipset supports unaligned access, it might not be
|
||||
// enabled in any given system, e.g.:
|
||||
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
|
||||
// Therefore, it's generally just not safe to allow unaligned access on any ARM
|
||||
// variant.
|
||||
//
|
||||
// This is a mess, but there's not much we can do about it.
|
||||
|
||||
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
||||
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
||||
|
||||
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
||||
|
||||
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
|
||||
// See if that would be more efficient on platforms supporting it,
|
||||
// at least for copies.
|
||||
|
||||
// Loads a uint64 from p by copying its bytes into a local, so p does not
// have to be aligned for a direct 64-bit access.
inline uint64 UNALIGNED_LOAD64(const void *p) {
  uint64 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
|
||||
|
||||
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define NEED_ALIGNED_LOADS
|
||||
|
||||
// These functions are provided for architectures that don't support
|
||||
// unaligned loads and stores.
|
||||
|
||||
// Loads a uint16 from p by copying its bytes into a local, so p does not
// have to be aligned for a direct 16-bit access.
inline uint16 UNALIGNED_LOAD16(const void *p) {
  uint16 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
|
||||
|
||||
// Loads a uint32 from p by copying its bytes into a local, so p does not
// have to be aligned for a direct 32-bit access.
inline uint32 UNALIGNED_LOAD32(const void *p) {
  uint32 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
|
||||
|
||||
// Loads a uint64 from p by copying its bytes into a local, so p does not
// have to be aligned for a direct 64-bit access.
inline uint64 UNALIGNED_LOAD64(const void *p) {
  uint64 value;
  memcpy(&value, p, sizeof(value));
  return value;
}
|
||||
|
||||
inline void UNALIGNED_STORE16(void *p, uint16 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE32(void *p, uint32 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_PORT_H_
|
||||
81
Telegram/ThirdParty/cld3/src/script_span/stringpiece.h
vendored
Normal file
81
Telegram/ThirdParty/cld3/src/script_span/stringpiece.h
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// A StringPiece points to part or all of a string, double-quoted string
|
||||
// literal, or other string-like object. A StringPiece does *not* own the
|
||||
// string to which it points. A StringPiece is not null-terminated. [subset]
|
||||
//
|
||||
|
||||
#ifndef SCRIPT_SPAN_STRINGPIECE_H_
|
||||
#define SCRIPT_SPAN_STRINGPIECE_H_
|
||||
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Signed integer type used for StringPiece lengths and byte counts.
typedef int stringpiece_ssize_type;

// A (pointer, length) view onto character data held elsewhere. The class
// never allocates, copies or frees the bytes it refers to, and the viewed
// range is not required to be NUL-terminated.
class StringPiece {
 public:
  // The single-argument constructors are deliberately not explicit, so a
  // "const char*" or a "string" can be passed wherever a "StringPiece" is
  // expected.

  // Empty piece: NULL data pointer, zero length.
  StringPiece() : ptr_(NULL), length_(0) {}

  // Views a NUL-terminated C string. A NULL pointer yields a zero-length
  // piece whose data() is NULL.
  StringPiece(const char* str)  // NOLINT(runtime/explicit)
      : ptr_(str),
        length_(str == NULL
                    ? 0
                    : static_cast<stringpiece_ssize_type>(strlen(str))) {}

  // Views the full contents of str; the size is narrowed to
  // stringpiece_ssize_type.
  StringPiece(const std::string& str)  // NOLINT(runtime/explicit)
      : ptr_(str.data()),
        length_(static_cast<stringpiece_ssize_type>(str.size())) {}

  // Views len bytes starting at offset.
  StringPiece(const char* offset, stringpiece_ssize_type len)
      : ptr_(offset), length_(len) {}

  // Drops the first n bytes from the view. n is not range-checked.
  void remove_prefix(stringpiece_ssize_type n) {
    ptr_ = ptr_ + n;
    length_ = length_ - n;
  }

  // Drops the last n bytes from the view. n is not range-checked.
  void remove_suffix(stringpiece_ssize_type n) {
    length_ = length_ - n;
  }

  // The buffer returned by data() may contain embedded NULs and may or may
  // not be NUL-terminated, so handing it to a routine that expects a C string
  // is typically a mistake.
  const char* data() const { return ptr_; }

  // size() and length() both report the number of bytes in the view.
  stringpiece_ssize_type size() const { return length_; }
  stringpiece_ssize_type length() const { return length_; }

  // True when the view covers zero bytes.
  bool empty() const { return 0 == length_; }

 private:
  const char* ptr_;                // First byte of the view (may be NULL).
  stringpiece_ssize_type length_;  // Number of bytes in the view.
};
|
||||
|
||||
class StringPiece;
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_STRINGPIECE_H_
|
||||
245
Telegram/ThirdParty/cld3/src/script_span/text_processing.cc
vendored
Normal file
245
Telegram/ThirdParty/cld3/src/script_span/text_processing.cc
vendored
Normal file
@@ -0,0 +1,245 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "text_processing.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
namespace {
|
||||
|
||||
static const int kMaxSpaceScan = 32; // Bytes
|
||||
|
||||
// Returns the smaller of a and b.
int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
|
||||
|
||||
// Counts the ASCII spaces in src, examining four bytes per step.
// Only the leading (src_len & ~3) bytes are inspected; up to three trailing
// bytes are ignored.
int CountSpaces4(const char *src, int src_len) {
  const int whole_groups_len = src_len & ~3;
  int spaces = 0;
  int pos = 0;
  while (pos < whole_groups_len) {
    const char *group = src + pos;
    spaces += (group[0] == ' ');
    spaces += (group[1] == ' ');
    spaces += (group[2] == ' ');
    spaces += (group[3] == ' ');
    pos += 4;
  }
  return spaces;
}
|
||||
|
||||
// Measures how repetitive a run of text is, using a cheap predictor as a
// stand-in for compression. Prediction works on whole UTF-8 characters rather
// than bytes, because three-byte UTF-8 text (Indic, etc.) looks highly
// compressible all the time under a byte-based count.
//
// To allow prediction to continue across several chunks, the caller owns the
// state and passes it in:
//   hash  in/out running 12-bit hash; the caller initializes it to 0.
//   tbl   int[4096] prediction table; the caller initializes it to all 0.
//
// isrc/src_len give the bytes to scan.
//
// Returns the number of *bytes* that were correctly predicted; each correctly
// predicted character adds its length (1..4).
//
// A multi-byte character whose lead byte announces more bytes than remain
// before isrc + src_len is shortened to the bytes that are available, so
// nothing at or beyond isrc + src_len is read. Input made of complete
// characters is processed exactly as before.
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) {
  typedef unsigned char uint8;

  int p_count = 0;
  const uint8 *src = reinterpret_cast<const uint8 *>(isrc);
  const uint8 *srclimit = src + src_len;
  // Masking keeps the first table index inside the 4096 entries even if the
  // caller hands in a value outside 0..4095.
  int local_hash = *hash & 0xfff;

  while (src < srclimit) {
    const uint8 lead = src[0];

    // Character length implied by the lead byte. Bytes below 0xc0 (one-byte
    // characters and stray continuation bytes) are taken on their own.
    int incr;
    if (lead < 0xc0) {
      incr = 1;
    } else if ((lead & 0xe0) == 0xc0) {
      incr = 2;
    } else if ((lead & 0xf0) == 0xe0) {
      incr = 3;
    } else {
      incr = 4;
    }

    // Never read past the end of the caller's range.
    const int avail = static_cast<int>(srclimit - src);
    if (incr > avail) {
      incr = avail;
    }

    // Pack the character's bytes big-endian into one value. Unsigned
    // arithmetic avoids shifting into the sign bit for four-byte characters.
    unsigned int packed = lead;
    for (int k = 1; k < incr; ++k) {
      packed = (packed << 8) | src[k];
    }
    const int c = static_cast<int>(packed);
    src += incr;

    const int p = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;            // Update prediction
    if (c == p) {
      p_count += incr;              // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
|
||||
|
||||
// Backscan to word boundary, returning how many bytes n to go back
|
||||
// so that src - n is non-space ans src - n - 1 is space.
|
||||
// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
|
||||
int BackscanToSpace(const char *src, int limit) {
|
||||
int n = 0;
|
||||
limit = minint(limit, kMaxSpaceScan);
|
||||
while (n < limit) {
|
||||
if (src[-n - 1] == ' ') {
|
||||
return n;
|
||||
} // We are at _X
|
||||
++n;
|
||||
}
|
||||
n = 0;
|
||||
while (n < limit) {
|
||||
if ((src[-n] & 0xc0) != 0x80) {
|
||||
return n;
|
||||
} // We are at char begin
|
||||
++n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Forwardscan to word boundary, returning how many bytes n to go forward
|
||||
// so that src + n is non-space ans src + n - 1 is space.
|
||||
// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
|
||||
int ForwardscanToSpace(const char *src, int limit) {
|
||||
int n = 0;
|
||||
limit = minint(limit, kMaxSpaceScan);
|
||||
while (n < limit) {
|
||||
if (src[n] == ' ') {
|
||||
return n + 1;
|
||||
} // We are at _X
|
||||
++n;
|
||||
}
|
||||
n = 0;
|
||||
while (n < limit) {
|
||||
if ((src[n] & 0xc0) != 0x80) {
|
||||
return n;
|
||||
} // We are at char begin
|
||||
++n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Tuning parameters for CheapSqueezeInplace.

// Number of entries in the prediction table. Must be exactly 4096 for the
// cheap compressor, whose hash is masked to 12 bits.
static const int kPredictionTableSize = 4096;

// Squeeze 48-byte chunks by default.
static const int kChunksizeDefault = 48;

// Squeeze a chunk if >= 30% of it is spaces.
static const int kSpacesThreshPercent = 30;

// Squeeze a chunk if >= 40% of it was predicted.
static const int kPredictThreshPercent = 40;
|
||||
|
||||
// Remove portions of text that have a high density of spaces, or that are
|
||||
// overly repetitive, squeezing the remaining text in-place to the front of the
|
||||
// input buffer.
|
||||
//
|
||||
// Squeezing looks at density of space/prediced chars in fixed-size chunks,
|
||||
// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
|
||||
//
|
||||
// Return the new, possibly-shorter length
|
||||
//
|
||||
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
|
||||
// if input does
|
||||
//
|
||||
int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) {
|
||||
char *src = isrc;
|
||||
char *dst = src;
|
||||
char *srclimit = src + src_len;
|
||||
bool skipping = false;
|
||||
|
||||
int hash = 0;
|
||||
|
||||
// Allocate local prediction table.
|
||||
int *predict_tbl = new int[kPredictionTableSize];
|
||||
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
|
||||
|
||||
int chunksize = ichunksize;
|
||||
if (chunksize == 0) {
|
||||
chunksize = kChunksizeDefault;
|
||||
}
|
||||
int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
|
||||
int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
|
||||
|
||||
while (src < srclimit) {
|
||||
int remaining_bytes = srclimit - src;
|
||||
int len = minint(chunksize, remaining_bytes);
|
||||
|
||||
// Make len land us on a UTF-8 character boundary.
|
||||
// Ah. Also fixes mispredict because we could get out of phase
|
||||
// Loop always terminates at trailing space in buffer
|
||||
while ((src[len] & 0xc0) == 0x80) {
|
||||
++len;
|
||||
} // Move past continuation bytes
|
||||
|
||||
int space_n = CountSpaces4(src, len);
|
||||
int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
|
||||
if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
|
||||
// Skip the text
|
||||
if (!skipping) {
|
||||
// Keeping-to-skipping transition; do it at a space
|
||||
int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
|
||||
dst -= n;
|
||||
if (dst == isrc) {
|
||||
// Force a leading space if the first chunk is deleted
|
||||
*dst++ = ' ';
|
||||
}
|
||||
skipping = true;
|
||||
}
|
||||
} else {
|
||||
// Keep the text
|
||||
if (skipping) {
|
||||
// Skipping-to-keeping transition; do it at a space
|
||||
int n = ForwardscanToSpace(src, len);
|
||||
src += n;
|
||||
remaining_bytes -= n; // Shrink remaining length
|
||||
len -= n;
|
||||
skipping = false;
|
||||
}
|
||||
|
||||
// "len" can be negative in some cases
|
||||
if (len > 0) {
|
||||
memmove(dst, src, len);
|
||||
dst += len;
|
||||
}
|
||||
}
|
||||
src += len;
|
||||
}
|
||||
|
||||
if ((dst - isrc) < (src_len - 3)) {
|
||||
// Pad and make last char clean UTF-8 by putting following spaces
|
||||
dst[0] = ' ';
|
||||
dst[1] = ' ';
|
||||
dst[2] = ' ';
|
||||
dst[3] = '\0';
|
||||
} else if ((dst - isrc) < src_len) {
|
||||
// Make last char clean UTF-8 by putting following space off the end
|
||||
dst[0] = ' ';
|
||||
}
|
||||
|
||||
// Deallocate local prediction table
|
||||
delete[] predict_tbl;
|
||||
return static_cast<int>(dst - isrc);
|
||||
}
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
30
Telegram/ThirdParty/cld3/src/script_span/text_processing.h
vendored
Normal file
30
Telegram/ThirdParty/cld3/src/script_span/text_processing.h
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef SCRIPT_SPAN_TEXT_PROCESSING_H_
|
||||
#define SCRIPT_SPAN_TEXT_PROCESSING_H_
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
// Removes portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in-place to the front
// of the input buffer.
//
//   isrc        buffer to squeeze; modified in place.
//   src_len     number of text bytes in isrc.
//   ichunksize  size in bytes of the chunks that are examined; 0 selects the
//               default chunk size.
//
// Returns the new, possibly-shorter length.
int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize);
|
||||
|
||||
} // namespace CLD2
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_TEXT_PROCESSING_H_
|
||||
486
Telegram/ThirdParty/cld3/src/script_span/utf8acceptinterchange.h
vendored
Normal file
486
Telegram/ThirdParty/cld3/src/script_span/utf8acceptinterchange.h
vendored
Normal file
@@ -0,0 +1,486 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Created by utf8tablebuilder version 2.9
|
||||
//
|
||||
// Rejects all codes that are not interchange-valid
|
||||
// Accepts all other UTF-8 codes 0000..10FFFF
|
||||
// Exit optimized -- exits after four times in state 0
|
||||
// All bytes are checked for structurally valid UTF-8
|
||||
// Table entries are absolute statetable subscripts
|
||||
|
||||
#ifndef SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_
|
||||
#define SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
#define X__ (kExitIllegalStructure)
|
||||
#define RJ_ (kExitReject)
|
||||
#define S1_ (kExitReplace1)
|
||||
#define S2_ (kExitReplace2)
|
||||
#define S3_ (kExitReplace3)
|
||||
#define S21 (kExitReplace21)
|
||||
#define S31 (kExitReplace31)
|
||||
#define S32 (kExitReplace32)
|
||||
#define T1_ (kExitReplaceOffset1)
|
||||
#define T2_ (kExitReplaceOffset2)
|
||||
#define S11 (kExitReplace1S0)
|
||||
#define SP_ (kExitSpecial)
|
||||
#define D__ (kExitDoAgain)
|
||||
#define RJA (kExitRejectAlt)
|
||||
|
||||
// Entire table has 17 state blocks of 256 entries each
|
||||
|
||||
static const unsigned int utf8acceptinterchange_STATE0 = 0; // state[0]
|
||||
static const unsigned int utf8acceptinterchange_STATE0_SIZE = 1024; // =[4]
|
||||
static const unsigned int utf8acceptinterchange_TOTAL_SIZE = 4352;
|
||||
static const unsigned int utf8acceptinterchange_MAX_EXPAND_X4 = 0;
|
||||
static const unsigned int utf8acceptinterchange_SHIFT = 8;
|
||||
static const unsigned int utf8acceptinterchange_BYTES = 1;
|
||||
static const unsigned int utf8acceptinterchange_LOSUB = 0x20202020;
|
||||
static const unsigned int utf8acceptinterchange_HIADD = 0x01010101;
|
||||
|
||||
static const uint8 utf8acceptinterchange[] = {
|
||||
// state[0] 0x000000 Byte 1
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 1, 1,RJ_, 1, 1,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
|
||||
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[1] 0x000000 Byte 1
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 2, 2,RJ_, 2, 2,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
|
||||
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[2] 0x000000 Byte 1
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_, 3, 3,RJ_, 3, 3,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
|
||||
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[3] 0x000000 Byte 1
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,D__,D__,RJ_,D__,D__,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
|
||||
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,D__,
|
||||
D__,D__,D__,D__,D__,D__,D__,D__, D__,D__,D__,D__,D__,D__,D__,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 10,
|
||||
13, 15, 15, 15, 16,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[4] 0x0000c0 Byte 2 of 2
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[5] 0x000000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[6] 0x001000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[7] 0x000080 Byte 2 of 2
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[8] 0x00d000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[9] 0x00d800 Byte 3 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[10] 0x00f000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 11, 4, 4, 4, 4, 4, 4, 4, 12,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[11] 0x00fdc0 Byte 3 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_, RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,RJ_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[12] 0x00ffc0 Byte 3 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,RJ_,RJ_,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[13] 0x000000 Byte 2 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[14] 0x01f000 Byte 3 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[15] 0x040000 Byte 2 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[16] 0x100000 Byte 2 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 14,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
};
|
||||
|
||||
// Remap base[0] = (del, add, string_offset)
|
||||
static const RemapEntry utf8acceptinterchange_remap_base[] = {
|
||||
{0,0,0} };
|
||||
|
||||
// Remap string[0]
|
||||
static const unsigned char utf8acceptinterchange_remap_string[] = {
|
||||
0 };
|
||||
|
||||
static const unsigned char utf8acceptinterchange_fast[256] = {
|
||||
1,1,1,1,1,1,1,1, 1,0,0,1,0,0,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,1,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
};
|
||||
|
||||
static const UTF8ScanObj utf8acceptinterchange_obj = {
|
||||
utf8acceptinterchange_STATE0,
|
||||
utf8acceptinterchange_STATE0_SIZE,
|
||||
utf8acceptinterchange_TOTAL_SIZE,
|
||||
utf8acceptinterchange_MAX_EXPAND_X4,
|
||||
utf8acceptinterchange_SHIFT,
|
||||
utf8acceptinterchange_BYTES,
|
||||
utf8acceptinterchange_LOSUB,
|
||||
utf8acceptinterchange_HIADD,
|
||||
utf8acceptinterchange,
|
||||
utf8acceptinterchange_remap_base,
|
||||
utf8acceptinterchange_remap_string,
|
||||
utf8acceptinterchange_fast
|
||||
};
|
||||
|
||||
|
||||
#undef X__
|
||||
#undef RJ_
|
||||
#undef S1_
|
||||
#undef S2_
|
||||
#undef S3_
|
||||
#undef S21
|
||||
#undef S31
|
||||
#undef S32
|
||||
#undef T1_
|
||||
#undef T2_
|
||||
#undef S11
|
||||
#undef SP_
|
||||
#undef D__
|
||||
#undef RJA
|
||||
|
||||
// Table has 4608 bytes, Hash = 505C-3D29
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_UTF8ACCEPTINTERCHANGE_H_
|
||||
1631
Telegram/ThirdParty/cld3/src/script_span/utf8prop_lettermarkscriptnum.h
vendored
Normal file
1631
Telegram/ThirdParty/cld3/src/script_span/utf8prop_lettermarkscriptnum.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
758
Telegram/ThirdParty/cld3/src/script_span/utf8repl_lettermarklower.h
vendored
Normal file
758
Telegram/ThirdParty/cld3/src/script_span/utf8repl_lettermarklower.h
vendored
Normal file
@@ -0,0 +1,758 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Created by utf8tablebuilder version 2.9
|
||||
//
|
||||
// Replaces all codes from file:
|
||||
// lettermarklower_6.2.0.txt
|
||||
// Accepts all other UTF-8 codes 0000..10FFFF
|
||||
// Space optimized
|
||||
//
|
||||
// ** ASSUMES INPUT IS STRUCTURALLY VALID UTF-8 **
|
||||
//
|
||||
// Table entries are absolute statetable subscripts
|
||||
|
||||
#ifndef SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_
|
||||
#define SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
#define X__ (kExitIllegalStructure)
|
||||
#define RJ_ (kExitReject)
|
||||
#define S1_ (kExitReplace1)
|
||||
#define S2_ (kExitReplace2)
|
||||
#define S3_ (kExitReplace3)
|
||||
#define S21 (kExitReplace21)
|
||||
#define S31 (kExitReplace31)
|
||||
#define S32 (kExitReplace32)
|
||||
#define T1_ (kExitReplaceOffset1)
|
||||
#define T2_ (kExitReplaceOffset2)
|
||||
#define S11 (kExitReplace1S0)
|
||||
#define SP_ (kExitSpecial)
|
||||
#define D__ (kExitDoAgain)
|
||||
#define RJA (kExitRejectAlt)
|
||||
|
||||
// Entire table has 111 state blocks of 64 entries each
|
||||
|
||||
static const unsigned int utf8repl_lettermarklower_STATE0 = 0; // state[0]
|
||||
static const unsigned int utf8repl_lettermarklower_STATE0_SIZE = 320; // =[5]
|
||||
static const unsigned int utf8repl_lettermarklower_TOTAL_SIZE = 7104;
|
||||
static const unsigned int utf8repl_lettermarklower_MAX_EXPAND_X4 = 12;
|
||||
static const unsigned int utf8repl_lettermarklower_SHIFT = 6;
|
||||
static const unsigned int utf8repl_lettermarklower_BYTES = 1;
|
||||
static const unsigned int utf8repl_lettermarklower_LOSUB = 0x5b5b5b5b;
|
||||
static const unsigned int utf8repl_lettermarklower_HIADD = 0x00000000;
|
||||
|
||||
static const uint8 utf8repl_lettermarklower[] = {
|
||||
// state[0] 0x000000 Byte 1
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11,S11,S11,S11,S11,S11,
|
||||
S11,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 6, 11, 13, 16, 19, 22, 25, 28, 6, 6, 6, 31, 33, 36,
|
||||
39, 42, 44, 46, 48, 51, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
7, 54, 74, 8, 8, 8, 8, 8, 8, 8, 88, 8, 8, 8, 8,100,
|
||||
104, 9, 9, 9, 10,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[6 + 2] 0x000080 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
// state[7 + 2] 0x000000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[8 + 2] 0x003000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[9 + 2] 0x040000 Byte 2 of 4
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
|
||||
// state[10 + 2] 0x100000 Byte 2 of 4
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[11 + 2] 0x0000c0 Byte 2 of 2
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0x00, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[13 + 2] 0x000100 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S21, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,S2_,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x69,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0x00,0xba,0x00,0xbc,0x00,0xbe,0x00,0x80,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xc5,
|
||||
|
||||
// state[16 + 2] 0x000140 Byte 2 of 2
|
||||
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S2_,S1_, 0,S1_, 0,S1_, 0, 0,
|
||||
|
||||
0x00,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xbf,0xba,0x00,0xbc,0x00,0xbe,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[19 + 2] 0x000180 Byte 2 of 2
|
||||
0,S2_,S1_, 0,S1_, 0,S2_,S1_, 0,S2_,S2_,S1_, 0, 0,S2_,S2_,
|
||||
S2_,S1_, 0,S2_,S2_, 0,S2_,S2_, S1_, 0, 0, 0,S2_,S2_, 0,S2_,
|
||||
S1_, 0,S1_, 0,S1_, 0,S2_,S1_, 0,S2_, 0, 0,S1_, 0,S2_,S1_,
|
||||
0,S2_,S2_,S1_, 0,S1_, 0,S2_, S1_, 0, 0, 0,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x93,0x83,0x00,0x85,0x00,0x94,0x88, 0x00,0x96,0x97,0x8c,0x00,0x00,0x9d,0x99,
|
||||
0x9b,0x92,0x00,0xa0,0xa3,0x00,0xa9,0xa8, 0x99,0x00,0x00,0x00,0xaf,0xb2,0x00,0xb5,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0x80,0xa8, 0x00,0x83,0x00,0x00,0xad,0x00,0x88,0xb0,
|
||||
0x00,0x8a,0x8b,0xb4,0x00,0xb6,0x00,0x92, 0xb9,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
|
||||
|
||||
0x00,0xc9,0x00,0x00,0x00,0x00,0xc9,0x00, 0x00,0xc9,0xc9,0x00,0x00,0x00,0xc7,0xc9,
|
||||
0xc9,0x00,0x00,0xc9,0xc9,0x00,0xc9,0xc9, 0x00,0x00,0x00,0x00,0xc9,0xc9,0x00,0xc9,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xca,0x00, 0x00,0xca,0x00,0x00,0x00,0x00,0xca,0x00,
|
||||
0x00,0xca,0xca,0x00,0x00,0x00,0x00,0xca, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[22 + 2] 0x0001c0 Byte 2 of 2
|
||||
0, 0, 0, 0,S1_,S1_, 0,S1_, S1_, 0,S1_,S1_, 0,S1_, 0,S1_,
|
||||
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0,S1_,S1_, 0,S1_, 0,S2_,S2_, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x86,0x86,0x00,0x89, 0x89,0x00,0x8c,0x8c,0x00,0x8e,0x00,0x90,
|
||||
0x00,0x92,0x00,0x94,0x00,0x96,0x00,0x98, 0x00,0x9a,0x00,0x9c,0x00,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0xb3,0xb3,0x00,0xb5,0x00,0x95,0xbf, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xc6,0xc6, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[25 + 2] 0x000200 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S2_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0,T1_,S1_, 0,S2_,T1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0x9e,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xbc,0x00,0x9a,0x01,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xc6,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc6,0x00,0x00,
|
||||
|
||||
// state[28 + 2] 0x000240 Byte 2 of 2
|
||||
0,S1_, 0,S2_,S2_,S2_,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x82,0x00,0x80,0x89,0x8c,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0xc6,0xca,0xca,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[31 + 2] 0x000340 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0x00,0x00,0xb7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[33 + 2] 0x000380 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0,S1_, 0, S1_,S1_,S1_, 0,S2_, 0,S2_,S2_,
|
||||
0,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_, 0,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xac,0x00, 0xad,0xae,0xaf,0x00,0x8c,0x00,0x8d,0x8e,
|
||||
0x00,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x00,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0xcf,0x00,0xcf,0xcf,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xcf,0xcf,0x00,0xcf,0xcf,0xcf,0xcf,0xcf, 0xcf,0xcf,0xcf,0xcf,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[36 + 2] 0x0003c0 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0,S2_, 0, 0,S1_, 0,S1_,S1_, 0, 0,S2_,S2_,S2_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0x00,0x00,0xb8,0x00,0x00,0xb8, 0x00,0xb2,0xbb,0x00,0x00,0xbb,0xbc,0xbd,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0xce,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xcd,0xcd,0xcd,
|
||||
|
||||
// state[39 + 2] 0x000400 Byte 2 of 2
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[42 + 2] 0x000440 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[44 + 2] 0x000480 Byte 2 of 2
|
||||
S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[46 + 2] 0x0004c0 Byte 2 of 2
|
||||
S1_,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x8f,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x8a,0x00,0x8c,0x00,0x8e,0x00,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[48 + 2] 0x000500 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5, 0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,
|
||||
|
||||
// state[51 + 2] 0x000540 Byte 2 of 2
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[54 + 2] 0x001000 Byte 2 of 3
|
||||
6, 6, 55, 57, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 59, 59, 61, 59, 64, 66, 68, 71,
|
||||
|
||||
// state[55 + 2] 0x001080 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
|
||||
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09, 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,
|
||||
0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19, 0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,
|
||||
|
||||
// state[57 + 2] 0x0010c0 Byte 3 of 3
|
||||
T1_,T1_,T1_,T1_,T1_,T1_, 0,T1_, 0, 0, 0, 0, 0,T1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x22,0x23,0x24,0x25,0x26,0x27,0x00,0x28, 0x00,0x00,0x00,0x00,0x00,0x29,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[59 + 2] 0x001e00 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[61 + 2] 0x001e80 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S32, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc3,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[64 + 2] 0x001f00 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
|
||||
|
||||
// state[66 + 2] 0x001f40 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x91,0x00,0x93,0x00,0x95,0x00,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[68 + 2] 0x001f80 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb0,0xb1,0xb3,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[71 + 2] 0x001fc0 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb2,0xb3,0xb4,0xb5,0x83,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0xb6,0xb7,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xba,0xbb,0xa5,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb8,0xb9,0xbc,0xbd,0xb3,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[74 + 2] 0x002000 Byte 2 of 3
|
||||
6, 6, 6, 6, 75, 6, 78, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
80, 83, 59, 86, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[75 + 2] 0x002100 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,S32, 0, 0, 0,S31,S32, 0, 0, 0, 0,
|
||||
0, 0,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x89,0x00, 0x00,0x00,0x6b,0xa5,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x8e,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x00, 0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x85,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[78 + 2] 0x002180 Byte 3 of 3
|
||||
0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[80 + 2] 0x002c00 Byte 3 of 3
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
|
||||
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[83 + 2] 0x002c40 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S32,T1_,S32, 0, 0,S1_, 0,S1_, 0,S1_, 0,S32,S32,S32,
|
||||
S32, 0,S1_, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S32,S32,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xab,0x2a,0xbd,0x00,0x00,0xa8, 0x00,0xaa,0x00,0xac,0x00,0x91,0xb1,0x90,
|
||||
0x92,0x00,0xb3,0x00,0x00,0xb6,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xbf,0x80,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0xc9,0x00,0xc9,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0xc9,0xc9,
|
||||
0xc9,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc8,0xc9,
|
||||
|
||||
// state[86 + 2] 0x002cc0 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0, 0,
|
||||
0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xac,0x00,0xae,0x00,0x00,
|
||||
0x00,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[88 + 2] 0x00a000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 89, 91, 6, 93, 95, 97, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[89 + 2] 0x00a640 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[91 + 2] 0x00a680 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[93 + 2] 0x00a700 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[95 + 2] 0x00a740 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,T1_,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0xba,0x00,0xbc,0x00,0x2b,0xbf,0x00,
|
||||
|
||||
// state[97 + 2] 0x00a780 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0,S1_, 0,S32, 0, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x00,0x00,0x00,0x8c,0x00,0xa5,0x00,0x00,
|
||||
0x91,0x00,0x93,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xa6,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xc9,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[100 + 2] 0x00f000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,101, 6, 6, 6,
|
||||
|
||||
// state[101 + 2] 0x00ff00 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,
|
||||
0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[104 + 2] 0x000000 Byte 2 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
105, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
|
||||
// state[105 + 2] 0x010000 Byte 3 of 4
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
106, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[106 + 2] 0x010400 Byte 4 of 4
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
|
||||
0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91,
|
||||
0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
};
|
||||
|
||||
// Remap base[44] = (del, add, string_offset)
// Each entry initializes a RemapEntry {delete_bytes, add_bytes, bytes_offset}:
// how many input bytes are removed, how many replacement bytes are added, and
// the offset of those replacement bytes inside
// utf8repl_lettermarklower_remap_string.
// The first two entries swap a 2-byte sequence for a 3-byte one; the other 42
// are 3-for-3. Offsets advance by 3 from 0 to 129.
|
||||
static const RemapEntry utf8repl_lettermarklower_remap_base[] = {
|
||||
{2,3, 0}, {2,3, 3}, {3,3, 6}, {3,3, 9},
|
||||
{3,3, 12}, {3,3, 15}, {3,3, 18}, {3,3, 21},
|
||||
{3,3, 24}, {3,3, 27}, {3,3, 30}, {3,3, 33},
|
||||
{3,3, 36}, {3,3, 39}, {3,3, 42}, {3,3, 45},
|
||||
|
||||
{3,3, 48}, {3,3, 51}, {3,3, 54}, {3,3, 57},
|
||||
{3,3, 60}, {3,3, 63}, {3,3, 66}, {3,3, 69},
|
||||
{3,3, 72}, {3,3, 75}, {3,3, 78}, {3,3, 81},
|
||||
{3,3, 84}, {3,3, 87}, {3,3, 90}, {3,3, 93},
|
||||
|
||||
{3,3, 96}, {3,3, 99}, {3,3, 102}, {3,3, 105},
|
||||
{3,3, 108}, {3,3, 111}, {3,3, 114}, {3,3, 117},
|
||||
{3,3, 120}, {3,3, 123}, {3,3, 126}, {3,3, 129},
|
||||
// All-zero 45th entry after the 44 real ones.
// NOTE(review): presumably an end-of-table marker -- confirm against the reader.
{0,0,0} };
|
||||
|
||||
// Remap string[132]
// Replacement bytes referenced by the bytes_offset values in
// utf8repl_lettermarklower_remap_base: 44 three-byte sequences stored back to
// back (offsets 0, 3, ..., 129), followed by one trailing 0 byte.
|
||||
static const unsigned char utf8repl_lettermarklower_remap_string[] = {
|
||||
0xe2,0xb1,0xa5,0xe2,0xb1,0xa6,0xe2,0xb4, 0x80,0xe2,0xb4,0x81,0xe2,0xb4,0x82,0xe2,
|
||||
0xb4,0x83,0xe2,0xb4,0x84,0xe2,0xb4,0x85, 0xe2,0xb4,0x86,0xe2,0xb4,0x87,0xe2,0xb4,
|
||||
0x88,0xe2,0xb4,0x89,0xe2,0xb4,0x8a,0xe2, 0xb4,0x8b,0xe2,0xb4,0x8c,0xe2,0xb4,0x8d,
|
||||
0xe2,0xb4,0x8e,0xe2,0xb4,0x8f,0xe2,0xb4, 0x90,0xe2,0xb4,0x91,0xe2,0xb4,0x92,0xe2,
|
||||
|
||||
0xb4,0x93,0xe2,0xb4,0x94,0xe2,0xb4,0x95, 0xe2,0xb4,0x96,0xe2,0xb4,0x97,0xe2,0xb4,
|
||||
0x98,0xe2,0xb4,0x99,0xe2,0xb4,0x9a,0xe2, 0xb4,0x9b,0xe2,0xb4,0x9c,0xe2,0xb4,0x9d,
|
||||
0xe2,0xb4,0x9e,0xe2,0xb4,0x9f,0xe2,0xb4, 0xa0,0xe2,0xb4,0xa1,0xe2,0xb4,0xa2,0xe2,
|
||||
0xb4,0xa3,0xe2,0xb4,0xa4,0xe2,0xb4,0xa5, 0xe2,0xb4,0xa7,0xe2,0xb4,0xad,0xe1,0xb5,
|
||||
|
||||
// Last sequence (offset 129) plus the terminating 0.
0xbd,0xe1,0xb5,0xb9,0 };
|
||||
|
||||
// Per-byte flag table with one entry for every byte value 0x00..0xFF, laid out
// 16 entries per row. Entries 0x41..0x5A (ASCII 'A'..'Z') and 0x80..0xFF are 1;
// every other entry is 0.
// NOTE(review): presumably 1 marks bytes the scanner cannot skip on its fast
// path -- confirm in utf8statetable.cc.
static const unsigned char utf8repl_lettermarklower_fast[256] = {
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
// 0x40..0x5F: '@' is 0, 'A'..'Z' are 1, 0x5B..0x5F are 0.
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
// 0x80..0xFF: all 1.
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
};
|
||||
|
||||
// Bundles the tables above into one UTF8ReplaceObj (an alias of
// UTF8StateMachineObj). The initializer is positional, so the order of the
// values must match the order of the struct's fields; the field each value
// lands in is noted on its line.
static const UTF8ReplaceObj utf8repl_lettermarklower_obj = {
|
||||
utf8repl_lettermarklower_STATE0,  // state0
|
||||
utf8repl_lettermarklower_STATE0_SIZE,  // state0_size
|
||||
utf8repl_lettermarklower_TOTAL_SIZE,  // total_size
|
||||
utf8repl_lettermarklower_MAX_EXPAND_X4,  // max_expand (NOTE(review): name suggests a x4 scaling -- confirm)
|
||||
utf8repl_lettermarklower_SHIFT,  // entry_shift
|
||||
utf8repl_lettermarklower_BYTES,  // bytes_per_entry
|
||||
utf8repl_lettermarklower_LOSUB,  // losub
|
||||
utf8repl_lettermarklower_HIADD,  // hiadd
|
||||
utf8repl_lettermarklower,  // state_table
|
||||
utf8repl_lettermarklower_remap_base,  // remap_base
|
||||
utf8repl_lettermarklower_remap_string,  // remap_string
|
||||
utf8repl_lettermarklower_fast  // fast_state
|
||||
};
|
||||
|
||||
|
||||
// Remove the short macro names (X__, S1_, S2_, S31, S32, T1_, ... appear as
// entries in the state table above) so they do not stay defined for code that
// follows this header.
#undef X__
|
||||
#undef RJ_
|
||||
#undef S1_
|
||||
#undef S2_
|
||||
#undef S3_
|
||||
#undef S21
|
||||
#undef S31
|
||||
#undef S32
|
||||
#undef T1_
|
||||
#undef T2_
|
||||
#undef S11
|
||||
#undef SP_
|
||||
#undef D__
|
||||
#undef RJA
|
||||
|
||||
// Table has 7668 bytes, Hash = 07A2-C4E3
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_UTF8REPL_LETTERMARKLOWER_H_
|
||||
1455
Telegram/ThirdParty/cld3/src/script_span/utf8scannot_lettermarkspecial.h
vendored
Normal file
1455
Telegram/ThirdParty/cld3/src/script_span/utf8scannot_lettermarkspecial.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1358
Telegram/ThirdParty/cld3/src/script_span/utf8statetable.cc
vendored
Normal file
1358
Telegram/ThirdParty/cld3/src/script_span/utf8statetable.cc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
285
Telegram/ThirdParty/cld3/src/script_span/utf8statetable.h
vendored
Normal file
285
Telegram/ThirdParty/cld3/src/script_span/utf8statetable.h
vendored
Normal file
@@ -0,0 +1,285 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// State Table follower for scanning UTF-8 strings without converting to
|
||||
// 32- or 16-bit Unicode values.
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef SCRIPT_SPAN_UTF8STATETABLE_H_
|
||||
#define SCRIPT_SPAN_UTF8STATETABLE_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "integral_types.h" // for uint8, uint32, uint16
|
||||
#include "stringpiece.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace CLD2 {
|
||||
|
||||
class OffsetMap;
|
||||
|
||||
|
||||
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
||||
// in making a string replacement, how many bytes to add 0..255, and the offset
|
||||
// 0..64k-1 of the replacement string in remap_string.
|
||||
struct RemapEntry {
|
||||
uint8 delete_bytes;
|
||||
uint8 add_bytes;
|
||||
uint16 bytes_offset;
|
||||
};
|
||||
|
||||
// Exit codes for state tables with one-byte entries.
// kExitDstSpaceFull is produced only by executable code; every other code is
// stored directly in a table entry. The codes form one contiguous run ending
// at kExitNone so that they can be told apart from next-state entries.
enum ExitReason {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
};
|
||||
|
||||
// Exit codes for state tables with two-byte entries. Same sequence as
// ExitReason, but starting at 32767 (0x7fff) and ending at kExitNone_2 = 32783
// (0x800f).
enum ExitReason_2 {
  kExitDstSpaceFull_2 = 32767,      // 0x7fff
  kExitIllegalStructure_2 = 32768,  // 0x8000
  kExitOK_2 = 32769,                // 0x8001
  kExitReject_2 = 32770,
  kExitReplace1_2 = 32771,
  kExitReplace2_2 = 32772,
  kExitReplace3_2 = 32773,
  kExitReplace21_2 = 32774,
  kExitReplace31_2 = 32775,
  kExitReplace32_2 = 32776,
  kExitReplaceOffset1_2 = 32777,
  kExitReplaceOffset2_2 = 32778,
  kExitReplace1S0_2 = 32779,
  kExitSpecial_2 = 32780,
  kExitDoAgain_2 = 32781,
  kExitRejectAlt_2 = 32782,
  kExitNone_2 = 32783               // 0x800f
};
|
||||
|
||||
|
||||
// This struct represents one entire state table. The three initialized byte
|
||||
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
||||
// give the byte offset and length within state_table of the initial state --
|
||||
// table lookups are expected to start and end in this state, but for
|
||||
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
||||
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
||||
// byte value and 6 for space-optimized tables subscripted by only six
|
||||
// significant bits in UTF-8 continuation bytes.
|
||||
typedef struct {
|
||||
const uint32 state0;
|
||||
const uint32 state0_size;
|
||||
const uint32 total_size;
|
||||
const int max_expand;
|
||||
const int entry_shift;
|
||||
const int bytes_per_entry;
|
||||
const uint32 losub;
|
||||
const uint32 hiadd;
|
||||
const uint8* state_table;
|
||||
const RemapEntry* remap_base;
|
||||
const uint8* remap_string;
|
||||
const uint8* fast_state;
|
||||
} UTF8StateMachineObj;
|
||||
|
||||
// Near-duplicate declaration for tables with two-byte entries
|
||||
typedef struct {
|
||||
const uint32 state0;
|
||||
const uint32 state0_size;
|
||||
const uint32 total_size;
|
||||
const int max_expand;
|
||||
const int entry_shift;
|
||||
const int bytes_per_entry;
|
||||
const uint32 losub;
|
||||
const uint32 hiadd;
|
||||
const unsigned short* state_table;
|
||||
const RemapEntry* remap_base;
|
||||
const uint8* remap_string;
|
||||
const uint8* fast_state;
|
||||
} UTF8StateMachineObj_2;
|
||||
|
||||
|
||||
typedef UTF8StateMachineObj UTF8PropObj;
|
||||
typedef UTF8StateMachineObj UTF8ScanObj;
|
||||
typedef UTF8StateMachineObj UTF8ReplaceObj;
|
||||
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
||||
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
|
||||
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
|
||||
|
||||
|
||||
// Looks up the property of one UTF-8 character using table st and advances
// over that character.
|
||||
// Returns 0 if the input length is zero.
|
||||
// Returns 0 and advances one byte if the input is ill-formed.
// src and srclen are in/out arguments (cursor and remaining length).
// NOTE(review): presumably *srclen shrinks by the bytes consumed -- confirm in
// utf8statetable.cc.
|
||||
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
// Looks up the property of one UTF-8 character, which is assumed to be valid;
// nothing is advanced and the result is reported as a bool.
|
||||
// (This is a faster version of UTF8GenericProperty.)
|
||||
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
|
||||
|
||||
|
||||
// BigOneByte versions are needed for tables > 240 states, but most
|
||||
// won't need the TwoByte versions.
|
||||
|
||||
// Looks up the property of one UTF-8 character and advances over it
// (BigOneByte table variant; same argument list as UTF8GenericProperty).
|
||||
// Returns 0 if the input length is zero.
|
||||
// Returns 0 and advances one byte if the input is ill-formed.
|
||||
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
|
||||
// TwoByte versions (declared after the function below) are needed for tables
|
||||
// > 240 states that don't fit onto BigOneByte -- rare ultimate fallback
|
||||
|
||||
// Looks up the property of one UTF-8 character (assumed to be valid).
|
||||
// (This is a faster version of UTF8GenericProperty.)
// NOTE(review): given the name, this is presumably the fast counterpart of
// UTF8GenericPropertyBigOneByte specifically -- confirm.
|
||||
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
|
||||
|
||||
// Looks up the property of one UTF-8 character and advances over it, using a
// table with two-byte entries (UTF8PropObj_2).
|
||||
// Returns 0 if the input length is zero.
|
||||
// Returns 0 and advances one byte if the input is ill-formed.
|
||||
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
// Looks up the property of one UTF-8 character (assumed to be valid), using a
// table with two-byte entries (UTF8PropObj_2).
|
||||
// (This is a faster version of UTF8GenericProperty.)
|
||||
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
|
||||
|
||||
// Scans the UTF-8 stringpiece str using state table st.
|
||||
// Always scans complete UTF-8 characters.
|
||||
// Sets *bytes_consumed to the number of bytes scanned and returns the reason
// for exiting.
// NOTE(review): presumably one of the ExitReason codes -- confirm.
|
||||
int UTF8GenericScan(const UTF8ScanObj* st,
|
||||
const StringPiece& str,
|
||||
int* bytes_consumed);
|
||||
|
||||
|
||||
|
||||
// Scans the UTF-8 stringpiece istr using state table st, copying to the
|
||||
// output stringpiece ostr and doing text replacements.
|
||||
// Always scans complete UTF-8 characters.
|
||||
// Sets the number of bytes consumed from input and the number filled to output.
|
||||
// Returns the reason for exiting.
// NOTE(review): presumably one of the ExitReason codes -- confirm.
|
||||
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed,
|
||||
OffsetMap* offsetmap);
|
||||
|
||||
// Older overload without the offsetmap parameter.
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
// Older overload without the is_plain_text or offsetmap parameters.
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
|
||||
// TwoByte version is needed for tables > about 256 states, such
|
||||
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
||||
|
||||
// Scans the UTF-8 stringpiece istr using a state table with two-byte entries,
|
||||
// copying to the output stringpiece ostr
|
||||
// and doing text replacements.
|
||||
// Always scans complete UTF-8 characters.
|
||||
// Sets the number of bytes consumed from input and the number filled to output.
|
||||
// Returns the reason for exiting.
// NOTE(review): presumably one of the ExitReason_2 codes -- confirm.
|
||||
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed,
|
||||
OffsetMap* offsetmap);
|
||||
|
||||
// Older overload without the offsetmap parameter.
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
// Older overload without the is_plain_text or offsetmap parameters.
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
|
||||
// Character length in bytes, indexed by the first byte of a UTF-8 character.
//   0x00..0xBF -> 1
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xFF -> 4
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  // 0x80..0xBF
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  // 0xE0..0xEF
  3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
  // 0xF0..0xFF
  4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
};
|
||||
|
||||
inline int UTF8OneCharLen(const char* in) {
|
||||
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
|
||||
}
|
||||
|
||||
// Adjusts the stringpiece *istr in place so that it encompasses only complete
// UTF-8 characters.
|
||||
// The data pointer will be increased by 0..3 bytes to get to a character
|
||||
// boundary, and the length will then be decreased by 0..3 bytes
|
||||
// to encompass the last complete character.
|
||||
// This is especially useful when a UTF-8 string must be put cleanly into a
|
||||
// buffer with a fixed maximum size, such as a MySQL buffer.
|
||||
void UTF8TrimToChars(StringPiece* istr);
|
||||
|
||||
} // End namespace CLD2
|
||||
} // End namespace chrome_lang_id
|
||||
|
||||
#endif // SCRIPT_SPAN_UTF8STATETABLE_H_
|
||||
77
Telegram/ThirdParty/cld3/src/sentence.proto
vendored
Normal file
77
Telegram/ThirdParty/cld3/src/sentence.proto
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Protocol buffer specification for sentence analysis.
|
||||
|
||||
syntax = "proto2";
|
||||
option optimize_for = LITE_RUNTIME;
|
||||
|
||||
package chrome_lang_id;
|
||||
|
||||
// A Sentence contains the raw text contents of a sentence, as well as an
|
||||
// analysis.
|
||||
message Sentence {
  // Sentence identifier.
  optional string id = 1;

  // The sentence's raw text.
  optional string text = 2;

  // Tokens the sentence has been split into.
  repeated Token token = 3;

  // Field numbers 1000 and above are available to extensions.
  extensions 1000 to max;
}
|
||||
|
||||
// A sentence token marks a span of bytes in the sentence text as a token
|
||||
// or word.
|
||||
message Token {
  // Word form of the token.
  required string word = 1;

  // Position in the sentence text at which the token starts.
  required int32 start = 2;

  // Position in the sentence text at which the token ends. This is the index
  // of the token's last byte (inclusive), not one past it. For tokens that
  // came from the lexer, trailing HTML tags are excluded.
  required int32 end = 3;

  // Dependency-tree head: the id of the token that has an arc going to this
  // one. The root token of a sentence uses -1.
  optional int32 head = 4 [default = -1];

  // Part-of-speech tag.
  optional string tag = 5;

  // Coarse-grained word category.
  optional string category = 6;

  // Label of the dependency relation between this token and its head.
  optional string label = 7;

  // Describes how a token was separated from the previous token in the text.
  enum BreakLevel {
    NO_BREAK = 0;        // No separation between tokens.
    SPACE_BREAK = 1;     // Tokens separated by space.
    LINE_BREAK = 2;      // Tokens separated by line break.
    SENTENCE_BREAK = 3;  // Tokens separated by sentence break.
  }

  optional BreakLevel break_level = 8 [default = SPACE_BREAK];

  // Field numbers 1000 and above are available to extensions.
  extensions 1000 to max;
}
|
||||
29
Telegram/ThirdParty/cld3/src/sentence_features.cc
vendored
Normal file
29
Telegram/ThirdParty/cld3/src/sentence_features.cc
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "sentence_features.h"
|
||||
|
||||
#include "registry.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Define registry for the whole Sentence feature functions. NOTE: this is not
|
||||
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
|
||||
// constructor, *before* we use any feature.
|
||||
template <>
|
||||
WholeSentenceFeature::Registry*
|
||||
RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
45
Telegram/ThirdParty/cld3/src/sentence_features.h
vendored
Normal file
45
Telegram/ThirdParty/cld3/src/sentence_features.h
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Features that operate on Sentence objects. Most features are defined
|
||||
// in this header so they may be re-used via composition into other more
|
||||
// advanced feature classes.
|
||||
|
||||
#ifndef SENTENCE_FEATURES_H_
|
||||
#define SENTENCE_FEATURES_H_
|
||||
|
||||
#include "feature_extractor.h"
|
||||
#include "cld_3/protos/sentence.pb.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Feature function that extracts features for the full Sentence.
|
||||
using WholeSentenceFeature = FeatureFunction<Sentence>;
|
||||
|
||||
using WholeSentenceExtractor = FeatureExtractor<Sentence>;
|
||||
|
||||
// Declare registry for the whole Sentence feature functions. This is required
|
||||
// for clang's -Wundefined-var-template. However, MSVC has a bug which treats
|
||||
// this declaration as a definition, leading to multiple definition errors, so
|
||||
// omit this on MSVC.
|
||||
#if !defined(COMPILER_MSVC)
|
||||
template <>
|
||||
WholeSentenceFeature::Registry
|
||||
*RegisterableClass<WholeSentenceFeature>::registry_;
|
||||
#endif
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SENTENCE_FEATURES_H_
|
||||
72
Telegram/ThirdParty/cld3/src/simple_adder.h
vendored
Normal file
72
Telegram/ThirdParty/cld3/src/simple_adder.h
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef SIMPLE_ADDER_H_
|
||||
#define SIMPLE_ADDER_H_
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Class for adding (possibly) scaled arrays.
|
||||
class SimpleAdder {
|
||||
public:
|
||||
static constexpr const int kNumFloatsPerBatch = 1;
|
||||
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE SimpleAdder(float *dest, int num_floats)
|
||||
: dest_(dest), num_floats_(num_floats) {}
|
||||
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE ~SimpleAdder() {
|
||||
// Should call Finalize function before destruction.
|
||||
CLD3_DCHECK(dest_ == nullptr);
|
||||
}
|
||||
|
||||
// Caller must call this function before calling deconstruct this object.
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE void Finalize() { dest_ = nullptr; }
|
||||
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyAdd(const float *source) const {
|
||||
AddImpl(source, num_floats_, dest_);
|
||||
}
|
||||
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE void LazyScaleAdd(const float *source,
|
||||
const float scale) const {
|
||||
ScaleAddImpl(source, num_floats_, scale, dest_);
|
||||
}
|
||||
|
||||
// Simple fast while loop to implement dest += source.
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE static void AddImpl(
|
||||
const float *__restrict source, uint32 size, float *__restrict dest) {
|
||||
for (uint32 i = 0; i < size; ++i) {
|
||||
dest[i] += source[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Simple fast while loop to implement dest += scale * source.
|
||||
CLD3_ATTRIBUTE_ALWAYS_INLINE static void ScaleAddImpl(
|
||||
const float *__restrict source, uint32 size, const float scale,
|
||||
float *__restrict dest) {
|
||||
for (uint32 i = 0; i < size; ++i) {
|
||||
dest[i] += source[i] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
float *dest_;
|
||||
int num_floats_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // SIMPLE_ADDER_H_
|
||||
161
Telegram/ThirdParty/cld3/src/task_context.cc
vendored
Normal file
161
Telegram/ThirdParty/cld3/src/task_context.cc
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "task_context.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Members are default-constructed; nothing else to set up.
TaskContext::TaskContext() = default;
|
||||
|
||||
// Members clean up after themselves.
TaskContext::~TaskContext() = default;
|
||||
|
||||
// Returns the input of the task spec whose name equals `name`. When no such
// input exists, a new one is appended to the spec, given that name, and
// returned.
TaskInput *TaskContext::GetInput(const string &name) {
  const int num_inputs = spec_.input_size();
  for (int index = 0; index < num_inputs; ++index) {
    if (spec_.input(index).name() != name) continue;
    return spec_.mutable_input(index);
  }

  // No match: append a fresh input carrying the requested name.
  TaskInput *created = spec_.add_input();
  created->set_name(name);
  return created;
}
|
||||
|
||||
// Fetches (or creates) the input called `name`, then makes sure `file_format`
// and `record_format` are listed on it. An empty format string is ignored,
// and a format that is already listed is not added a second time.
TaskInput *TaskContext::GetInput(const string &name, const string &file_format,
                                 const string &record_format) {
  TaskInput *input = GetInput(name);

  if (!file_format.empty()) {
    const int count = input->file_format_size();
    int pos = 0;
    while (pos < count && input->file_format(pos) != file_format) ++pos;
    // pos == count means the scan ran off the end without a match.
    if (pos == count) input->add_file_format(file_format);
  }

  if (!record_format.empty()) {
    const int count = input->record_format_size();
    int pos = 0;
    while (pos < count && input->record_format(pos) != record_format) ++pos;
    if (pos == count) input->add_record_format(record_format);
  }

  return input;
}
|
||||
|
||||
void TaskContext::SetParameter(const string &name, const string &value) {
|
||||
// If the parameter already exists update the value.
|
||||
for (int i = 0; i < spec_.parameter_size(); ++i) {
|
||||
if (spec_.parameter(i).name() == name) {
|
||||
spec_.mutable_parameter(i)->set_value(value);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Add new parameter.
|
||||
TaskSpec::Parameter *param = spec_.add_parameter();
|
||||
param->set_name(name);
|
||||
param->set_value(value);
|
||||
}
|
||||
|
||||
// Returns the value of the first task-spec parameter whose name equals
// `name`, or an empty string when the spec has no such parameter.
string TaskContext::GetParameter(const string &name) const {
  const int num_parameters = spec_.parameter_size();
  for (int index = 0; index < num_parameters; ++index) {
    const TaskSpec::Parameter &candidate = spec_.parameter(index);
    if (candidate.name() == name) return candidate.value();
  }

  // Not present in the spec.
  return "";
}
|
||||
|
||||
int TaskContext::GetIntParameter(const string &name) const {
|
||||
string value = GetParameter(name);
|
||||
return utils::ParseUsing<int>(value, 0, utils::ParseInt32);
|
||||
}
|
||||
|
||||
bool TaskContext::GetBoolParameter(const string &name) const {
|
||||
string value = GetParameter(name);
|
||||
return value == "true";
|
||||
}
|
||||
|
||||
double TaskContext::GetFloatParameter(const string &name) const {
|
||||
string value = GetParameter(name);
|
||||
return utils::ParseUsing<double>(value, .0, utils::ParseDouble);
|
||||
}
|
||||
|
||||
// Returns the value of the first task-spec parameter whose name equals
// `name`. When the spec has no such parameter, `defval` is returned as a
// string; a null `defval` is treated as the empty string, because building a
// std::string from a null pointer is undefined behaviour.
string TaskContext::Get(const string &name, const char *defval) const {
  // First try to find the parameter in the task specification.
  for (int i = 0; i < spec_.parameter_size(); ++i) {
    if (spec_.parameter(i).name() == name) return spec_.parameter(i).value();
  }

  // Parameter not found, return the default value.
  return defval != nullptr ? string(defval) : string();
}
|
||||
|
||||
// Returns the value of the first task-spec parameter whose name equals
// `name`, or `defval` when the spec has no such parameter.
//
// The default is returned as-is rather than routed through defval.c_str():
// that round trip would cut the default off at an embedded NUL character and
// costs an extra strlen plus copy.
string TaskContext::Get(const string &name, const string &defval) const {
  for (int i = 0; i < spec_.parameter_size(); ++i) {
    if (spec_.parameter(i).name() == name) return spec_.parameter(i).value();
  }
  return defval;
}
|
||||
|
||||
int TaskContext::Get(const string &name, int defval) const {
|
||||
string value = Get(name, "");
|
||||
return utils::ParseUsing<int>(value, defval, utils::ParseInt32);
|
||||
}
|
||||
|
||||
double TaskContext::Get(const string &name, double defval) const {
|
||||
string value = Get(name, "");
|
||||
return utils::ParseUsing<double>(value, defval, utils::ParseDouble);
|
||||
}
|
||||
|
||||
bool TaskContext::Get(const string &name, bool defval) const {
|
||||
string value = Get(name, "");
|
||||
return value.empty() ? defval : value == "true";
|
||||
}
|
||||
|
||||
// Returns the file pattern of the only part of `input`. CLD3_CHECKs that the
// input has exactly one part.
string TaskContext::InputFile(const TaskInput &input) {
  CLD3_CHECK(input.part_size() == 1);
  const TaskInput::Part &only_part = input.part(0);
  return only_part.file_pattern();
}
|
||||
|
||||
bool TaskContext::Supports(const TaskInput &input, const string &file_format,
|
||||
const string &record_format) {
|
||||
// Check file format.
|
||||
if (input.file_format_size() > 0) {
|
||||
bool found = false;
|
||||
for (int i = 0; i < input.file_format_size(); ++i) {
|
||||
if (input.file_format(i) == file_format) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) return false;
|
||||
}
|
||||
|
||||
// Check record format.
|
||||
if (input.record_format_size() > 0) {
|
||||
bool found = false;
|
||||
for (int i = 0; i < input.record_format_size(); ++i) {
|
||||
if (input.record_format(i) == record_format) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
81
Telegram/ThirdParty/cld3/src/task_context.h
vendored
Normal file
81
Telegram/ThirdParty/cld3/src/task_context.h
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TASK_CONTEXT_H_
|
||||
#define TASK_CONTEXT_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
#include "cld_3/protos/task_spec.pb.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// A task context holds configuration information for a task. It is basically a
|
||||
// wrapper around a TaskSpec protocol buffer.
|
||||
class TaskContext {
|
||||
public:
|
||||
TaskContext();
|
||||
~TaskContext();
|
||||
|
||||
// Returns the underlying task specification protocol buffer for the context.
|
||||
const TaskSpec &spec() const { return spec_; }
|
||||
TaskSpec *mutable_spec() { return &spec_; }
|
||||
|
||||
// Returns a named input descriptor for the task. A new input is created if
|
||||
// the task context does not already have an input with that name.
|
||||
TaskInput *GetInput(const string &name);
|
||||
TaskInput *GetInput(const string &name, const string &file_format,
|
||||
const string &record_format);
|
||||
|
||||
// Sets task parameter.
|
||||
void SetParameter(const string &name, const string &value);
|
||||
|
||||
// Returns task parameter. If the parameter is not in the task configuration
|
||||
// the (default) value of the corresponding command line flag is returned.
|
||||
string GetParameter(const string &name) const;
|
||||
int GetIntParameter(const string &name) const;
|
||||
bool GetBoolParameter(const string &name) const;
|
||||
double GetFloatParameter(const string &name) const;
|
||||
|
||||
// Returns task parameter. If the parameter is not in the task configuration
|
||||
// the default value is returned. Parameters retrieved using these methods
|
||||
// don't need to be defined with a DEFINE_*() macro.
|
||||
string Get(const string &name, const string &defval) const;
|
||||
string Get(const string &name, const char *defval) const;
|
||||
int Get(const string &name, int defval) const;
|
||||
double Get(const string &name, double defval) const;
|
||||
bool Get(const string &name, bool defval) const;
|
||||
|
||||
// Returns input file name for a single-file task input.
|
||||
static string InputFile(const TaskInput &input);
|
||||
|
||||
// Returns true if task input supports the file and record format.
|
||||
static bool Supports(const TaskInput &input, const string &file_format,
|
||||
const string &record_format);
|
||||
|
||||
private:
|
||||
// Underlying task specification protocol buffer.
|
||||
TaskSpec spec_;
|
||||
|
||||
// Vector of parameters required by this task. These must be specified in the
|
||||
// task rather than relying on default values.
|
||||
std::vector<string> required_parameters_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // TASK_CONTEXT_H_
|
||||
74
Telegram/ThirdParty/cld3/src/task_context_params.cc
vendored
Normal file
74
Telegram/ThirdParty/cld3/src/task_context_params.cc
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// This file contains the hard-coded parameters from the training workflow. If
|
||||
// you update the binary model, you may need to update the variables below as
|
||||
// well.
|
||||
|
||||
#include "task_context_params.h"
|
||||
|
||||
#include "task_context.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Copies the three hard-coded model parameters (feature spec, embedding names
// and embedding dimensions) into `context` under their parameter names.
void TaskContextParams::ToTaskContext(TaskContext *context) {
  struct NamedValue {
    const char *name;
    const char *value;
  };
  const NamedValue kModelParameters[] = {
      {"language_identifier_features", kLanguageIdentifierFeatures},
      {"language_identifier_embedding_names",
       kLanguageIdentifierEmbeddingNames},
      {"language_identifier_embedding_dims", kLanguageIdentifierEmbeddingDims},
  };
  for (const NamedValue &parameter : kModelParameters) {
    context->SetParameter(parameter.name, parameter.value);
  }
}
|
||||
|
||||
int TaskContextParams::GetNumLanguages() {
|
||||
int i = 0;
|
||||
while (kLanguageNames[i] != nullptr) {
|
||||
i++;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
// Language codes known to the model, indexed by language_names(i). The order
// is part of the data and must not be changed. GetNumLanguages() walks the
// array until it reaches the nullptr sentinel, so that sentinel has to remain
// the final element.
const char *const TaskContextParams::kLanguageNames[] = {
    "eo", "co", "eu", "ta", "de", "mt", "ps", "te", "su", "uz", "zh-Latn", "ne",
    "nl", "sw", "sq", "hmn", "ja", "no", "mn", "so", "ko", "kk", "sl", "ig",
    "mr", "th", "zu", "ml", "hr", "bs", "lo", "sd", "cy", "hy", "uk", "pt",
    "lv", "iw", "cs", "vi", "jv", "be", "km", "mk", "tr", "fy", "am", "zh",
    "da", "sv", "fi", "ht", "af", "la", "id", "fil", "sm", "ca", "el", "ka",
    "sr", "it", "sk", "ru", "ru-Latn", "bg", "ny", "fa", "haw", "gl", "et",
    "ms", "gd", "bg-Latn", "ha", "is", "ur", "mi", "hi", "bn", "hi-Latn", "fr",
    "yi", "hu", "xh", "my", "tg", "ro", "ar", "lb", "el-Latn", "st", "ceb",
    "kn", "az", "si", "ky", "mg", "en", "gu", "es", "pl", "ja-Latn", "ga", "lt",
    "sn", "yo", "pa", "ku",

    // Sentinel: terminates the list.
    nullptr,
};
|
||||
|
||||
// Feature specification string: six ';'-separated entries. The literal is
// split so that each entry starts on its own line; the concatenated value is
// unchanged.
const char TaskContextParams::kLanguageIdentifierFeatures[] =
    "continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
    "use_equal_weight=false,id_dim=1000,size=2);"
    "continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
    "use_equal_weight=false,id_dim=5000,size=4);"
    "continuous-bag-of-relevant-scripts;"
    "script;"
    "continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
    "use_equal_weight=false,id_dim=5000,size=3);"
    "continuous-bag-of-ngrams(include_terminators=true,include_spaces=false,"
    "use_equal_weight=false,id_dim=100,size=1)";
|
||||
|
||||
// ';'-separated names of the embedding spaces (six entries).
// NOTE(review): presumably listed in the same order as the entries of
// kLanguageIdentifierFeatures and kLanguageIdentifierEmbeddingDims -- confirm
// against the consumer of these parameters.
const char TaskContextParams::kLanguageIdentifierEmbeddingNames[] =
    "bigrams;quadgrams;relevant-scripts;text-script;trigrams;unigrams";

// ';'-separated dimensions of the embedding spaces (six entries).
const char TaskContextParams::kLanguageIdentifierEmbeddingDims[] =
    "16;16;8;8;16;16";
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
54
Telegram/ThirdParty/cld3/src/task_context_params.h
vendored
Normal file
54
Telegram/ThirdParty/cld3/src/task_context_params.h
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TASK_CONTEXT_PARAMS_H_
|
||||
#define TASK_CONTEXT_PARAMS_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "base.h"
|
||||
#include "task_context.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Encapsulates the TaskContext specifying only the parameters for the model.
|
||||
// The model weights are loaded statically.
|
||||
class TaskContextParams {
|
||||
public:
|
||||
// Gets the name of the i'th language.
|
||||
static const char *language_names(int i) { return kLanguageNames[i]; }
|
||||
|
||||
// Saves the parameters to the given TaskContext.
|
||||
static void ToTaskContext(TaskContext *context);
|
||||
|
||||
// Gets the number of languages.
|
||||
static int GetNumLanguages();
|
||||
|
||||
private:
|
||||
// Names of all the languages.
|
||||
static const char *const kLanguageNames[];
|
||||
|
||||
// Features in FML format.
|
||||
static const char kLanguageIdentifierFeatures[];
|
||||
|
||||
// Names of the embedding spaces.
|
||||
static const char kLanguageIdentifierEmbeddingNames[];
|
||||
|
||||
// Dimensions of the embedding spaces.
|
||||
static const char kLanguageIdentifierEmbeddingDims[];
|
||||
};
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // TASK_CONTEXT_PARAMS_H_
|
||||
98
Telegram/ThirdParty/cld3/src/task_spec.proto
vendored
Normal file
98
Telegram/ThirdParty/cld3/src/task_spec.proto
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// LINT: ALLOW_GROUPS
|
||||
// Protocol buffer specifications for task configuration.
|
||||
|
||||
syntax = "proto2";
|
||||
option optimize_for = LITE_RUNTIME;
|
||||
|
||||
package chrome_lang_id;
|
||||
|
||||
// Task input descriptor.
|
||||
message TaskInput {
  // Name of the input resource.
  required string name = 1;

  // Name of the stage responsible for creating this resource.
  optional string creator = 2;

  // File formats of the resource.
  repeated string file_format = 3;

  // Record formats of the resource.
  repeated string record_format = 4;

  // Whether this is a multi-file resource.
  optional bool multi_file = 5 [default = false];

  // File sets making up the input; one input can consist of several.
  repeated group Part = 6 {
    // File pattern of the file set.
    optional string file_pattern = 7;

    // File format of the file set.
    optional string file_format = 8;

    // Record format of the file set.
    optional string record_format = 9;
  }
}
|
||||
|
||||
// Task output descriptor.
|
||||
message TaskOutput {
  // Name of the output resource.
  required string name = 1;

  // File format of the output resource.
  optional string file_format = 2;

  // Record format of the output resource.
  optional string record_format = 3;

  // Number of shards in the output. Any value other than zero means the
  // output is sharded. A value of -1 means the output is sharded but the
  // number of shards is unknown; the files are then named 'base-*-of-*'.
  optional int32 shards = 4 [default = 0];

  // Base file name of the output resource. When the task component does not
  // set it, the workflow engine sets it to a default value.
  optional string file_base = 5;

  // Optional extension added to the file name.
  optional string file_extension = 6;
}
|
||||
|
||||
// A task specification is used for describing executing parameters.
|
||||
message TaskSpec {
  // Task name.
  optional string task_name = 1;

  // Workflow task type.
  optional string task_type = 2;

  // Task parameters, stored as name/value pairs.
  repeated group Parameter = 3 {
    required string name = 4;
    optional string value = 5;
  }

  // Inputs of the task.
  repeated TaskInput input = 6;

  // Outputs of the task.
  repeated TaskOutput output = 7;
}
|
||||
96
Telegram/ThirdParty/cld3/src/unicodetext.cc
vendored
Normal file
96
Telegram/ThirdParty/cld3/src/unicodetext.cc
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (C) 2006 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Author: Jim Meehan
|
||||
|
||||
#include "unicodetext.h"
|
||||
|
||||
#include "base.h"
|
||||
#include "utils.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// *************** Data representation **********
|
||||
// Note: the copy constructor is undefined.
|
||||
|
||||
// Makes this Repr refer to the caller's buffer `data` of `size` bytes without
// taking ownership. A previously owned buffer is freed first.
void UnicodeText::Repr::PointTo(const char *data, int size) {
  // Only an owned buffer may be freed. delete[] on a null pointer is a no-op,
  // so no separate null test is required.
  if (ours_) delete[] data_;

  ours_ = false;  // The new buffer belongs to the caller.
  data_ = const_cast<char *>(data);
  size_ = size;
  capacity_ = size;
}
|
||||
|
||||
// *************** UnicodeText ******************
|
||||
|
||||
// Members are default-constructed; nothing else to set up.
UnicodeText::UnicodeText() = default;
|
||||
|
||||
// Points this text at `buffer` (`byte_length` bytes) via Repr::PointTo, which
// does not take ownership of the buffer. Returns *this to allow chaining.
UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
  UnicodeText &self = *this;
  self.repr_.PointTo(buffer, byte_length);
  return self;
}
|
||||
|
||||
// Nothing to do here beyond destroying the members.
UnicodeText::~UnicodeText() = default;
|
||||
|
||||
// ******************* UnicodeText::const_iterator *********************
|
||||
|
||||
// The implementation of const_iterator would be nicer if it
|
||||
// inherited from boost::iterator_facade
|
||||
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
|
||||
|
||||
// A default-constructed iterator holds a null position.
UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
|
||||
|
||||
// Copies the position of `other`; self-assignment is a no-op.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
    const const_iterator &other) {
  if (this == &other) return *this;
  it_ = other.it_;
  return *this;
}
|
||||
|
||||
// Iterator positioned at the first byte of the text buffer.
UnicodeText::const_iterator UnicodeText::begin() const {
  const_iterator first(repr_.data_);
  return first;
}
|
||||
|
||||
// Iterator positioned one past the last byte of the text buffer.
UnicodeText::const_iterator UnicodeText::end() const {
  const_iterator past_last(repr_.data_ + repr_.size_);
  return past_last;
}
|
||||
|
||||
// Decodes the UTF-8 sequence at the current position into a codepoint.
//
// No validation is done: the original note states that the data is guaranteed
// to be valid UTF-8 and that this routine is expected to be called very
// often, so the arithmetic is done by hand instead of through a checking
// decoder. Continuation bytes are only read once the lead byte shows that
// they belong to the sequence.
char32 UnicodeText::const_iterator::operator*() const {
  const unsigned char lead = static_cast<unsigned char>(it_[0]);
  if (lead < 0x80) return lead;  // One byte (ASCII).

  const int cont1 = static_cast<unsigned char>(it_[1]) & 0x3F;
  if (lead < 0xE0) return ((lead & 0x1F) << 6) | cont1;  // Two bytes.

  const int cont2 = static_cast<unsigned char>(it_[2]) & 0x3F;
  if (lead < 0xF0) {  // Three bytes.
    return ((lead & 0x0F) << 12) | (cont1 << 6) | cont2;
  }

  // Four bytes.
  const int cont3 = static_cast<unsigned char>(it_[3]) & 0x3F;
  return ((lead & 0x07) << 18) | (cont1 << 12) | (cont2 << 6) | cont3;
}
|
||||
|
||||
// Pre-increment: moves the position forward by the length that
// utils::OneCharLen reports for the character at the current position.
UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
  const auto step = chrome_lang_id::utils::OneCharLen(it_);
  it_ += step;
  return *this;
}
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
144
Telegram/ThirdParty/cld3/src/unicodetext.h
vendored
Normal file
144
Telegram/ThirdParty/cld3/src/unicodetext.h
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
// Copyright (C) 2006 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Author: Jim Meehan
|
||||
|
||||
#ifndef UNICODETEXT_H_
|
||||
#define UNICODETEXT_H_
|
||||
|
||||
#include <iterator>
|
||||
#include <utility>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// ***************************** UnicodeText **************************
|
||||
//
|
||||
// A UnicodeText object is a wrapper around a sequence of Unicode
|
||||
// codepoint values that allows iteration over these values.
|
||||
//
|
||||
// The internal representation of the text is UTF-8. Since UTF-8 is a
|
||||
// variable-width format, UnicodeText does not provide random access
|
||||
// to the text, and changes to the text are permitted only at the end.
|
||||
//
|
||||
// The UnicodeText class defines a const_iterator. The dereferencing
|
||||
// operator (*) returns a codepoint (int32). The iterator is a
|
||||
// read-only iterator. It becomes invalid if the text is changed.
|
||||
//
|
||||
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
|
||||
// 0x10FFFF], but UnicodeText has the additional restriction that it
|
||||
// can contain only those characters that are valid for interchange on
|
||||
// the Web. This excludes all of the control codes except for carriage
|
||||
// return, line feed, and horizontal tab. It also excludes
|
||||
// non-characters, but codepoints that are in the Private Use regions
|
||||
// are allowed, as are codepoints that are unassigned. (See the
|
||||
// Unicode reference for details.)
|
||||
//
|
||||
// MEMORY MANAGEMENT:
|
||||
//
|
||||
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
|
||||
//
|
||||
// The purpose of an alias is to avoid making an unnecessary copy of a
|
||||
// UTF-8 buffer while still providing access to the Unicode values
|
||||
// within that text through iterators. The lifetime of an alias must not
|
||||
// exceed the lifetime of the buffer from which it was constructed.
|
||||
//
|
||||
// Aliases should be used with care. If the source from which an alias
|
||||
// was created is freed, or if the contents are changed, while the
|
||||
// alias is still in use, fatal errors could result. But it can be
|
||||
// quite useful to have a UnicodeText "window" through which to see a
|
||||
// UTF-8 buffer without having to pay the price of making a copy.
|
||||
|
||||
// TODO(abakalov): Consider merging this class with the script detection
|
||||
// code in the directory script_span.
|
||||
class UnicodeText {
|
||||
public:
|
||||
class const_iterator;
|
||||
|
||||
UnicodeText(); // Create an empty text.
|
||||
~UnicodeText();
|
||||
|
||||
class const_iterator {
|
||||
typedef const_iterator CI;
|
||||
|
||||
public:
|
||||
// Iterators are default-constructible.
|
||||
const_iterator();
|
||||
|
||||
// It's safe to make multiple passes over a UnicodeText.
|
||||
const_iterator(const const_iterator &other);
|
||||
const_iterator &operator=(const const_iterator &other);
|
||||
|
||||
char32 operator*() const; // Dereference
|
||||
|
||||
const_iterator &operator++(); // Advance (++iter)
|
||||
|
||||
friend bool operator==(const CI &lhs, const CI &rhs) {
|
||||
return lhs.it_ == rhs.it_;
|
||||
}
|
||||
friend bool operator!=(const CI &lhs, const CI &rhs) {
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
|
||||
private:
|
||||
friend class UnicodeText;
|
||||
explicit const_iterator(const char *it) : it_(it) {}
|
||||
|
||||
const char *it_;
|
||||
};
|
||||
|
||||
const_iterator begin() const;
|
||||
const_iterator end() const;
|
||||
|
||||
// x.PointToUTF8(buf,len) changes x so that it points to buf
|
||||
// ("becomes an alias"). It does not take ownership or copy buf.
|
||||
// This function assumes that the input is interchange valid UTF8.
|
||||
UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);
|
||||
|
||||
private:
|
||||
friend class const_iterator;
|
||||
|
||||
class Repr { // A byte-string.
|
||||
public:
|
||||
char *data_;
|
||||
int size_;
|
||||
int capacity_;
|
||||
bool ours_; // Do we own data_?
|
||||
|
||||
Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
|
||||
~Repr() {
|
||||
if (ours_) delete[] data_;
|
||||
}
|
||||
|
||||
void clear();
|
||||
void reserve(int capacity);
|
||||
void resize(int size);
|
||||
|
||||
void append(const char *bytes, int byte_length);
|
||||
void Copy(const char *data, int size);
|
||||
void TakeOwnershipOf(char *data, int size, int capacity);
|
||||
void PointTo(const char *data, int size);
|
||||
|
||||
private:
|
||||
Repr &operator=(const Repr &);
|
||||
Repr(const Repr &other);
|
||||
};
|
||||
|
||||
Repr repr_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // UNICODETEXT_H_
|
||||
241
Telegram/ThirdParty/cld3/src/utils.cc
vendored
Normal file
241
Telegram/ThirdParty/cld3/src/utils.cc
vendored
Normal file
@@ -0,0 +1,241 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "script_span/stringpiece.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace utils {
|
||||
|
||||
// Parses |c_str| with strtol() in base 0 (so "0x" and "0" prefixes select
// hexadecimal and octal) and stores the result, converted to int, in |*value|.
// Returns true iff parsing stopped at the terminating NUL, i.e. nothing was
// left unparsed. An empty string also meets that condition and stores 0.
bool ParseInt32(const char *c_str, int *value) {
  char *parse_end = nullptr;
  const long parsed = strtol(c_str, &parse_end, 0);  // NOLINT
  *value = static_cast<int>(parsed);
  return *parse_end == '\0';
}
|
||||
|
||||
// Parses |c_str| with strtod() and stores the result in |*value|. Returns true
// iff parsing stopped at the terminating NUL, i.e. nothing was left unparsed.
// An empty string also meets that condition and stores 0.
bool ParseDouble(const char *c_str, double *value) {
  char *parse_end = nullptr;
  const double parsed = strtod(c_str, &parse_end);
  *value = parsed;
  return *parse_end == '\0';
}
|
||||
|
||||
// Digit characters used by CEscape(). CEscape() only ever indexes entries
// 0..7, because it writes octal escapes.
static char hex_char[] = "0123456789abcdef";

// Returns a copy of |src| in which newline, carriage return, tab, double
// quote, single quote and backslash are replaced by their two-character
// backslash escapes, and every byte that is >= 0x80 or not printable is
// replaced by a backslash followed by exactly three octal digits. Because the
// octal escape has a fixed width, a following digit in |src| cannot be
// mistaken for part of the escape. All other bytes are copied unchanged.
string CEscape(const string &src) {
  string escaped;

  for (const unsigned char byte : src) {
    if (byte == '\n') {
      escaped.append("\\n");
    } else if (byte == '\r') {
      escaped.append("\\r");
    } else if (byte == '\t') {
      escaped.append("\\t");
    } else if (byte == '\"') {
      escaped.append("\\\"");
    } else if (byte == '\'') {
      escaped.append("\\'");
    } else if (byte == '\\') {
      escaped.append("\\\\");
    } else if (byte < 0x80 && isprint(byte)) {
      escaped.push_back(byte);
    } else {
      // Three octal digits: bits 7-6, bits 5-3, bits 2-0.
      escaped.push_back('\\');
      escaped.push_back(hex_char[byte >> 6]);
      escaped.push_back(hex_char[(byte >> 3) & 7]);
      escaped.push_back(hex_char[byte & 7]);
    }
  }

  return escaped;
}
|
||||
|
||||
// Splits |text| at every occurrence of |delim| and returns the pieces in
// order. Empty pieces are kept (so "a,,b" yields three pieces and "," yields
// two empty ones), but an empty |text| yields an empty vector.
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;

  size_t piece_begin = 0;
  for (;;) {
    const size_t cut = text.find(delim, piece_begin);
    if (cut == string::npos) {
      // Last piece: everything after the final delimiter (possibly empty).
      pieces.push_back(text.substr(piece_begin));
      break;
    }
    pieces.push_back(text.substr(piece_begin, cut - piece_begin));
    piece_begin = cut + 1;
  }
  return pieces;
}
|
||||
|
||||
// Strips whitespace (as classified by isspace()) from the front of |*text| by
// advancing the piece with remove_prefix(). Returns the number of bytes
// removed.
int RemoveLeadingWhitespace(StringPiece *text) {
  int count = 0;
  const char *ptr = text->data();
  // isspace() is only defined for values representable as unsigned char (or
  // EOF). A plain char holding a byte >= 0x80, e.g. part of a UTF-8 sequence,
  // is negative where char is signed, so convert before the call.
  while (count < text->size() && isspace(static_cast<unsigned char>(*ptr))) {
    ++count;
    ++ptr;
  }
  text->remove_prefix(count);
  return count;
}
|
||||
|
||||
// Strips whitespace (as classified by isspace()) from the back of |*text| by
// shrinking the piece with remove_suffix(). Returns the number of bytes
// removed.
int RemoveTrailingWhitespace(StringPiece *text) {
  int count = 0;
  // Index from the end instead of keeping a pointer: for an empty piece
  // "data() + size() - 1" would point before the buffer. The bound check runs
  // first, so the index is always within [0, size()).
  //
  // isspace() is only defined for values representable as unsigned char (or
  // EOF); bytes >= 0x80 are negative where char is signed, hence the cast.
  while (count < text->size() &&
         isspace(static_cast<unsigned char>(
             text->data()[text->size() - 1 - count]))) {
    ++count;
  }
  text->remove_suffix(count);
  return count;
}
|
||||
|
||||
// Strips whitespace from both ends of |*text| and returns the total number of
// bytes removed.
int RemoveWhitespaceContext(StringPiece *text) {
  // Both helpers modify |*text|. As operands of a single '+' their relative
  // order would be unspecified, so run them as separate statements: leading
  // whitespace first, then trailing.
  const int leading = RemoveLeadingWhitespace(text);
  const int trailing = RemoveTrailingWhitespace(text);
  return leading + trailing;
}
|
||||
|
||||
namespace {
|
||||
// Lower-level versions of Get... that read directly from a character buffer
|
||||
// without any bounds checking.
|
||||
inline uint32 DecodeFixed32(const char *ptr) {
|
||||
return ((static_cast<uint32>(static_cast<unsigned char>(ptr[0]))) |
|
||||
(static_cast<uint32>(static_cast<unsigned char>(ptr[1])) << 8) |
|
||||
(static_cast<uint32>(static_cast<unsigned char>(ptr[2])) << 16) |
|
||||
(static_cast<uint32>(static_cast<unsigned char>(ptr[3])) << 24));
|
||||
}
|
||||
|
||||
// 0xff is in case char is signed.
|
||||
static inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }
|
||||
} // namespace
|
||||
|
||||
// Computes a 32-bit hash of the |n| bytes starting at |data|, seeded with
// |seed|. Input is consumed four bytes at a time (least significant byte
// first), followed by up to three trailing bytes and a final mixing step.
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  // Mixing constants, generated offline. There is nothing special about them
  // beyond working well in practice.
  const uint32 kMul = 0x5bd1e995;
  const int kShift = 24;

  // Start from a value that depends on both the seed and the length.
  uint32 hash = static_cast<uint32>(seed ^ n);

  // Fold in the input one 4-byte block at a time.
  for (; n >= 4; data += 4, n -= 4) {
    uint32 block = DecodeFixed32(data);
    block *= kMul;
    block ^= block >> kShift;
    block *= kMul;
    hash *= kMul;
    hash ^= block;
  }

  // Fold in the 1-3 bytes left over, highest remaining index first.
  if (n > 0) {
    if (n >= 3) hash ^= ByteAs32(data[2]) << 16;
    if (n >= 2) hash ^= ByteAs32(data[1]) << 8;
    hash ^= ByteAs32(data[0]);
    hash *= kMul;
  }

  // Final mixing so that the last bytes influence all bits of the result.
  hash ^= hash >> 13;
  hash *= kMul;
  hash ^= hash >> 15;
  return hash;
}
|
||||
|
||||
// Hashes the bytes of |input| with Hash32() using the fixed seed 0xBEEF.
uint32 Hash32WithDefaultSeed(const string &input) {
  const uint32 kDefaultSeed = 0xBEEF;
  return Hash32(input.data(), input.size(), kDefaultSeed);
}
|
||||
|
||||
// Inclusive {first, last} code point ranges treated as punctuation. Entries
// are listed in ascending order and the table ends with a {-1, -1} sentinel;
// PunctuationUtil::IsPunctuation() depends on both properties (it stops at the
// first entry whose |first| is not positive and bails out early once the
// queried value is below an entry's |first|).
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
    // Code points below 128.
    {33, 35},
    {37, 42},
    {44, 47},
    {58, 59},
    {63, 64},
    {91, 93},
    {95, 95},
    {123, 123},
    {125, 125},

    // Code points 128..255.
    {161, 161},
    {171, 171},
    {183, 183},
    {187, 187},
    {191, 191},

    // Code points 256..65535.
    {894, 894},     {903, 903},     {1370, 1375},   {1417, 1418},
    {1470, 1470},   {1472, 1472},   {1475, 1475},   {1478, 1478},
    {1523, 1524},   {1548, 1549},   {1563, 1563},   {1566, 1567},
    {1642, 1645},   {1748, 1748},   {1792, 1805},   {2404, 2405},
    {2416, 2416},   {3572, 3572},   {3663, 3663},   {3674, 3675},
    {3844, 3858},   {3898, 3901},   {3973, 3973},   {4048, 4049},
    {4170, 4175},   {4347, 4347},   {4961, 4968},   {5741, 5742},
    {5787, 5788},   {5867, 5869},   {5941, 5942},   {6100, 6102},
    {6104, 6106},   {6144, 6154},   {6468, 6469},   {6622, 6623},
    {6686, 6687},   {8208, 8231},   {8240, 8259},   {8261, 8273},
    {8275, 8286},   {8317, 8318},   {8333, 8334},   {9001, 9002},
    {9140, 9142},   {10088, 10101}, {10181, 10182}, {10214, 10219},
    {10627, 10648}, {10712, 10715}, {10748, 10749}, {11513, 11516},
    {11518, 11519}, {11776, 11799}, {11804, 11805}, {12289, 12291},
    {12296, 12305}, {12308, 12319}, {12336, 12336}, {12349, 12349},
    {12448, 12448}, {12539, 12539}, {64830, 64831}, {65040, 65049},
    {65072, 65106}, {65108, 65121}, {65123, 65123}, {65128, 65128},
    {65130, 65131}, {65281, 65283}, {65285, 65290}, {65292, 65295},
    {65306, 65307}, {65311, 65312}, {65339, 65341}, {65343, 65343},
    {65371, 65371}, {65373, 65373}, {65375, 65381},

    // Code points from 65536 upward.
    {65792, 65793},
    {66463, 66463},
    {68176, 68184},

    // End-of-table sentinel.
    {-1, -1}};
|
||||
|
||||
// Rewrites |*form| in place so that every ASCII digit '0'..'9' becomes '9'.
// All other bytes are left untouched.
void NormalizeDigits(string *form) {
  for (char &ch : *form) {
    const bool is_ascii_digit = (ch >= '0' && ch <= '9');
    if (is_ascii_digit) ch = '9';
  }
}
|
||||
|
||||
void GetUTF8Chars(const string &text, std::vector<string> *chars) {
|
||||
const char *start = text.c_str();
|
||||
const char *end = text.c_str() + text.size();
|
||||
while (start < end) {
|
||||
int char_length = UTF8FirstLetterNumBytes(start);
|
||||
chars->emplace_back(start, char_length);
|
||||
start += char_length;
|
||||
}
|
||||
}
|
||||
|
||||
int UTF8FirstLetterNumBytes(const char *utf8_str) {
|
||||
if (*utf8_str == '\0') return 0;
|
||||
return OneCharLen(utf8_str);
|
||||
}
|
||||
|
||||
// Returns the sequence length implied by the byte at |src|, looking at nothing
// but that byte. The result is selected by the byte's upper four bits:
// 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
int OneCharLen(const char *src) {
  static const char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                               1, 1, 1, 1, 2, 2, 3, 4};
  // Read the byte as unsigned so the shift behaves the same whether or not
  // plain char is signed on the target platform.
  const unsigned char lead = static_cast<unsigned char>(*src);
  return kLengthByHighNibble[lead >> 4];
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace chrome_lang_id
|
||||
144
Telegram/ThirdParty/cld3/src/utils.h
vendored
Normal file
144
Telegram/ThirdParty/cld3/src/utils.h
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef UTILS_H_
|
||||
#define UTILS_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <functional>
|
||||
#include <initializer_list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
#include "script_span/stringpiece.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
namespace utils {
|
||||
|
||||
bool ParseInt32(const char *c_str, int *value);
|
||||
bool ParseDouble(const char *c_str, double *value);
|
||||
|
||||
// Parses |str| by calling |func| on its C string and returns the value that
// |func| stored. The boolean result of |func| is ignored. The value starts out
// value-initialized (zero for arithmetic types), so a |func| that fails
// without writing its output yields that default rather than an indeterminate
// value.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T value = T();
  func(str.c_str(), &value);
  return value;
}

// As above, but returns |defval| without calling |func| when |str| is empty.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) return defval;
  return ParseUsing<T>(str, func);
}
|
||||
|
||||
string CEscape(const string &src);
|
||||
|
||||
std::vector<string> Split(const string &text, char delim);
|
||||
|
||||
int RemoveLeadingWhitespace(StringPiece *text);
|
||||
|
||||
int RemoveTrailingWhitespace(StringPiece *text);
|
||||
|
||||
int RemoveWhitespaceContext(StringPiece *text);
|
||||
|
||||
uint32 Hash32(const char *data, size_t n, uint32 seed);
|
||||
|
||||
uint32 Hash32WithDefaultSeed(const string &input);
|
||||
|
||||
// Calls delete on every element of |*container| and then empties it. Works
// with any container of pointers that provides begin(), end() and clear().
// A null |container| is accepted and ignored.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  for (const auto &element : *container) {
    delete element;
  }
  container->clear();
}
|
||||
|
||||
class PunctuationUtil {
 public:
  // Unicode character ranges for punctuation characters according to CoNLL.
  // Both bounds are inclusive.
  struct CharacterRange {
    int first;
    int last;
  };

  // Table of punctuation ranges, defined in the .cc file. IsPunctuation()
  // reads it up to the first entry whose |first| is not positive.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode character |u| falls inside one of the
  // kPunctuation ranges. The scan gives up as soon as |u| is below the start
  // of the current range.
  static bool IsPunctuation(int u) {
    for (const CharacterRange *range = kPunctuation; range->first > 0;
         ++range) {
      if (u < range->first) return false;
      if (u <= range->last) return true;
    }
    return false;
  }

  // Returns true if every character of |tag| is one of , : . ' `
  // An empty tag therefore counts as a punctuation tag.
  static bool IsPunctuationTag(const string &tag) {
    for (const char symbol : tag) {
      switch (symbol) {
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          continue;
        default:
          return false;
      }
    }
    return true;
  }

  // Returns true if |tag| is non-empty and every character is one of
  // ( ) , : . ' `
  static bool IsPunctuationTagOrParens(const string &tag) {
    if (tag.empty()) return false;
    for (const char symbol : tag) {
      switch (symbol) {
        case '(':
        case ')':
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          continue;
        default:
          return false;
      }
    }
    return true;
  }
};
|
||||
|
||||
void NormalizeDigits(string *form);
|
||||
|
||||
// Takes a text and convert it into a vector, where each element is a utf8
|
||||
// character.
|
||||
void GetUTF8Chars(const string &text, std::vector<string> *chars);
|
||||
|
||||
// Returns the number of bytes in the first UTF-8 char at the beginning
|
||||
// of the string. It is assumed that the string is valid UTF-8. If
|
||||
// the first byte of the string is null, return 0 (for backwards
|
||||
// compatibility only; this use is discouraged).
|
||||
int UTF8FirstLetterNumBytes(const char *in_buf);
|
||||
|
||||
// Returns the length (number of bytes) of the Unicode code point starting at
|
||||
// src, based on inspecting just that one byte. Preconditions: src != NULL,
|
||||
// *src can be read, and *src is not '\0', and src points to a well-formed UTF-8
|
||||
// string.
|
||||
int OneCharLen(const char *src);
|
||||
|
||||
} // namespace utils
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // UTILS_H_
|
||||
64
Telegram/ThirdParty/cld3/src/workspace.cc
vendored
Normal file
64
Telegram/ThirdParty/cld3/src/workspace.cc
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "workspace.h"
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// Nothing to set up beyond default-constructing the members.
WorkspaceSet::WorkspaceSet() = default;

// Resets against an empty registry: Reset() deletes every workspace currently
// held, and an empty registry gives it nothing to allocate afterwards.
WorkspaceSet::~WorkspaceSet() {
  WorkspaceRegistry empty_registry;
  Reset(empty_registry);
}
|
||||
|
||||
// Both maps start out empty; there is nothing else to initialize.
WorkspaceRegistry::WorkspaceRegistry() = default;

// The members clean up after themselves.
WorkspaceRegistry::~WorkspaceRegistry() = default;
|
||||
|
||||
string WorkspaceRegistry::DebugString() const {
|
||||
string str;
|
||||
for (auto &it : workspace_names_) {
|
||||
const string &type_name = workspace_types_.at(it.first);
|
||||
for (size_t index = 0; index < it.second.size(); ++index) {
|
||||
const string &workspace_name = it.second[index];
|
||||
str += "\n ";
|
||||
str += type_name;
|
||||
str += " :: ";
|
||||
str += workspace_name;
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
VectorIntWorkspace::~VectorIntWorkspace() {}
|
||||
|
||||
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}
|
||||
|
||||
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
|
||||
: elements_(size, value) {}
|
||||
|
||||
VectorIntWorkspace::VectorIntWorkspace(const std::vector<int> &elements)
|
||||
: elements_(elements) {}
|
||||
|
||||
string VectorIntWorkspace::TypeName() { return "Vector"; }
|
||||
|
||||
VectorVectorIntWorkspace::~VectorVectorIntWorkspace() {}
|
||||
|
||||
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
|
||||
: elements_(size) {}
|
||||
|
||||
string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
177
Telegram/ThirdParty/cld3/src/workspace.h
vendored
Normal file
177
Telegram/ThirdParty/cld3/src/workspace.h
vendored
Normal file
@@ -0,0 +1,177 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// Notes on thread-safety: All of the classes here are thread-compatible. More
|
||||
// specifically, the registry machinery is thread-safe, as long as each thread
|
||||
// performs feature extraction on a different Sentence object.
|
||||
|
||||
#ifndef WORKSPACE_H_
|
||||
#define WORKSPACE_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string>
|
||||
#include <typeindex>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base.h"
|
||||
|
||||
namespace chrome_lang_id {
|
||||
|
||||
// A base class for shared workspaces. Derived classes implement a static member
|
||||
// function TypeName() which returns a human readable string name for the class.
|
||||
class Workspace {
|
||||
public:
|
||||
// Polymorphic destructor.
|
||||
virtual ~Workspace() {}
|
||||
|
||||
protected:
|
||||
// Create an empty workspace.
|
||||
Workspace() {}
|
||||
|
||||
private:
|
||||
CLD3_DISALLOW_COPY_AND_ASSIGN(Workspace);
|
||||
};
|
||||
|
||||
// A registry that keeps track of workspaces.
|
||||
class WorkspaceRegistry {
|
||||
public:
|
||||
// Create an empty registry.
|
||||
WorkspaceRegistry();
|
||||
~WorkspaceRegistry();
|
||||
|
||||
const std::unordered_map<std::type_index, std::vector<std::string>>
|
||||
&WorkspaceNames() const {
|
||||
return workspace_names_;
|
||||
}
|
||||
|
||||
// Returns a string describing the registered workspaces.
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
// Workspace type names, indexed as workspace_types_[typeid].
|
||||
std::unordered_map<std::type_index, string> workspace_types_;
|
||||
|
||||
// Workspace names, indexed as workspace_names_[typeid][workspace].
|
||||
std::unordered_map<std::type_index, std::vector<string>> workspace_names_;
|
||||
|
||||
CLD3_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
|
||||
};
|
||||
|
||||
// A typed collection of workspaces. The workspaces are indexed according to an
|
||||
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
|
||||
// also immutable.
|
||||
class WorkspaceSet {
|
||||
public:
|
||||
WorkspaceSet();
|
||||
~WorkspaceSet();
|
||||
|
||||
void Reset(const WorkspaceRegistry ®istry) {
|
||||
// Deallocate current workspaces.
|
||||
for (auto &it : workspaces_) {
|
||||
for (size_t index = 0; index < it.second.size(); ++index) {
|
||||
delete it.second[index];
|
||||
}
|
||||
}
|
||||
workspaces_.clear();
|
||||
|
||||
// Allocate space for new workspaces.
|
||||
for (auto &it : registry.WorkspaceNames()) {
|
||||
workspaces_[it.first].resize(it.second.size());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// The set of workspaces, indexed as workspaces_[typeid][index].
|
||||
std::unordered_map<std::type_index, std::vector<Workspace *>> workspaces_;
|
||||
};
|
||||
|
||||
// A workspace that wraps around a single int.
|
||||
class SingletonIntWorkspace : public Workspace {
|
||||
public:
|
||||
// Default-initializes the int value.
|
||||
SingletonIntWorkspace() {}
|
||||
|
||||
// Initializes the int with the given value.
|
||||
explicit SingletonIntWorkspace(int value) : value_(value) {}
|
||||
|
||||
// Returns the name of this type of workspace.
|
||||
static string TypeName() { return "SingletonInt"; }
|
||||
|
||||
// Returns the int value.
|
||||
int get() const { return value_; }
|
||||
|
||||
// Sets the int value.
|
||||
void set(int value) { value_ = value; }
|
||||
|
||||
private:
|
||||
// The enclosed int.
|
||||
int value_ = 0;
|
||||
};
|
||||
|
||||
// A workspace that wraps around a vector of int.
|
||||
class VectorIntWorkspace : public Workspace {
|
||||
public:
|
||||
// Creates a vector of the given size.
|
||||
explicit VectorIntWorkspace(int size);
|
||||
|
||||
// Creates a vector initialized with the given array.
|
||||
explicit VectorIntWorkspace(const std::vector<int> &elements);
|
||||
|
||||
// Creates a vector of the given size, with each element initialized to the
|
||||
// given value.
|
||||
VectorIntWorkspace(int size, int value);
|
||||
|
||||
~VectorIntWorkspace() override;
|
||||
|
||||
// Returns the name of this type of workspace.
|
||||
static string TypeName();
|
||||
|
||||
// Returns the i'th element.
|
||||
int element(int i) const { return elements_[i]; }
|
||||
|
||||
// Sets the i'th element.
|
||||
void set_element(int i, int value) { elements_[i] = value; }
|
||||
|
||||
private:
|
||||
// The enclosed vector.
|
||||
std::vector<int> elements_;
|
||||
};
|
||||
|
||||
// A workspace that wraps around a vector of vector of int.
|
||||
class VectorVectorIntWorkspace : public Workspace {
|
||||
public:
|
||||
// Creates a vector of empty vectors of the given size.
|
||||
explicit VectorVectorIntWorkspace(int size);
|
||||
~VectorVectorIntWorkspace() override;
|
||||
|
||||
// Returns the name of this type of workspace.
|
||||
static string TypeName();
|
||||
|
||||
// Returns the i'th vector of elements.
|
||||
const std::vector<int> &elements(int i) const { return elements_[i]; }
|
||||
|
||||
// Mutable access to the i'th vector of elements.
|
||||
std::vector<int> *mutable_elements(int i) { return &(elements_[i]); }
|
||||
|
||||
private:
|
||||
// The enclosed vector of vector of elements.
|
||||
std::vector<std::vector<int>> elements_;
|
||||
};
|
||||
|
||||
} // namespace chrome_lang_id
|
||||
|
||||
#endif // WORKSPACE_H_
|
||||
Reference in New Issue
Block a user