SERVER-79547 Introduce snowball libstemmer_c import script (#29667)

GitOrigin-RevId: 8a52f04b302e9dfaa8f52e42b0db560111927a86
This commit is contained in:
Catalin Sumanaru 2024-11-28 09:23:24 +00:00 committed by MongoDB Bot
parent 056dacce1a
commit 6aa2cbb084
85 changed files with 131 additions and 94 deletions

View File

@ -21,57 +21,57 @@ not authored by MongoDB, and has a license which requires reproduction,
a notice will be included in
`THIRD-PARTY-NOTICES`.
| Name | License | Vendored Version | Emits persisted data | Distributed in Release Binaries |
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------- | -------------------------- | -------------------- | ------------------------------- |
| [Abseil] | Apache-2.0 | 20230802.1 | | ✗ |
| [arximboldi/immer] | BSL-1.0 | Unknown | | ✗ |
| [Asio C++ Library] | BSL-1.0 | 1.12.2 | | ✗ |
| [benchmark] | Apache-2.0 | v1.5.2 | | |
| [Boost C++ Libraries - boost] | BSL-1.0 | 1.79.0 | | ✗ |
| [c-ares] | MIT | 1.19.1 | | ✗ |
| [concurrencytest] | GPL-3.0-or-later | 0.1.2 | unknown | |
| [Cyrus SASL] | BSD-Attribution-HPND-disclaimer | 2.1.28 | unknown | |
| [dcleblanc/SafeInt] | MIT | 3.0.26 | | ✗ |
| [derickr/timelib] | MIT | 2022.10 | | ✗ |
| [discover] | BSD-3-Clause | 0.4.0 | unknown | |
| [fmtlib/fmt] | MIT | 7.1.3 | | ✗ |
| [google-re2] | BSD-3-Clause | 2023-11-01 | | ✗ |
| [google-snappy] | BSD-3-Clause | 1.1.10 | ✗ | ✗ |
| [google/s2geometry] | Apache-2.0 | Unknown | ✗ | ✗ |
| [gperftools] | BSD-3-Clause | 2.9.1 | | ✗ |
| [grpc] | Apache-2.0 | 1.59.2 | | ✗ |
| [ICU for C/C++ (ICU4C)] | BSD-3-Clause, MIT v2 with Ad Clause License, Public Domain, BSD-2-Clause | 57.1 | ✗ | ✗ |
| [Intel Decimal Floating-Point Math Library] | BSD-3-Clause | v2.0 U1 | | ✗ |
| [jbeder/yaml-cpp] | MIT | 0.6.3 | | ✗ |
| [JSON-Schema-Test-Suite] | Unknown License | Unknown | | |
| [libmongocrypt] | Apache-2.0 | 1.12.0 | ✗ | ✗ |
| [librdkafka - the Apache Kafka C/C++ client library] | BSD-3-Clause, Xmlproc License, ISC, MIT, Public Domain, Zlib, BSD-2-Clause, Andreas Stolcke License | 2.0.2 | | ✗ |
| [LibTomCrypt] | WTFPL, Public Domain | 1.18.2 | ✗ | ✗ |
| [libunwind/libunwind] | MIT | v1.6.2 | | ✗ |
| [linenoise] | BSD-2-Clause | Unknown | | ✗ |
| [MongoDB C Driver] | Apache-2.0 | 1.27.6 | ✗ | ✗ |
| [Mozilla Firefox] | MPL-2.0 | 115.7.0esr | unknown | ✗ |
| [nlohmann.json.decomposed] | MIT | 3.10.5 | unknown | |
| [node] | ISC | 22.1.0 | unknown | |
| [ocspbuilder] | MIT | 0.10.2 | | |
| [ocspresponder] | Apache-2.0 | 0.5.0 | | |
| [PCRE2] | BSD-3-Clause, Public Domain | 10.40 | | ✗ |
| [Protobuf] | BSD-3-Clause | v4.25.0 | | ✗ |
| [pyiso8601] | MIT | 2.1.0 | unknown | |
| [RoaringBitmap/CRoaring] | Unknown License | v3.0.1 | | ✗ |
| [SchemaStore/schemastore] | Apache-2.0 | Unknown | | |
| [SCons - a Software Construction tool] | MIT | 3.1.2 | | ✗ |
| [smhasher] | Unknown License | Unknown | unknown | ✗ |
| [Snowball Stemming Algorithms] | BSD-3-Clause | Unknown | unknown | ✗ |
| [subunit] | BSD-3-Clause, Apache-2.0 | 1.4.4 | unknown | |
| [tcmalloc] | Apache-2.0 | 20230227-snapshot-093ba93c | | ✗ |
| [testing-cabal/extras] | MIT | 0.0.3 | unknown | |
| [testscenarios] | BSD-3-Clause, Apache-2.0 | 0.4 | unknown | |
| [testtools] | MIT | 2.7.1 | unknown | |
| [unicode-data] | Unicode-DFS-2016 | 8.0 | ✗ | ✗ |
| [valgrind] | GPL-2.0-or-later | Unknown | | ✗ |
| [zlib] | Zlib | v1.3 | ✗ | ✗ |
| [zstd] | BSD-3-Clause, GPL-2.0-or-later | 1.5.5 | ✗ | ✗ |
| Name | License | Vendored Version | Emits persisted data | Distributed in Release Binaries |
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------- | ---------------------------------------- | -------------------- | ------------------------------- |
| [Abseil] | Apache-2.0 | 20230802.1 | | ✗ |
| [arximboldi/immer] | BSL-1.0 | Unknown | | ✗ |
| [Asio C++ Library] | BSL-1.0 | 1.12.2 | | ✗ |
| [benchmark] | Apache-2.0 | v1.5.2 | | |
| [Boost C++ Libraries - boost] | BSL-1.0 | 1.79.0 | | ✗ |
| [c-ares] | MIT | 1.19.1 | | ✗ |
| [concurrencytest] | GPL-3.0-or-later | 0.1.2 | unknown | |
| [Cyrus SASL] | BSD-Attribution-HPND-disclaimer | 2.1.28 | unknown | |
| [dcleblanc/SafeInt] | MIT | 3.0.26 | | ✗ |
| [derickr/timelib] | MIT | 2022.10 | | ✗ |
| [discover] | BSD-3-Clause | 0.4.0 | unknown | |
| [fmtlib/fmt] | MIT | 7.1.3 | | ✗ |
| [google-re2] | BSD-3-Clause | 2023-11-01 | | ✗ |
| [google-snappy] | BSD-3-Clause | 1.1.10 | ✗ | ✗ |
| [google/s2geometry] | Apache-2.0 | Unknown | ✗ | ✗ |
| [gperftools] | BSD-3-Clause | 2.9.1 | | ✗ |
| [grpc] | Apache-2.0 | 1.59.2 | | ✗ |
| [ICU for C/C++ (ICU4C)] | BSD-3-Clause, MIT v2 with Ad Clause License, Public Domain, BSD-2-Clause | 57.1 | ✗ | ✗ |
| [Intel Decimal Floating-Point Math Library] | BSD-3-Clause | v2.0 U1 | | ✗ |
| [jbeder/yaml-cpp] | MIT | 0.6.3 | | ✗ |
| [JSON-Schema-Test-Suite] | Unknown License | Unknown | | |
| [libmongocrypt] | Apache-2.0 | 1.12.0 | ✗ | ✗ |
| [librdkafka - the Apache Kafka C/C++ client library] | BSD-3-Clause, Xmlproc License, ISC, MIT, Public Domain, Zlib, BSD-2-Clause, Andreas Stolcke License | 2.0.2 | | ✗ |
| [LibTomCrypt] | WTFPL, Public Domain | 1.18.2 | ✗ | ✗ |
| [libunwind/libunwind] | MIT | v1.6.2 | | ✗ |
| [linenoise] | BSD-2-Clause | Unknown | | ✗ |
| [MongoDB C Driver] | Apache-2.0 | 1.27.6 | ✗ | ✗ |
| [Mozilla Firefox] | MPL-2.0 | 115.7.0esr | unknown | ✗ |
| [nlohmann.json.decomposed] | MIT | 3.10.5 | unknown | |
| [node] | ISC | 22.1.0 | unknown | |
| [ocspbuilder] | MIT | 0.10.2 | | |
| [ocspresponder] | Apache-2.0 | 0.5.0 | | |
| [PCRE2] | BSD-3-Clause, Public Domain | 10.40 | | ✗ |
| [Protobuf] | BSD-3-Clause | v4.25.0 | | ✗ |
| [pyiso8601] | MIT | 2.1.0 | unknown | |
| [RoaringBitmap/CRoaring] | Unknown License | v3.0.1 | | ✗ |
| [SchemaStore/schemastore] | Apache-2.0 | Unknown | | |
| [SCons - a Software Construction tool] | MIT | 3.1.2 | | ✗ |
| [smhasher] | Unknown License | Unknown | unknown | ✗ |
| [Snowball Stemming Algorithms] | BSD-3-Clause | 7b264ffa0f767c579d052fd8142558dc8264d795 | ✗ | ✗ |
| [subunit] | BSD-3-Clause, Apache-2.0 | 1.4.4 | unknown | |
| [tcmalloc] | Apache-2.0 | 20230227-snapshot-093ba93c | | ✗ |
| [testing-cabal/extras] | MIT | 0.0.3 | unknown | |
| [testscenarios] | BSD-3-Clause, Apache-2.0 | 0.4 | unknown | |
| [testtools] | MIT | 2.7.1 | unknown | |
| [unicode-data] | Unicode-DFS-2016 | 8.0 | ✗ | ✗ |
| [valgrind] | GPL-2.0-or-later | Unknown | | ✗ |
| [zlib] | Zlib | v1.3 | ✗ | ✗ |
| [zstd] | BSD-3-Clause, GPL-2.0-or-later | 1.5.5 | ✗ | ✗ |
[Abseil]: https://github.com/abseil/abseil-cpp
[Asio C++ Library]: https://github.com/chriskohlhoff/asio

View File

@ -1660,7 +1660,7 @@
"name": ""
},
"name": "Snowball Stemming Algorithms",
"version": "Unknown",
"version": "7b264ffa0f767c579d052fd8142558dc8264d795",
"licenses": [
{
"license": {
@ -1676,6 +1676,14 @@
{
"name": "info_link",
"value": "https://github.com/snowballstem/snowball"
},
{
"name": "emits_persisted_data",
"value": "true"
},
{
"name": "import_script_path",
"value": "src/third_party/libstemmer_c/scripts/import.sh"
}
],
"type": "library",

View File

@ -204,7 +204,7 @@ if not use_system_version_of_library('tomcrypt'):
if not use_system_version_of_library('stemmer'):
thirdPartyEnvironmentModifications['stemmer'] = {
'CPPPATH': ['#/src/third_party/libstemmer_c/include'],
'CPPPATH': ['#/src/third_party/libstemmer_c/dist/include'],
}
# Note that the wiredtiger.h header is generated, so

View File

@ -5,48 +5,48 @@ package(default_visibility = ["//visibility:public"])
mongo_cc_library(
name = "stemmer",
srcs = [
"libstemmer/libstemmer_utf8.c",
"libstemmer/modules.h",
"libstemmer/modules_utf8.h",
"runtime/api.c",
"runtime/api.h",
"runtime/header.h",
"src_c/stem_UTF_8_danish.c",
"src_c/stem_UTF_8_danish.h",
"src_c/stem_UTF_8_dutch.c",
"src_c/stem_UTF_8_dutch.h",
"src_c/stem_UTF_8_english.c",
"src_c/stem_UTF_8_english.h",
"src_c/stem_UTF_8_finnish.c",
"src_c/stem_UTF_8_finnish.h",
"src_c/stem_UTF_8_french.c",
"src_c/stem_UTF_8_french.h",
"src_c/stem_UTF_8_german.c",
"src_c/stem_UTF_8_german.h",
"src_c/stem_UTF_8_hungarian.c",
"src_c/stem_UTF_8_hungarian.h",
"src_c/stem_UTF_8_italian.c",
"src_c/stem_UTF_8_italian.h",
"src_c/stem_UTF_8_norwegian.c",
"src_c/stem_UTF_8_norwegian.h",
"src_c/stem_UTF_8_porter.c",
"src_c/stem_UTF_8_porter.h",
"src_c/stem_UTF_8_portuguese.c",
"src_c/stem_UTF_8_portuguese.h",
"src_c/stem_UTF_8_romanian.c",
"src_c/stem_UTF_8_romanian.h",
"src_c/stem_UTF_8_russian.c",
"src_c/stem_UTF_8_russian.h",
"src_c/stem_UTF_8_spanish.c",
"src_c/stem_UTF_8_spanish.h",
"src_c/stem_UTF_8_swedish.c",
"src_c/stem_UTF_8_swedish.h",
"src_c/stem_UTF_8_turkish.c",
"src_c/stem_UTF_8_turkish.h",
"dist/libstemmer/libstemmer_utf8.c",
"dist/libstemmer/modules.h",
"dist/libstemmer/modules_utf8.h",
"dist/runtime/api.c",
"dist/runtime/api.h",
"dist/runtime/header.h",
"dist/src_c/stem_UTF_8_danish.c",
"dist/src_c/stem_UTF_8_danish.h",
"dist/src_c/stem_UTF_8_dutch.c",
"dist/src_c/stem_UTF_8_dutch.h",
"dist/src_c/stem_UTF_8_english.c",
"dist/src_c/stem_UTF_8_english.h",
"dist/src_c/stem_UTF_8_finnish.c",
"dist/src_c/stem_UTF_8_finnish.h",
"dist/src_c/stem_UTF_8_french.c",
"dist/src_c/stem_UTF_8_french.h",
"dist/src_c/stem_UTF_8_german.c",
"dist/src_c/stem_UTF_8_german.h",
"dist/src_c/stem_UTF_8_hungarian.c",
"dist/src_c/stem_UTF_8_hungarian.h",
"dist/src_c/stem_UTF_8_italian.c",
"dist/src_c/stem_UTF_8_italian.h",
"dist/src_c/stem_UTF_8_norwegian.c",
"dist/src_c/stem_UTF_8_norwegian.h",
"dist/src_c/stem_UTF_8_porter.c",
"dist/src_c/stem_UTF_8_porter.h",
"dist/src_c/stem_UTF_8_portuguese.c",
"dist/src_c/stem_UTF_8_portuguese.h",
"dist/src_c/stem_UTF_8_romanian.c",
"dist/src_c/stem_UTF_8_romanian.h",
"dist/src_c/stem_UTF_8_russian.c",
"dist/src_c/stem_UTF_8_russian.h",
"dist/src_c/stem_UTF_8_spanish.c",
"dist/src_c/stem_UTF_8_spanish.h",
"dist/src_c/stem_UTF_8_swedish.c",
"dist/src_c/stem_UTF_8_swedish.h",
"dist/src_c/stem_UTF_8_turkish.c",
"dist/src_c/stem_UTF_8_turkish.h",
],
hdrs = [
"include/libstemmer.h",
"runtime/utilities.c",
"dist/include/libstemmer.h",
"dist/runtime/utilities.c",
],
copts = select({
"//bazel/config:gcc_or_clang": [
@ -56,6 +56,6 @@ mongo_cc_library(
}),
includes = [
# from https://github.com/10gen/mongo/blob/master/src/third_party/SConscript#L172-L175
"include",
"dist/include",
],
)

View File

@ -0,0 +1,29 @@
#!/bin/bash
# This script downloads and imports libstemmer_c.
#
# It clones the Snowball repository, checks out the pinned commit in VERSION,
# runs `make dist_libstemmer_c` to produce the libstemmer_c distribution
# tarball, and unpacks that tarball (minus its top-level directory) into
# src/third_party/libstemmer_c/dist of the enclosing git checkout.
#
# Preconditions:
#   * must be run from inside the git checkout that should receive the import;
#   * the destination directory must not already exist (remove it first);
#   * git, make, GNU find (for -regextype) and tar must be available.

set -euo pipefail
IFS=$'\n\t'

set -vx

NAME=libstemmer_c
VERSION="7b264ffa0f767c579d052fd8142558dc8264d795"

DEST_DIR="$(git rev-parse --show-toplevel)/src/third_party/$NAME/dist"
if [[ -d "$DEST_DIR" ]]; then
    echo "You must remove '$DEST_DIR' before running $0" >&2
    exit 1
fi

SNOWBALL_GIT_DIR="$(mktemp -d /tmp/import-snowball.XXXXXX)"
# Single quotes defer expansion until the trap fires, and the inner double
# quotes keep the path as a single word for rm.
trap 'rm -rf "$SNOWBALL_GIT_DIR"' EXIT

git clone git@github.com:snowballstem/snowball.git "$SNOWBALL_GIT_DIR"
git -C "$SNOWBALL_GIT_DIR" checkout "$VERSION"

# Build the distribution tarball inside the clone.
make -C "$SNOWBALL_GIT_DIR" dist_libstemmer_c

# Locate the tarball produced by the build. Only regular files are considered;
# both .tgz and .tar.gz are accepted.
ARCHIVE="$(find "$SNOWBALL_GIT_DIR" -regextype posix-extended -type f -regex '^.*\.(tgz|tar\.gz)$')"
if [[ -z "$ARCHIVE" ]]; then
    echo "No libstemmer_c archive (.tgz or .tar.gz) was found under '$SNOWBALL_GIT_DIR'" >&2
    exit 1
fi
if [[ "$ARCHIVE" == *$'\n'* ]]; then
    echo "Expected exactly one archive under '$SNOWBALL_GIT_DIR' but found several:" >&2
    echo "$ARCHIVE" >&2
    exit 1
fi

# Create the destination only now, after every step that can fail for
# environmental reasons (network, build) has succeeded. Otherwise a failed run
# would leave an empty directory behind and the next run would be rejected by
# the existence check above.
mkdir -p "$DEST_DIR"
tar --strip-components=1 -xvzf "$ARCHIVE" -C "$DEST_DIR"