blob: 4da41cce075b00d3efe1c0dfa4038cfee2fe73ef (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
# Copyright 2023-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# Autogenerated by pycargoebuild 0.15.0
# Ebuild API level used by this ebuild.
EAPI=8
# distutils-r1 / python settings; assigned before the `inherit` line below.
DISTUTILS_USE_PEP517=maturin
PYTHON_COMPAT=( python3_{13..14} )
DISTUTILS_EXT=1
DISTUTILS_SINGLE_IMPL=1
# Minimum Rust version requested for the build.
RUST_MIN_VER="1.82.0"
# No individual crates are listed here.
# NOTE(review): presumably the crate sources come from the *-crates.tar.xz
# tarballs appended to SRC_URI further down -- confirm.
CRATES="
"
inherit cargo distutils-r1
DESCRIPTION="Implementation of today's most used tokenizers"
HOMEPAGE="https://github.com/huggingface/tokenizers"
# Upstream tag tarball (saved as ${P}.gh.tar.gz) followed by ${CARGO_CRATE_URIS}.
SRC_URI="
https://github.com/huggingface/${PN}/archive/refs/tags/v${PV}.tar.gz
-> ${P}.gh.tar.gz
${CARGO_CRATE_URIS}
"
# The two crate tarballs are appended unless PKGBUMPING equals this ${PVR}.
# NOTE(review): presumably this lets a version bump be prepared before the
# tarballs have been generated and uploaded -- confirm.
if [[ ${PKGBUMPING} != ${PVR} ]]; then
SRC_URI+="
https://dev.gentoo.org/~tupone/distfiles/${P}-crates.tar.xz
https://dev.gentoo.org/~tupone/distfiles/${PN}-python-${PV}-crates.tar.xz
"
fi
LICENSE="Apache-2.0"
# Dependent crate licenses
LICENSE+="
Apache-2.0 Apache-2.0-with-LLVM-exceptions BSD-2 BSD ISC MIT MPL-2.0
Unicode-DFS-2016
"
SLOT="0"
KEYWORDS="~amd64"
# Runtime dependency on oniguruma; src_compile sets RUSTONIG_SYSTEM_LIBONIG=1.
RDEPEND="dev-libs/oniguruma"
# Build-time dependencies: sci-ml/datasets only with USE=test, and
# setuptools-rust for the selected Python implementation.
BDEPEND="
test? ( sci-ml/datasets[${PYTHON_SINGLE_USEDEP}] )
$(python_gen_cond_dep '
dev-python/setuptools-rust[${PYTHON_USEDEP}]
')
"
# Empty pytest plugin list, set before the pytest test phase is enabled.
EPYTEST_PLUGINS=( )
distutils_enable_tests pytest
# Regex naming the .so files under site-packages/tokenizers that are
# excluded from the QA flags check.
QA_FLAGS_IGNORED=".*/site-packages/tokenizers/.*so"
# Unpack phase: delegate entirely to the cargo eclass.
src_unpack() {
	cargo_src_unpack
}
# Setup phase: run the python-single-r1 setup first, then the rust setup.
pkg_setup() {
	python-single-r1_pkg_setup
	rust_pkg_setup
}
# Prepare phase: run the default preparation from the top of the source
# tree, then patch and prepare the Python bindings in bindings/python.
src_prepare() {
	default

	# The patch and the distutils preparation must run inside the bindings
	# directory; abort instead of silently continuing in the wrong place.
	cd bindings/python || die
	eapply "${FILESDIR}"/${PN}-0.21.2-test.patch
	distutils-r1_src_prepare
}
# Configure phase: configure cargo from tokenizers/, then configure the
# Python bindings from bindings/python.
src_configure() {
	# Each step depends on the working directory, so a failed cd is fatal.
	cd tokenizers || die
	cargo_src_configure

	cd ../bindings/python || die
	distutils-r1_src_configure
}
# Compile phase: build with cargo in tokenizers/, then build the Python
# bindings in bindings/python.
src_compile() {
	# Exported so that both build steps below see it in their environment.
	export RUSTONIG_SYSTEM_LIBONIG=1

	# Each step depends on the working directory, so a failed cd is fatal.
	cd tokenizers || die
	cargo_src_compile

	cd ../bindings/python || die
	distutils-r1_src_compile
}
# Test phase: run the pytest suite of the Python bindings.
#
# cargo_src_test is deliberately not run for tokenizers/: those tests do
# not work.  One benchmark file is ignored and the tests listed in
# EPYTEST_DESELECT are deselected for the pytest run.
src_test() {
	# Nothing is executed in tokenizers/, so enter the bindings directory
	# directly and abort if it is missing.
	cd bindings/python || die

	# Plain `local` rather than `local -x`: bash cannot export arrays, and
	# these are only read by the shell functions called below.
	local EPYTEST_IGNORE=( benches/test_tiktoken.py )
	local EPYTEST_DESELECT=(
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_token
		tests/bindings/test_encoding.py::TestEncoding::test_char_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction
		tests/bindings/test_encoding.py::TestEncoding::test_n_sequences
		tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence
		tests/bindings/test_encoding.py::TestEncoding::test_token_to_word
		tests/bindings/test_encoding.py::TestEncoding::test_truncation
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars
		tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens
		tests/bindings/test_models.py::TestBPE::test_instantiate
		tests/bindings/test_models.py::TestWordLevel::test_instantiate
		tests/bindings/test_models.py::TestWordPiece::test_instantiate
		tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_async_methods_existence
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_basic_encoding
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_concurrency
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_decode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_encode
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_error_handling
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_large_batch
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_numpy_inputs
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_various_input_formats
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_performance_comparison
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_special_tokens
		tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_truncation_padding
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats
		tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_skip_special_tokens
		tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_stream_fallback
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained
		tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision
		tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting
		tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch
		tests/bindings/test_trainers.py::TestUnigram::test_train
		tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer
		tests/documentation/test_pipeline.py::TestPipeline::test_bert_example
		tests/documentation/test_pipeline.py::TestPipeline::test_pipeline
		tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets
		tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace
		tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode
		tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase
		tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism
		tests/test_serialization.py::TestSerialization::test_full_serialization_albert
		tests/test_serialization.py::TestSerialization::test_str_big
	)

	distutils-r1_src_test
}
# Install phase: only the Python bindings are installed; no install step
# is run from tokenizers/.
src_install() {
	# Go straight to the bindings directory (the former hop through
	# tokenizers/ did no work) and abort if it is missing.
	cd bindings/python || die
	distutils-r1_src_install
}
|