LICENSE
README.md
pyproject.toml
src/datatrove/__init__.py
src/datatrove/data.py
src/datatrove/io.py
src/datatrove.egg-info/PKG-INFO
src/datatrove.egg-info/SOURCES.txt
src/datatrove.egg-info/dependency_links.txt
src/datatrove.egg-info/entry_points.txt
src/datatrove.egg-info/requires.txt
src/datatrove.egg-info/top_level.txt
src/datatrove/assets/banned_subwords.txt
src/datatrove/assets/banned_words.txt
src/datatrove/assets/soft_banned_words.txt
src/datatrove/assets/tokenizer_assignment.csv
src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz
src/datatrove/executor/__init__.py
src/datatrove/executor/base.py
src/datatrove/executor/local.py
src/datatrove/executor/ray.py
src/datatrove/executor/slurm.py
src/datatrove/pipeline/__init__.py
src/datatrove/pipeline/base.py
src/datatrove/pipeline/decont/__init__.py
src/datatrove/pipeline/decont/n_grams.py
src/datatrove/pipeline/dedup/__init__.py
src/datatrove/pipeline/dedup/bloom_filter.py
src/datatrove/pipeline/dedup/exact_dedup.py
src/datatrove/pipeline/dedup/exact_substrings.py
src/datatrove/pipeline/dedup/minhash.py
src/datatrove/pipeline/dedup/sentence_dedup.py
src/datatrove/pipeline/extractors/__init__.py
src/datatrove/pipeline/extractors/base.py
src/datatrove/pipeline/extractors/modular.py
src/datatrove/pipeline/extractors/trafilatura.py
src/datatrove/pipeline/filters/__init__.py
src/datatrove/pipeline/filters/base_filter.py
src/datatrove/pipeline/filters/c4_filters.py
src/datatrove/pipeline/filters/fasttext_filter.py
src/datatrove/pipeline/filters/fineweb_quality_filter.py
src/datatrove/pipeline/filters/gopher_quality_filter.py
src/datatrove/pipeline/filters/gopher_repetition_filter.py
src/datatrove/pipeline/filters/lambda_filter.py
src/datatrove/pipeline/filters/language_filter.py
src/datatrove/pipeline/filters/regex_filter.py
src/datatrove/pipeline/filters/sampler_filter.py
src/datatrove/pipeline/filters/unigram_log_probs.py
src/datatrove/pipeline/filters/url_filter.py
src/datatrove/pipeline/formatters/__init__.py
src/datatrove/pipeline/formatters/base.py
src/datatrove/pipeline/formatters/ftfy.py
src/datatrove/pipeline/formatters/pii.py
src/datatrove/pipeline/formatters/symbol_lines_remover.py
src/datatrove/pipeline/inference/__init__.py
src/datatrove/pipeline/inference/checkpointing.py
src/datatrove/pipeline/inference/dataset_card_generator.py
src/datatrove/pipeline/inference/dataset_card_template.md
src/datatrove/pipeline/inference/metrics.py
src/datatrove/pipeline/inference/progress_monitor.py
src/datatrove/pipeline/inference/run_inference.py
src/datatrove/pipeline/inference/types.py
src/datatrove/pipeline/inference/distributed/__init__.py
src/datatrove/pipeline/inference/distributed/ray.py
src/datatrove/pipeline/inference/distributed/utils.py
src/datatrove/pipeline/inference/servers/__init__.py
src/datatrove/pipeline/inference/servers/base.py
src/datatrove/pipeline/inference/servers/compile_lock.py
src/datatrove/pipeline/inference/servers/custom_server.py
src/datatrove/pipeline/inference/servers/dummy_server.py
src/datatrove/pipeline/inference/servers/endpoint_server.py
src/datatrove/pipeline/inference/servers/sglang_server.py
src/datatrove/pipeline/inference/servers/vllm_server.py
src/datatrove/pipeline/media/filters/base_filter.py
src/datatrove/pipeline/media/filters/mime_filter.py
src/datatrove/pipeline/media/media_readers/base.py
src/datatrove/pipeline/media/media_readers/warc.py
src/datatrove/pipeline/media/media_readers/zstd.py
src/datatrove/pipeline/media/media_writers/base.py
src/datatrove/pipeline/media/media_writers/zstd.py
src/datatrove/pipeline/media/readers/http_fetch.py
src/datatrove/pipeline/readers/__init__.py
src/datatrove/pipeline/readers/base.py
src/datatrove/pipeline/readers/csv.py
src/datatrove/pipeline/readers/huggingface.py
src/datatrove/pipeline/readers/ipc.py
src/datatrove/pipeline/readers/jsonl.py
src/datatrove/pipeline/readers/parquet.py
src/datatrove/pipeline/readers/warc.py
src/datatrove/pipeline/stats/__init__.py
src/datatrove/pipeline/stats/base.py
src/datatrove/pipeline/stats/config.py
src/datatrove/pipeline/stats/contamination_stats.py
src/datatrove/pipeline/stats/doc_stats.py
src/datatrove/pipeline/stats/lang_stats.py
src/datatrove/pipeline/stats/line_stats.py
src/datatrove/pipeline/stats/merger.py
src/datatrove/pipeline/stats/paragraph_stats.py
src/datatrove/pipeline/stats/perplexity_stats.py
src/datatrove/pipeline/stats/sentence_stats.py
src/datatrove/pipeline/stats/token_stats.py
src/datatrove/pipeline/stats/word_stats.py
src/datatrove/pipeline/tokens/__init__.py
src/datatrove/pipeline/tokens/context_shuffler.py
src/datatrove/pipeline/tokens/counter.py
src/datatrove/pipeline/tokens/megatron_tokenizer.py
src/datatrove/pipeline/tokens/merger.py
src/datatrove/pipeline/tokens/tokenizer.py
src/datatrove/pipeline/writers/__init__.py
src/datatrove/pipeline/writers/disk_base.py
src/datatrove/pipeline/writers/huggingface.py
src/datatrove/pipeline/writers/jsonl.py
src/datatrove/pipeline/writers/parquet.py
src/datatrove/tools/__init__.py
src/datatrove/tools/check_dataset.py
src/datatrove/tools/failed_logs.py
src/datatrove/tools/inspect_data.py
src/datatrove/tools/jobs_status.py
src/datatrove/tools/launch_pickled_pipeline.py
src/datatrove/tools/merge_stats.py
src/datatrove/tools/track_jobs.py
src/datatrove/utils/__init__.py
src/datatrove/utils/_import_utils.py
src/datatrove/utils/batching.py
src/datatrove/utils/binaryio.py
src/datatrove/utils/dataset.py
src/datatrove/utils/hashing.py
src/datatrove/utils/japanese_tokenizer.py
src/datatrove/utils/jobs.py
src/datatrove/utils/lid.py
src/datatrove/utils/logging.py
src/datatrove/utils/media.py
src/datatrove/utils/perplexity.py
src/datatrove/utils/stats.py
src/datatrove/utils/text.py
src/datatrove/utils/tokenization.py
src/datatrove/utils/typeshelper.py
src/datatrove/utils/word_tokenizers.py
src/datatrove/utils/hashes/sha1.py
src/datatrove/utils/hashes/xxhash.py
tests/test_io.py