# base dependency, listed ahead of the per-transform sections below
data-prep-toolkit>=0.2.1
# code quality
bs4==0.0.2
# the same transformers pin is repeated in the tokenization section; keep both in sync
transformers==4.38.2
# pdf2parquet
docling-core==1.3.0
# docling-ibm-models 1.1.7 requires lxml>=4.9.1,<5.0.0 (see the html2parquet note further down)
docling-ibm-models==1.1.7
deepsearch-glm==0.21.0
docling==1.11.0
filetype>=1.2.0,<2.0.0
# Doc chunking
# docling-core carries the same pin as in the pdf2parquet section
docling-core==1.3.0
llama-index-core>=0.11.0,<0.12.0
# filter
duckdb>=0.10.1
# langid
fasttext==0.9.2
langcodes==3.3.0
huggingface-hub>=0.21.4,<1.0.0
numpy==1.26.4
# fdedup
mmh3>=4.1.0
xxhash==3.4.1
tqdm==4.66.3
scipy>=1.12.0,<2.0.0
# ededup
# mmh3 and xxhash are listed with the same specifiers as in the fdedup section
mmh3>=4.1.0
xxhash==3.4.1
# code2parquet
# pandas is unconstrained here; the PII-redactor section requires pandas>=2.2.2
pandas
# no version constraint is given for parameterized
parameterized
# header cleanser
# the environment marker skips this requirement when platform_system is 'Darwin'
scancode-toolkit==32.1.0 ; platform_system != 'Darwin'
# text_encoder
sentence-transformers==3.0.1
# PII-redactor
presidio-analyzer>=2.2.355
presidio-anonymizer>=2.2.355
flair>=0.14.0
# pandas also appears without a version constraint in the code2parquet section
pandas>=2.2.2
# html2parquet
# trafilatura is left commented out because pip cannot resolve it together with
# docling-ibm-models. pip output when it is enabled:
#   INFO: pip is looking at multiple versions of trafilatura to determine which version is compatible with other requirements. This could take a while.
#   The conflict is caused by:
#       docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
#       trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
#trafilatura==1.12.0
# tokenization
# the same transformers pin also appears in the code quality section; keep both in sync
transformers==4.38.2



