LICENSE
README.md
setup.cfg
setup.py
data_juicer/__init__.py
data_juicer/analysis/__init__.py
data_juicer/analysis/collector.py
data_juicer/analysis/column_wise_analysis.py
data_juicer/analysis/diversity_analysis.py
data_juicer/analysis/draw.py
data_juicer/analysis/measure.py
data_juicer/analysis/overall_analysis.py
data_juicer/config/__init__.py
data_juicer/config/config.py
data_juicer/core/__init__.py
data_juicer/core/adapter.py
data_juicer/core/analyzer.py
data_juicer/core/data.py
data_juicer/core/executor.py
data_juicer/core/exporter.py
data_juicer/core/monitor.py
data_juicer/core/ray_data.py
data_juicer/core/ray_executor.py
data_juicer/core/tracer.py
data_juicer/format/__init__.py
data_juicer/format/csv_formatter.py
data_juicer/format/empty_formatter.py
data_juicer/format/formatter.py
data_juicer/format/json_formatter.py
data_juicer/format/load.py
data_juicer/format/mixture_formatter.py
data_juicer/format/parquet_formatter.py
data_juicer/format/text_formatter.py
data_juicer/format/tsv_formatter.py
data_juicer/ops/__init__.py
data_juicer/ops/base_op.py
data_juicer/ops/load.py
data_juicer/ops/op_fusion.py
data_juicer/ops/aggregator/__init__.py
data_juicer/ops/aggregator/entity_attribute_aggregator.py
data_juicer/ops/aggregator/meta_tags_aggregator.py
data_juicer/ops/aggregator/most_relevant_entities_aggregator.py
data_juicer/ops/aggregator/nested_aggregator.py
data_juicer/ops/common/__init__.py
data_juicer/ops/common/helper_func.py
data_juicer/ops/common/prompt2prompt_pipeline.py
data_juicer/ops/common/special_characters.py
data_juicer/ops/deduplicator/__init__.py
data_juicer/ops/deduplicator/document_deduplicator.py
data_juicer/ops/deduplicator/document_minhash_deduplicator.py
data_juicer/ops/deduplicator/document_simhash_deduplicator.py
data_juicer/ops/deduplicator/image_deduplicator.py
data_juicer/ops/deduplicator/ray_basic_deduplicator.py
data_juicer/ops/deduplicator/ray_bts_minhash_deduplicator.py
data_juicer/ops/deduplicator/ray_document_deduplicator.py
data_juicer/ops/deduplicator/ray_image_deduplicator.py
data_juicer/ops/deduplicator/ray_video_deduplicator.py
data_juicer/ops/deduplicator/video_deduplicator.py
data_juicer/ops/filter/__init__.py
data_juicer/ops/filter/alphanumeric_filter.py
data_juicer/ops/filter/audio_duration_filter.py
data_juicer/ops/filter/audio_nmf_snr_filter.py
data_juicer/ops/filter/audio_size_filter.py
data_juicer/ops/filter/average_line_length_filter.py
data_juicer/ops/filter/character_repetition_filter.py
data_juicer/ops/filter/flagged_words_filter.py
data_juicer/ops/filter/image_aesthetics_filter.py
data_juicer/ops/filter/image_aspect_ratio_filter.py
data_juicer/ops/filter/image_face_count_filter.py
data_juicer/ops/filter/image_face_ratio_filter.py
data_juicer/ops/filter/image_nsfw_filter.py
data_juicer/ops/filter/image_pair_similarity_filter.py
data_juicer/ops/filter/image_shape_filter.py
data_juicer/ops/filter/image_size_filter.py
data_juicer/ops/filter/image_text_matching_filter.py
data_juicer/ops/filter/image_text_similarity_filter.py
data_juicer/ops/filter/image_watermark_filter.py
data_juicer/ops/filter/language_id_score_filter.py
data_juicer/ops/filter/llm_difficulty_score_filter.py
data_juicer/ops/filter/llm_quality_score_filter.py
data_juicer/ops/filter/maximum_line_length_filter.py
data_juicer/ops/filter/perplexity_filter.py
data_juicer/ops/filter/phrase_grounding_recall_filter.py
data_juicer/ops/filter/special_characters_filter.py
data_juicer/ops/filter/specified_field_filter.py
data_juicer/ops/filter/specified_numeric_field_filter.py
data_juicer/ops/filter/stopwords_filter.py
data_juicer/ops/filter/suffix_filter.py
data_juicer/ops/filter/text_action_filter.py
data_juicer/ops/filter/text_entity_dependency_filter.py
data_juicer/ops/filter/text_length_filter.py
data_juicer/ops/filter/text_pair_similarity_filter.py
data_juicer/ops/filter/token_num_filter.py
data_juicer/ops/filter/video_aesthetics_filter.py
data_juicer/ops/filter/video_aspect_ratio_filter.py
data_juicer/ops/filter/video_duration_filter.py
data_juicer/ops/filter/video_frames_text_similarity_filter.py
data_juicer/ops/filter/video_motion_score_filter.py
data_juicer/ops/filter/video_motion_score_raft_filter.py
data_juicer/ops/filter/video_nsfw_filter.py
data_juicer/ops/filter/video_ocr_area_ratio_filter.py
data_juicer/ops/filter/video_resolution_filter.py
data_juicer/ops/filter/video_tagging_from_frames_filter.py
data_juicer/ops/filter/video_watermark_filter.py
data_juicer/ops/filter/word_repetition_filter.py
data_juicer/ops/filter/words_num_filter.py
data_juicer/ops/grouper/__init__.py
data_juicer/ops/grouper/key_value_grouper.py
data_juicer/ops/grouper/naive_grouper.py
data_juicer/ops/grouper/naive_reverse_grouper.py
data_juicer/ops/mapper/__init__.py
data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py
data_juicer/ops/mapper/calibrate_qa_mapper.py
data_juicer/ops/mapper/calibrate_query_mapper.py
data_juicer/ops/mapper/calibrate_response_mapper.py
data_juicer/ops/mapper/chinese_convert_mapper.py
data_juicer/ops/mapper/clean_copyright_mapper.py
data_juicer/ops/mapper/clean_email_mapper.py
data_juicer/ops/mapper/clean_html_mapper.py
data_juicer/ops/mapper/clean_ip_mapper.py
data_juicer/ops/mapper/clean_links_mapper.py
data_juicer/ops/mapper/dialog_intent_detection_mapper.py
data_juicer/ops/mapper/dialog_sentiment_detection_mapper.py
data_juicer/ops/mapper/dialog_sentiment_intensity_mapper.py
data_juicer/ops/mapper/dialog_topic_detection_mapper.py
data_juicer/ops/mapper/expand_macro_mapper.py
data_juicer/ops/mapper/extract_entity_attribute_mapper.py
data_juicer/ops/mapper/extract_entity_relation_mapper.py
data_juicer/ops/mapper/extract_event_mapper.py
data_juicer/ops/mapper/extract_keyword_mapper.py
data_juicer/ops/mapper/extract_nickname_mapper.py
data_juicer/ops/mapper/extract_support_text_mapper.py
data_juicer/ops/mapper/fix_unicode_mapper.py
data_juicer/ops/mapper/generate_qa_from_examples_mapper.py
data_juicer/ops/mapper/generate_qa_from_text_mapper.py
data_juicer/ops/mapper/image_blur_mapper.py
data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.py
data_juicer/ops/mapper/image_captioning_mapper.py
data_juicer/ops/mapper/image_diffusion_mapper.py
data_juicer/ops/mapper/image_face_blur_mapper.py
data_juicer/ops/mapper/image_remove_background_mapper.py
data_juicer/ops/mapper/image_segment_mapper.py
data_juicer/ops/mapper/image_tagging_mapper.py
data_juicer/ops/mapper/mllm_mapper.py
data_juicer/ops/mapper/nlpaug_en_mapper.py
data_juicer/ops/mapper/nlpcda_zh_mapper.py
data_juicer/ops/mapper/optimize_qa_mapper.py
data_juicer/ops/mapper/optimize_query_mapper.py
data_juicer/ops/mapper/optimize_response_mapper.py
data_juicer/ops/mapper/pair_preference_mapper.py
data_juicer/ops/mapper/punctuation_normalization_mapper.py
data_juicer/ops/mapper/python_file_mapper.py
data_juicer/ops/mapper/python_lambda_mapper.py
data_juicer/ops/mapper/query_intent_detection_mapper.py
data_juicer/ops/mapper/query_sentiment_detection_mapper.py
data_juicer/ops/mapper/query_topic_detection_mapper.py
data_juicer/ops/mapper/relation_identity_mapper.py
data_juicer/ops/mapper/remove_bibliography_mapper.py
data_juicer/ops/mapper/remove_comments_mapper.py
data_juicer/ops/mapper/remove_header_mapper.py
data_juicer/ops/mapper/remove_long_words_mapper.py
data_juicer/ops/mapper/remove_non_chinese_character_mapper.py
data_juicer/ops/mapper/remove_repeat_sentences_mapper.py
data_juicer/ops/mapper/remove_specific_chars_mapper.py
data_juicer/ops/mapper/remove_table_text_mapper.py
data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py
data_juicer/ops/mapper/replace_content_mapper.py
data_juicer/ops/mapper/sdxl_prompt2prompt_mapper.py
data_juicer/ops/mapper/sentence_augmentation_mapper.py
data_juicer/ops/mapper/sentence_split_mapper.py
data_juicer/ops/mapper/text_chunk_mapper.py
data_juicer/ops/mapper/video_captioning_from_audio_mapper.py
data_juicer/ops/mapper/video_captioning_from_frames_mapper.py
data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
data_juicer/ops/mapper/video_captioning_from_video_mapper.py
data_juicer/ops/mapper/video_extract_frames_mapper.py
data_juicer/ops/mapper/video_face_blur_mapper.py
data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py
data_juicer/ops/mapper/video_remove_watermark_mapper.py
data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py
data_juicer/ops/mapper/video_resize_resolution_mapper.py
data_juicer/ops/mapper/video_split_by_duration_mapper.py
data_juicer/ops/mapper/video_split_by_key_frame_mapper.py
data_juicer/ops/mapper/video_split_by_scene_mapper.py
data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
data_juicer/ops/mapper/whitespace_normalization_mapper.py
data_juicer/ops/selector/__init__.py
data_juicer/ops/selector/frequency_specified_field_selector.py
data_juicer/ops/selector/random_selector.py
data_juicer/ops/selector/range_specified_field_selector.py
data_juicer/ops/selector/tags_specified_field_selector.py
data_juicer/ops/selector/topk_specified_field_selector.py
data_juicer/utils/__init__.py
data_juicer/utils/asset_utils.py
data_juicer/utils/auto_install_mapping.py
data_juicer/utils/auto_install_utils.py
data_juicer/utils/availability_utils.py
data_juicer/utils/cache_utils.py
data_juicer/utils/ckpt_utils.py
data_juicer/utils/common_utils.py
data_juicer/utils/compress.py
data_juicer/utils/constant.py
data_juicer/utils/file_utils.py
data_juicer/utils/fingerprint_utils.py
data_juicer/utils/lazy_loader.py
data_juicer/utils/logger_utils.py
data_juicer/utils/mm_utils.py
data_juicer/utils/model_utils.py
data_juicer/utils/process_utils.py
data_juicer/utils/registry.py
data_juicer/utils/resource_utils.py
data_juicer/utils/unittest_utils.py
py_data_juicer.egg-info/PKG-INFO
py_data_juicer.egg-info/SOURCES.txt
py_data_juicer.egg-info/dependency_links.txt
py_data_juicer.egg-info/entry_points.txt
py_data_juicer.egg-info/requires.txt
py_data_juicer.egg-info/top_level.txt
tools/__init__.py
tools/analyze_data.py
tools/data_resplit.py
tools/dj_install.py
tools/process_data.py
tools/sandbox_starter.py