from os import listdir
from os.path import isfile, splitext

rule all:
    input:
        "mapping/stats/mapping_stats.xlsx",
        expand("mapping/{A}.bam", A=sorted(config['entries'].keys())),
        ".report/modules/segemehl.html",


rule mapping_stats_xlsx:
    input:
        "mapping/stats/mapping_stats.tsv"
    output:
        "mapping/stats/mapping_stats.xlsx"
    conda:
        "../lib/conda_env.yaml"
    group:
        "segemehl_statistics"
    shell:
        "python3 lib/pe_mapping_stats_tsv_to_xlsx.py {input} {output}"


rule mapping_stats_tsv:
    input:
        expand("mapping/logs/segemehl_mapping.{name}.log", name=config['entries'].keys())
    output:
        tsv="mapping/stats/mapping_stats.tsv",
    group:
        "segemehl_statistics"
    run:
        lines = ["Sample\tTotal_Reads\tMapped_Reads\tMapped_Reads[%]\tUniquely_Mapped_Reads\tUniquely_Mapped_Reads[%]\tMulti_Mapped_Reads\tMulti_Mapped_Reads[%]\tSplit_Mapped_Reads\tSplit_Mapped_Reads[%]\tTotal_Pairs\tMapped_Pairs\tMapped_Pairs[%]\tUniquely_Mapped_Pairs\tUniquely_Mapped_Pairs[%]\tMulti_Mapped_Pairs\tMulti_Mapped_Pairs[%]\tSplit_Mapped_Pairs\tSplit_Mapped_Pairs[%]"]
        with open(output.tsv, "w") as summary_file:
          for file in input:
            name = file.rstrip(".log").lstrip("mapping/logs/segemehl_mapping.")
            with open(file, 'r') as stats:
              stat_row = [name]
              for line in stats:
                if line.startswith("all") or line.startswith("pair"):
                  stat_row.extend([entry.strip("%") for entry in line.strip().split("\t")[1:]])
            lines.append("\t".join(stat_row))
          summary_file.writelines([l + "\n" for l in lines])


rule segemehl_index:
    input:
        genome="%%GENOME_FASTA%%"
    output:
        splitext("%%GENOME_FASTA%%")[0] + ".idx"
    conda:
        "../lib/conda_env.yaml"
    group:
        "segemehl_index"
    log:
        "mapping/logs/segemehl_index.log"
    shell:
        "segemehl.x -x {output} -d {input.genome} 2>&1 |"
        "tee {log}"


rule segemehl_mapping:
    input:
        genome="%%GENOME_FASTA%%",
        genome_index=splitext("%%GENOME_FASTA%%")[0] + ".idx",
        reads="preprocessing/{sample}_R1.fastq.gz",
	    reads_reverse="preprocessing/{sample}_R2.fastq.gz"
    output:
        temp("mapping/sam/{sample}.sam")
    conda:
        "../lib/conda_env.yaml"
    group:
        "segemehl_mapping"
    log:
        "mapping/logs/segemehl_mapping.{sample}.log"
    threads:
        2
    shell:
        "segemehl.x %%ADDITIONAL_SEGEMEHL_OPTIONS%% --accuracy %%ACCURACY%% -t {threads} -o {output} -i {input.genome_index} -d {input.genome} -q {input.reads} -p {input.reads_reverse} 2>&1 |"
        "tee {log}"


rule index_bam:
    input:
        "mapping/sam/{sample}.sam"
    output:
        bam="mapping/{sample}.bam",
        csi="mapping/{sample}.bam.csi",
        bam_unmapped="mapping/unmapped/{sample}_unmapped.bam",
        bam_singleton="mapping/singleton/{sample}_singletons.bam",
        bam_disconc="mapping/disconcordantly/{sample}_disconc.bam"
    params:
        tmp_singleton=temp("mapping/singleton/{sample}_tmp.sam")
    conda:
        "../lib/conda_env.yaml"
    group:
        "segemehl_mapping"
    threads:
        2
    shell:
        "samtools view -F 12 -f 2 -Shb {input} | samtools sort -@ {threads} -o {output.bam} - && samtools index -c {output.bam};"

        "samtools view -f 4 -F 8 -Sh {input} > {params.tmp_singleton} && samtools view -f 8 -F 4 -S {input} >> {params.tmp_singleton} "
        "&& samtools view -Shb {params.tmp_singleton} | samtools sort -@ {threads} -o {output.bam_singleton} - && rm {params.tmp_singleton};"

        "samtools view -F 14 -Shb {input} | samtools sort -@ {threads} -o {output.bam_disconc} -;"

        "samtools view -f 12 -Shb {input} | samtools sort -@ {threads} -o {output.bam_unmapped} -;"


rule write_settings:
    output:
        settings="mapping/settings.yaml"
    conda:
        "../lib/conda_env.yaml"
    group:
        "bwa_report"
    shell:
        """
        set +e
        segemehl_help=$(segemehl.x 2>&1);
        segemehl_version=$(echo "$segemehl_help" | grep -A1 "\[VERSION\\]" | tail -n1 | awk '{{$1=$1}};1');
        echo "segemehl_version: \\"$segemehl_version\\"" > {output.settings};
        echo 'segemehl_accuracy: "%%ACCURACY%%"' >> {output.settings}; 
        echo 'additional_segemehl_parameters: "%%ADDITIONAL_SEGEMEHL_OPTIONS%%"' >> {output.settings}; 
        """


rule generate_report_data:
    input:
        stats_tsv="mapping/stats/mapping_stats.tsv",
        settings="mapping/settings.yaml"
    output:
        segemehl_data=".report/data/segemehl_data.js",
        segemehl_html=".report/modules/segemehl.html",
        segemehl_js=".report/js/modules/segemehl.js",
        segemehl_css=".report/css/modules/segemehl.css",
    conda:
        "../lib/conda_env.yaml"
    group:
        "segemehl_report"
    shell:
        "python3 lib/generate_report_data.py --stats {input.stats_tsv} --output {output.segemehl_data} --settings {input.settings} --paired-end && "
        "cp lib/report/segemehl.html {output.segemehl_html} && "
        "cp lib/report/segemehl.js {output.segemehl_js} && "
        "cp lib/report/segemehl.css {output.segemehl_css}"
