Hi!

I have a pipeline in the form of a snakemake.
Before the pipeline starts working, the snakemake checks and announces empty folders that do not yet contain files. These files will appear in the rule gatk_BaseRecalibrator step of the snakemake. I am currently running the code in several steps. First, I generate files that will fill the folders required in the next step (all_samples_BaseRecalibrator folder), and then I run a rule gatk_GatherBQSRReports that will use these files.
I understand that this approach is inconvenient and not correct. But I'm new to python and don't know how to change my code.

this is how a piece of my code looks like :

    import glob
    SAMPLES = glob_wildcards("input_bam/{sample}.bam")
    INTERVALS = glob_wildcards("../intervals/{interval}.bed")
    all_samples_BaseRecalibrator = glob.glob("out_BaseRecalibrator/*-scattered.txt")


    rule all:
        input: expand("out/{sample}.AddOrReplaceReadGroups.bam", sample = SAMPLES.sample), expand("out/{sample}.genome.covered.bed", sample=SAMPLES.sample), expand("out/{sample}.MarkDuplicates.bam", sample=SAMPLES.sample), expand("out/{sample}.MarkDuplicates.bam", sample = SAMPLES.sample),expand("out_BaseRecalibrator/{sample}.BaseRecalibrator.{interval}.txt", interval=INTERVALS.interval, sample=SAMPLES.sample), , expand("out_BaseRecalibrator/{sample}.GatherBQSRReports.txt", sample=SAMPLES.sample)

    rule gatk_AddOrReplaceReadGroups:
        input: "input_bam/{sample}.bam"
        output: "out/{sample}.AddOrReplaceReadGroups.bam"
        shell: "./../gatk-4.1.3.0/gatk --java-options ""-Xmx30g"" AddOrReplaceReadGroups -I {input}  -O {output} -ID group1 -SM TUMOR -PL illumina -LB lib1 -PU unit1"

    rule bedtools_genomecov:
        input: rules.gatk_AddOrReplaceReadGroups.output
        output: "out/{sample}.genome.covered.bed"
        shell: "bedtools genomecov -ibam  {input} -bg > {output}"

    rule gatk_MarkDuplicates:
        input: rules.gatk_AddOrReplaceReadGroups.output
        output: output1="out/{sample}.MarkDuplicates.bam", output2="out/{sample}.MarkDuplicates.txt"
        shell: "./../gatk-4.1.3.0/gatk --java-options ""-Xmx4g""  MarkDuplicates -I {input} -O {output.output1} -M {output.output2} --CREATE_INDEX true"

    rule gatk_BaseRecalibrator:
        input: input1="../intervals/{interval}.bed", input2="out/{sample}.MarkDuplicates.bam", input3='../ref/ref.fa', input4="../dbsnp/dbsnp_150.hg38.vcf.gz"
        output: "out_BaseRecalibrator/{sample}.BaseRecalibrator.{interval}.txt"
        shell: "./../gatk-4.1.3.0/gatk --java-options ""-Xmx7680m""  BaseRecalibrator -L {input.input1} -I {input.input2} -O {output} -R {input.input3} --known-sites {input.input4}"

rule gatk_GatherBQSRReports:
    input: all_samples_BaseRecalibrator
    output: "out_BaseRecalibrator/{sample}.GatherBQSRReports.txt"
    params: all_samples_BaseRecalibrator=" -I ".join(all_samples_BaseRecalibrator)
    shell: "./../gatk-4.1.3.0/gatk --java-options ""-Xmx7680m""  GatherBQSRReports  -I {params.all_samples_BaseRecalibrator}  -O {output}"



Source link