Verified Commit c239494a authored by Aurélien Ginolhac

genrich per sample

parent 17bca5d3
Tag: v0.0.1
@@ -5,3 +5,30 @@
This [workflow](https://gitlab.lcsb.uni.lu/aurelien.ginolhac/snakemake-atac-seq) is derived from the [Snakemake template](https://github.com/snakemake-workflows/chipseq) (itself a port of the [nextflow chipseq pipeline](https://nf-co.re/chipseq)) and performs ATAC-seq peak-calling, QC and differential analysis.
### Install the ATAC-seq template
Change into a destination folder of your choice, creating it first if needed:
```bash
mkdir snakemake-atac-seq
cd snakemake-atac-seq
```
and run the following commands:
```bash
VERSION="v0.0.1"
wget -qO- https://gitlab.lcsb.uni.lu/aurelien.ginolhac/snakemake-atac-seq/-/archive/${VERSION}/snakemake-atac-seq-${VERSION}.tar.gz | tar xfz - --strip-components=1
```
This command downloads the archive and extracts (without the root folder) the following files:
```
config/
Dockerfile
README.md
workflow/
```
You may delete the `Dockerfile` and `README.md` if you wish; they are not used by `snakemake` at runtime.
sample       group
-RR1822153   A
-RR1822154   B
+mDAN_D30_1  mDAN_D30
+mDAN_D30_2  mDAN_D30
+mDAN_D30_3  mDAN_D30
+smNPC_1     smNPC
+smNPC_2     smNPC
+smNPC_3     smNPC
sample       unit  fq1                                           fq2                                           platform
-RR1822153   1     test-datasets/testdata/SRR1822153_1.fastq.gz  test-datasets/testdata/SRR1822153_2.fastq.gz  ILLUMINA
-RR1822154   1     test-datasets/testdata/SRR1822154_1.fastq.gz  test-datasets/testdata/SRR1822154_2.fastq.gz  ILLUMINA
+mDAN_D30_1  1     data/mDAN_D30_1_R1.fastq.gz                   data/mDAN_D30_1_R2.fastq.gz                   ILLUMINA
+mDAN_D30_2  1     data/mDAN_D30_2_R1.fastq.gz                   data/mDAN_D30_2_R2.fastq.gz                   ILLUMINA
+mDAN_D30_3  1     data/mDAN_D30_3_R1.fastq.gz                   data/mDAN_D30_3_R2.fastq.gz                   ILLUMINA
+smNPC_1     1     data/smNPC_1_R1.fastq.gz                      data/smNPC_1_R2.fastq.gz                      ILLUMINA
+smNPC_2     1     data/smNPC_2_R1.fastq.gz                      data/smNPC_2_R2.fastq.gz                      ILLUMINA
+smNPC_3     1     data/smNPC_3_R1.fastq.gz                      data/smNPC_3_R2.fastq.gz                      ILLUMINA
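Both sheets end up as pandas DataFrames inside the workflow. A minimal sketch of the loading step, assuming the conventions of the upstream chipseq template (the sheet paths and the `drop=False` indexing are assumptions, consistent with the console output shown further down):

```python
import pandas as pd

# Assumed loading logic, mirroring the upstream chipseq template:
# the sample sheet keeps "sample" both as index and as a column,
# units are indexed by the (sample, unit) pair.
samples = pd.read_csv("config/samples.tsv", sep="\t").set_index("sample", drop=False)
units = pd.read_csv("config/units.tsv", sep="\t", dtype=str).set_index(
    ["sample", "unit"], drop=False
)

print(samples["group"].unique())  # ['mDAN_D30' 'smNPC']
```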
@@ -17,8 +17,8 @@ include: "rules/qc.smk"
include: "rules/trimming.smk"
include: "rules/mapping.smk"
include: "rules/filtering.smk"
-include: "rules/post-analysis.smk"
include: "rules/peak_analysis.smk"
+include: "rules/post-analysis.smk"

rule all:
...
@@ -47,13 +47,13 @@ def is_single_end(sample, unit):
    )
    return fq2_present

def get_se_pe_branches_input(wildcards):
    if config["single_end"]:
        return "results/bamtools_filtered/{sample}.sorted.bam".format(sample=wildcards.sample)
    else:
        return "results/orph_rm_pe/{sample}.sorted.bam".format(sample=wildcards.sample)

def get_individual_fastq(wildcards):
    """Get individual raw FASTQ files from unit sheet, based on a read (end) wildcard"""
    if wildcards.read == "0" or wildcards.read == "1":
        ...
@@ -78,19 +78,26 @@ def get_samtools_view_filter_input(wildcards):
        build=build
    )]

def exists_multiple_groups(antibody):
    return len(samples[samples["antibody"] == antibody]["group"].unique()) > 1

def exists_replicates(antibody):
    return len(samples[samples["antibody"] == antibody]["sample"].unique()) > 1

def get_controls_of_antibody(antibody):
    groups = samples[samples["antibody"] == antibody]["group"]
    controls = samples[pd.isnull(samples["control"])]
    return controls[controls["group"].isin(list(groups))]["sample"]

def get_samples_of_antibody(antibody):
    return samples[samples["antibody"] == antibody]["sample"]

def get_sample_group_list(wildcards=None):
    # join each group's merged BAMs into one comma-separated string;
    # the Genrich rule calls this without arguments, hence the default
    return samples.groupby(by=["group"])['sample'].apply(
        lambda x: ','.join('results/merged/' + x + '.bam')
    ).tolist()
# Scratch notes: building the per-group BAM string with a groupby transform
# samples['string'] = samples.groupby(by=["group"]).transform(lambda x: ','.join('results/merged/' + x + '.bam'))
#                 sample     group                                             string
# sample
# mDAN_D30_1  mDAN_D30_1  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# mDAN_D30_2  mDAN_D30_2  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# mDAN_D30_3  mDAN_D30_3  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# smNPC_1     smNPC_1     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# smNPC_2     smNPC_2     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# smNPC_3     smNPC_3     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# samples[['group', 'string']].reset_index(drop=True).drop_duplicates()
#       group                                             string
# 0  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# 3     smNPC  results/merged/smNPC_1.bam,results/merged/smNP...
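The same join can be checked outside Snakemake; a self-contained sketch reproducing the per-group BAM lists from the sample sheet above:

```python
import pandas as pd

# Same sample sheet as above, built inline for a quick check.
samples = pd.DataFrame({
    "sample": ["mDAN_D30_1", "mDAN_D30_2", "mDAN_D30_3",
               "smNPC_1", "smNPC_2", "smNPC_3"],
    "group": ["mDAN_D30"] * 3 + ["smNPC"] * 3,
}).set_index("sample", drop=False)

# One comma-separated BAM list per group, exactly what
# get_sample_group_list() feeds to Genrich's -t option.
bams = (
    samples.groupby(by=["group"])["sample"]
           .apply(lambda x: ",".join("results/merged/" + x + ".bam"))
           .tolist()
)
print(bams)
# ['results/merged/mDAN_D30_1.bam,results/merged/mDAN_D30_2.bam,results/merged/mDAN_D30_3.bam',
#  'results/merged/smNPC_1.bam,results/merged/smNPC_2.bam,results/merged/smNPC_3.bam']
```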
def get_merged_bams(wildcards):
    return "results/merged/{sample}.bam".format(sample=wildcards.sample)

def get_map_reads_input(wildcards):
    if is_single_end(wildcards.sample, wildcards.unit):
        ...
@@ -170,6 +177,8 @@ def all_input(wildcards):
    for (sample, unit) in units.index:
        wanted_input.extend(
            expand(["results/genrich/{sample}.narrowPeak",
                    "results/big_wig/{sample}.bigWig"], sample = sample)
        )
    return wanted_input
@@ -3,10 +3,10 @@ rule merge_se_pe:
    input:
        get_se_pe_branches_input
    output:
-       "results/filtered/{sample}.sorted.bam"
+       "results/filtered/{group}_{replicate}.sorted.bam"
    params:
        ""
    log:
-       "logs/filtered/{sample}.sorted.log"
+       "logs/filtered/{group}_{replicate}.sorted.log"
    shell:
        "ln -sr {input} {output}"
-rule genrich_callpeak:
+rule genrich_callpeak_replicate:
    input:
-       bam = "results/merged/{sample}.bam",
+       bam = expand("results/merged/{sample.sample}.bam", sample = samples.itertuples()),
        blacklist = "resources/ref/blacklist.bed"
    output:
-       peaks = "results/genrich/{sample}.narrowPeak",
-       bedgraph = temp("results/genrich/{sample}.bg")
-   log:
-       "logs/genrich/callpeak.{sample}.log"
+       peaks = ["results/genrich/{gp}.narrowPeak".format(gp=group) for group in samples['group'].unique()],
+       bedgraph = ["results/genrich/{gp}.bg".format(gp=group) for group in samples['group'].unique()]
+   #log:
+   #    expand("logs/genrich/{group}.log", group = samples['group'].unique()),
    params:
+       replicates = ["{join}".format(join=i) for i in get_sample_group_list()],
        exclude = "chrM",
        extra = "-a 500 -g 15 -l 15 -d 50"
    shell:
        # -r for duplicate removal
        """
-       Genrich -t {input.bam} -o {output.peaks} \
+       Genrich -t {params.replicates} -o {output.peaks} \
            -k {output.bedgraph} -e {params.exclude} \
            -E {input.blacklist} -r \
            -m 30 -j {params.extra}
        """
-rule homer_annotatepeaks:
+rule genrich_callpeak:
    input:
-       peaks = "results/genrich/{sample}.narrowPeak",
-       genome = "resources/ref/genome.fasta",
-       gtf="resources/ref/annotation.gtf"
+       #bam = expand("results/merged/{sample}.bam", sample = units["sample"].tolist()),
+       bam = get_merged_bams,
+       blacklist = "resources/ref/blacklist.bed"
    output:
-       annotations="results/homer/annotate_peaks/{sample}_peaks.annotatePeaks.txt"
-   threads:
-       2
-   params:
-       mode="",
-       extra="-gid"
+       peaks = "results/genrich/{sample}.narrowPeak",
+       bedgraph = "results/genrich/{sample}.bg"
    log:
-       "logs/homer/annotate_peaks/{sample}.log"
-   wrapper:
-       "0.68.0/bio/homer/annotatePeaks"
+       "logs/genrich/{sample}.log"
+   threads: 4
+   params:
+       exclude = "chrM",
+       extra = "-a 500 -g 15 -l 15 -d 50"
+   shell:
+       # -r for duplicate removal
+       """
+       Genrich -t {input.bam} -o {output.peaks} \
+           -k {output.bedgraph} -e {params.exclude} \
+           -E {input.blacklist} -r \
+           -m 30 -j {params.extra}
+       """
rule collect_multiple_metrics:
    input:
        bam="results/filtered/{sample}.sorted.bam",
        ref="resources/ref/genome.fasta"
    output:
        # Through the output file extensions the different tools for the metrics can be selected
        # so that it is not necessary to specify them under params with the "PROGRAM" option.
        # Usable extensions (and which tools they implicitly call) are listed here:
        # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/picard/collectmultiplemetrics.html
        [
            "{path}{sample}.insert_size_metrics",
            report(
                "{path}{sample}.insert_size_histogram.pdf",
                caption="../report/plot_insert_size_histogram_picard_mm.rst",
                category="Multiple Metrics (picard)"
            )
        ] if not config["single_end"] else [],
        multiext("{path}{sample}",
                 ".alignment_summary_metrics",
                 ".base_distribution_by_cycle_metrics",
                 ".quality_by_cycle_metrics",
                 ".quality_distribution_metrics",
        ),
        report(
            "{path}{sample}.base_distribution_by_cycle.pdf",
            caption="../report/plot_base_distribution_by_cycle_picard_mm.rst",
            category="Multiple Metrics (picard)"),
        report(
            "{path}{sample}.quality_by_cycle.pdf",
            caption="../report/plot_quality_by_cycle_picard_mm.rst",
            category="Multiple Metrics (picard)"),
        report(
            "{path}{sample}.quality_distribution.pdf",
            caption="../report/plot_quality_distribution_picard_mm.rst",
            category="Multiple Metrics (picard)")
    resources:
        # This parameter (default 3 GB) can be used to limit the total resources a pipeline is allowed to use, see:
        # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources
        mem_gb=3
    log:
        "logs/picard/{path}{sample}.log"
    params:
        # optional parameters
        "{} ".format(config["params"]["collect-multiple-metrics"])
    wrapper:
        "0.64.0/bio/picard/collectmultiplemetrics"
rule genomecov:
    input:
        "results/filtered/{sample}.sorted.bam",
        flag_stats=expand("results/{step}/{{sample}}.sorted.{step}.flagstat",
                          step="bamtools_filtered" if config["single_end"]
                               else "orph_rm_pe"),
        stats=expand("results/{step}/{{sample}}.sorted.{step}.stats.txt",
                     step="bamtools_filtered" if config["single_end"]
                          else "orph_rm_pe"),
    output:
        pipe("results/bed_graph/{sample}.bedgraph")
    log:
        "logs/bed_graph/{sample}.log"
    params:
        lambda w, input:
            "-bg -scale $(grep 'mapped (' {flagstats_file} | awk '{{print 1000000/$1}}') {pe_fragment} {extend}".format(
                flagstats_file=input.flag_stats,
                pe_fragment="" if config["single_end"] else "-pc",
                # Estimated fragment size used to extend single-end reads
                extend="-fs $(grep ^SN {stats} | "
                       "cut -f 2- | "
                       "grep -m1 'average length:' | "
                       "awk '{{print $NF}}')".format(stats=input.stats)
                       if config["single_end"] else ""
            )
    wrapper:
        "0.64.0/bio/bedtools/genomecov"
rule sort_genomecov:
    input:
-       "results/genrich/{sample}.bg"
+       rules.genrich_callpeak.output.bedgraph
    output:
        "results/genrich/{sample}.sort.bg"
    log:
        ...