Verified Commit c239494a authored by Aurélien Ginolhac

genrich per sample

parent 17bca5d3
Tag: v0.0.1
@@ -5,3 +5,30 @@
This [workflow](https://gitlab.lcsb.uni.lu/aurelien.ginolhac/snakemake-atac-seq) is derived from the [Snakemake template](https://github.com/snakemake-workflows/chipseq) (itself a port of the [nextflow chipseq pipeline](https://nf-co.re/chipseq)) and performs ATAC-seq peak-calling, QC and differential analysis.
### Install the ATAC-seq template
Change into a destination folder of your choice, creating it first if needed:
```bash
mkdir snakemake-atac-seq
cd snakemake-atac-seq
```
and run the following commands:
```bash
VERSION="v0.0.1"
wget -qO- https://gitlab.lcsb.uni.lu/aurelien.ginolhac/snakemake-atac-seq/-/archive/${VERSION}/snakemake-atac-seq-${VERSION}.tar.gz | tar xfz - --strip-components=1
```
This command downloads the archive and extracts (without the root folder) the following files:
```
config/
Dockerfile
README.md
workflow/
```
You may delete the `Dockerfile` and `README.md` if you wish; they are not used by `snakemake` at runtime.
sample       group
-RR1822153   A
-RR1822154   B
+mDAN_D30_1  mDAN_D30
+mDAN_D30_2  mDAN_D30
+mDAN_D30_3  mDAN_D30
+smNPC_1     smNPC
+smNPC_2     smNPC
+smNPC_3     smNPC
sample       unit  fq1                                           fq2                                           platform
-RR1822153   1     test-datasets/testdata/SRR1822153_1.fastq.gz  test-datasets/testdata/SRR1822153_2.fastq.gz  ILLUMINA
-RR1822154   1     test-datasets/testdata/SRR1822154_1.fastq.gz  test-datasets/testdata/SRR1822154_2.fastq.gz  ILLUMINA
+mDAN_D30_1  1     data/mDAN_D30_1_R1.fastq.gz                   data/mDAN_D30_1_R2.fastq.gz                   ILLUMINA
+mDAN_D30_2  1     data/mDAN_D30_2_R1.fastq.gz                   data/mDAN_D30_2_R2.fastq.gz                   ILLUMINA
+mDAN_D30_3  1     data/mDAN_D30_3_R1.fastq.gz                   data/mDAN_D30_3_R2.fastq.gz                   ILLUMINA
+smNPC_1     1     data/smNPC_1_R1.fastq.gz                      data/smNPC_1_R2.fastq.gz                      ILLUMINA
+smNPC_2     1     data/smNPC_2_R1.fastq.gz                      data/smNPC_2_R2.fastq.gz                      ILLUMINA
+smNPC_3     1     data/smNPC_3_R1.fastq.gz                      data/smNPC_3_R2.fastq.gz                      ILLUMINA
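Both sheets end up as pandas DataFrames inside the workflow. A minimal sketch of the loading step, assuming the conventions of the upstream chipseq template (the sheet paths and the `drop=False` indexing are assumptions, consistent with the console output shown further down):

```python
import pandas as pd

# Assumed loading logic, mirroring the upstream chipseq template:
# the sample sheet keeps "sample" both as index and as a column,
# units are indexed by the (sample, unit) pair.
samples = pd.read_csv("config/samples.tsv", sep="\t").set_index("sample", drop=False)
units = pd.read_csv("config/units.tsv", sep="\t", dtype=str).set_index(
    ["sample", "unit"], drop=False
)

print(samples["group"].unique())  # ['mDAN_D30' 'smNPC']
```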
@@ -17,8 +17,8 @@ include: "rules/qc.smk"
include: "rules/trimming.smk"
include: "rules/mapping.smk"
include: "rules/filtering.smk"
-include: "rules/post-analysis.smk"
include: "rules/peak_analysis.smk"
+include: "rules/post-analysis.smk"

rule all:
...
@@ -47,13 +47,13 @@ def is_single_end(sample, unit):
    )
    return fq2_present

def get_se_pe_branches_input(wildcards):
    if config["single_end"]:
        return "results/bamtools_filtered/{sample}.sorted.bam".format(sample=wildcards.sample)
    else:
        return "results/orph_rm_pe/{sample}.sorted.bam".format(sample=wildcards.sample)

def get_individual_fastq(wildcards):
    """Get individual raw FASTQ files from unit sheet, based on a read (end) wildcard"""
    if wildcards.read == "0" or wildcards.read == "1":
        ...
@@ -78,19 +78,26 @@ def get_samtools_view_filter_input(wildcards):
        build=build
    )]

def exists_multiple_groups(antibody):
    return len(samples[samples["antibody"] == antibody]["group"].unique()) > 1

def exists_replicates(antibody):
    return len(samples[samples["antibody"] == antibody]["sample"].unique()) > 1

def get_controls_of_antibody(antibody):
    groups = samples[samples["antibody"] == antibody]["group"]
    controls = samples[pd.isnull(samples["control"])]
    return controls[controls["group"].isin(list(groups))]["sample"]

def get_samples_of_antibody(antibody):
    return samples[samples["antibody"] == antibody]["sample"]

def get_sample_group_list(wildcards=None):
    # join each group's merged BAMs into one comma-separated string;
    # the Genrich rule calls this without arguments, hence the default
    return samples.groupby(by=["group"])['sample'].apply(
        lambda x: ','.join('results/merged/' + x + '.bam')
    ).tolist()
# Scratch notes: building the per-group BAM string with a groupby transform
# samples['string'] = samples.groupby(by=["group"]).transform(lambda x: ','.join('results/merged/' + x + '.bam'))
#                 sample     group                                             string
# sample
# mDAN_D30_1  mDAN_D30_1  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# mDAN_D30_2  mDAN_D30_2  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# mDAN_D30_3  mDAN_D30_3  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# smNPC_1     smNPC_1     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# smNPC_2     smNPC_2     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# smNPC_3     smNPC_3     smNPC     results/merged/smNPC_1.bam,results/merged/smNP...
# samples[['group', 'string']].reset_index(drop=True).drop_duplicates()
#       group                                             string
# 0  mDAN_D30  results/merged/mDAN_D30_1.bam,results/merged/m...
# 3     smNPC  results/merged/smNPC_1.bam,results/merged/smNP...
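The same join can be checked outside Snakemake; a self-contained sketch reproducing the per-group BAM lists from the sample sheet above:

```python
import pandas as pd

# Same sample sheet as above, built inline for a quick check.
samples = pd.DataFrame({
    "sample": ["mDAN_D30_1", "mDAN_D30_2", "mDAN_D30_3",
               "smNPC_1", "smNPC_2", "smNPC_3"],
    "group": ["mDAN_D30"] * 3 + ["smNPC"] * 3,
}).set_index("sample", drop=False)

# One comma-separated BAM list per group, exactly what
# get_sample_group_list() feeds to Genrich's -t option.
bams = (
    samples.groupby(by=["group"])["sample"]
           .apply(lambda x: ",".join("results/merged/" + x + ".bam"))
           .tolist()
)
print(bams)
# ['results/merged/mDAN_D30_1.bam,results/merged/mDAN_D30_2.bam,results/merged/mDAN_D30_3.bam',
#  'results/merged/smNPC_1.bam,results/merged/smNPC_2.bam,results/merged/smNPC_3.bam']
```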
def get_merged_bams(wildcards):
    return "results/merged/{sample}.bam".format(sample=wildcards.sample)

def get_map_reads_input(wildcards):
    if is_single_end(wildcards.sample, wildcards.unit):
        ...
@@ -170,6 +177,8 @@ def all_input(wildcards):
    for (sample, unit) in units.index:
        wanted_input.extend(
            expand(["results/genrich/{sample}.narrowPeak",
                    "results/big_wig/{sample}.bigWig"], sample = sample)
        )
    return wanted_input
@@ -3,10 +3,10 @@ rule merge_se_pe:
    input:
        get_se_pe_branches_input
    output:
-       "results/filtered/{sample}.sorted.bam"
+       "results/filtered/{group}_{replicate}.sorted.bam"
    params:
        ""
    log:
-       "logs/filtered/{sample}.sorted.log"
+       "logs/filtered/{group}_{replicate}.sorted.log"
    shell:
        "ln -sr {input} {output}"
-rule genrich_callpeak:
+rule genrich_callpeak_replicate:
    input:
-       bam = "results/merged/{sample}.bam",
+       bam = expand("results/merged/{sample.sample}.bam", sample = samples.itertuples()),
        blacklist = "resources/ref/blacklist.bed"
    output:
-       peaks = "results/genrich/{sample}.narrowPeak",
-       bedgraph = temp("results/genrich/{sample}.bg")
-   log:
-       "logs/genrich/callpeak.{sample}.log"
+       peaks = ["results/genrich/{gp}.narrowPeak".format(gp=group) for group in samples['group'].unique()],
+       bedgraph = ["results/genrich/{gp}.bg".format(gp=group) for group in samples['group'].unique()]
+   #log:
+   #    expand("logs/genrich/{group}.log", group = samples['group'].unique()),
    params:
+       replicates = ["{join}".format(join=i) for i in get_sample_group_list()],
        exclude = "chrM",
        extra = "-a 500 -g 15 -l 15 -d 50"
    shell:
        # -r for duplicate removal
        """
-       Genrich -t {input.bam} -o {output.peaks} \
+       Genrich -t {params.replicates} -o {output.peaks} \
            -k {output.bedgraph} -e {params.exclude} \
            -E {input.blacklist} -r \
            -m 30 -j {params.extra}
        """
-rule homer_annotatepeaks:
+rule genrich_callpeak:
    input:
-       peaks = "results/genrich/{sample}.narrowPeak",
-       genome = "resources/ref/genome.fasta",
-       gtf="resources/ref/annotation.gtf"
+       #bam = expand("results/merged/{sample}.bam", sample = units["sample"].tolist()),
+       bam = get_merged_bams,
+       blacklist = "resources/ref/blacklist.bed"
    output:
-       annotations="results/homer/annotate_peaks/{sample}_peaks.annotatePeaks.txt"
-   threads:
-       2
-   params:
-       mode="",
-       extra="-gid"
+       peaks = "results/genrich/{sample}.narrowPeak",
+       bedgraph = "results/genrich/{sample}.bg"
    log:
-       "logs/homer/annotate_peaks/{sample}.log"
-   wrapper:
-       "0.68.0/bio/homer/annotatePeaks"
+       "logs/genrich/{sample}.log"
+   threads: 4
+   params:
+       exclude = "chrM",
+       extra = "-a 500 -g 15 -l 15 -d 50"
+   shell:
+       # -r for duplicate removal
+       """
+       Genrich -t {input.bam} -o {output.peaks} \
+           -k {output.bedgraph} -e {params.exclude} \
+           -E {input.blacklist} -r \
+           -m 30 -j {params.extra}
+       """
rule collect_multiple_metrics:
    input:
        bam="results/filtered/{sample}.sorted.bam",
        ref="resources/ref/genome.fasta"
    output:
        # Through the output file extensions the different tools for the metrics can be selected
        # so that it is not necessary to specify them under params with the "PROGRAM" option.
        # Usable extensions (and which tools they implicitly call) are listed here:
        # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/picard/collectmultiplemetrics.html
        [
            "{path}{sample}.insert_size_metrics",
            report(
                "{path}{sample}.insert_size_histogram.pdf",
                caption="../report/plot_insert_size_histogram_picard_mm.rst",
                category="Multiple Metrics (picard)"
            )
        ] if not config["single_end"] else [],
        multiext("{path}{sample}",
                 ".alignment_summary_metrics",
                 ".base_distribution_by_cycle_metrics",
                 ".quality_by_cycle_metrics",
                 ".quality_distribution_metrics",
        ),
        report(
            "{path}{sample}.base_distribution_by_cycle.pdf",
            caption="../report/plot_base_distribution_by_cycle_picard_mm.rst",
            category="Multiple Metrics (picard)"),
        report(
            "{path}{sample}.quality_by_cycle.pdf",
            caption="../report/plot_quality_by_cycle_picard_mm.rst",
            category="Multiple Metrics (picard)"),
        report(
            "{path}{sample}.quality_distribution.pdf",
            caption="../report/plot_quality_distribution_picard_mm.rst",
            category="Multiple Metrics (picard)")
    resources:
        # This parameter (default 3 GB) can be used to limit the total resources a pipeline is allowed to use, see:
        # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources
        mem_gb=3
    log:
        "logs/picard/{path}{sample}.log"
    params:
        # optional parameters
        "{} ".format(config["params"]["collect-multiple-metrics"])
    wrapper:
        "0.64.0/bio/picard/collectmultiplemetrics"
rule genomecov:
    input:
        "results/filtered/{sample}.sorted.bam",
        flag_stats=expand("results/{step}/{{sample}}.sorted.{step}.flagstat",
                          step="bamtools_filtered" if config["single_end"]
                               else "orph_rm_pe"),
        stats=expand("results/{step}/{{sample}}.sorted.{step}.stats.txt",
                     step="bamtools_filtered" if config["single_end"]
                          else "orph_rm_pe"),
    output:
        pipe("results/bed_graph/{sample}.bedgraph")
    log:
        "logs/bed_graph/{sample}.log"
    params:
        lambda w, input:
            "-bg -scale $(grep 'mapped (' {flagstats_file} | awk '{{print 1000000/$1}}') {pe_fragment} {extend}".format(
                flagstats_file=input.flag_stats,
                pe_fragment="" if config["single_end"] else "-pc",
                # Estimated fragment size used to extend single-end reads
                extend="-fs $(grep ^SN {stats} | "
                       "cut -f 2- | "
                       "grep -m1 'average length:' | "
                       "awk '{{print $NF}}')".format(stats=input.stats)
                       if config["single_end"] else ""
            )
    wrapper:
        "0.64.0/bio/bedtools/genomecov"
rule sort_genomecov:
    input:
-       "results/genrich/{sample}.bg"
+       rules.genrich_callpeak.output.bedgraph
    output:
        "results/genrich/{sample}.sort.bg"
    log:
        ...