nascONT-seq-pipeline/Snakefile at e9ffd2640eaff076b302a92779d396b29717b243 · MayerGroup/nascONT-seq-pipeline

1

import os

2

import shutil

3

4

#GLOBAL VALUES

5

sampleID = config["sampleID"]

6

IDS, = glob_wildcards("RawData/"+sampleID+"/{id}.fast5")

7

WORKDIR = os.getcwd()

8

9

rule all:

10

input:

11

expand("Fastq/{sample}.fastq", sample=sampleID),

12

expand("QualityCheck/{sample}/summary.yaml", sample=sampleID),

13

sampleID + "_0.1.pdf"

14

15

################################################################################

16

################################################################################

17

######################################## Basecalling

18

# basecalling using GUPPY

19

rule Basecalling:

20

input:

21

"RawData/{sample}/{id}.fast5"

22

output:

23

"QC/{sample}/{id}.txt",

24

"Fast5/{sample}/{id}/{id}.fast5",

25

"Basecalled/{sample}/{id}.fastq"

26

resources:

27

memory = 10,

28

time = 5

29

params:

30

guppy = config["guppy"],

31

flowcell = config["flowcell"],

32

kit = config["kit"],

33

work = WORKDIR

34

priority: 10

35

group: "basecalling"

36

threads: config["threads"]

37

shell:

38

"""

39

mkdir -p Fast5/{wildcards.sample}/{wildcards.id}

40

41

		ln -sf {params.work}/{input} {params.work}/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5

42

43

		{params.guppy} -i Fast5/{wildcards.sample}/{wildcards.id}/ -s Basecalled/{wildcards.sample}/{wildcards.id}/ \

44

		--flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out

45

46

		mv -f Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5

47

48

		mv Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt QC/{wildcards.sample}/{wildcards.id}.txt

49

50

		find Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec mv {{}} Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;

51

"""

52

53

# aggregation of fastq files

54

rule AggregateBasecalling:

55

input:

56

f5=lambda wildcards: expand("Basecalled/{{sample}}/{id}.fastq",id=IDS),

57

txt=lambda wildcards: expand("QC/{{sample}}/{id}.txt", id=IDS)

58

output:

59

fq="Fastq/{sample}.fastq",

60

txt="QualityCheck/{sample}/sequencing_summary.txt"

61

resources:

62

memory = 50,

63

time = 4

64

priority: 9

65

run:

66

with open(output.fq,'w') as fout:

67

for fn in input.f5:

68

with open(fn,'r') as fin:

69

shutil.copyfileobj(fin, fout)

70

with open(output.txt,'w') as fout:

71

i=0

72

for fn in input.txt:

73

with open(fn,'r') as fin:

74

header=fin.readline()

75

if i==0:

76

fout.write(header)

77

shutil.copyfileobj(fin, fout)

78

i+=1

79

80

# QC

81

rule CheckQuality:

82

input:

83

"QualityCheck/{sample}/sequencing_summary.txt"

84

output:

85

"QualityCheck/{sample}/summary.yaml"

86

priority: 8

87

resources:

88

memory = 10,

89

time = 1

90

shell:

91

		"Rscript scripts/MinIONQC.R -i QualityCheck/{wildcards.sample}/ -o QualityCheck"

92

93

################################################################################

94

################################################################################

95

######################################## Extract Mapping regions

96

# mapping to pre-computed reference: here GRCh38_p12

97

rule Minimap2_mapping:

98

params:

99

minimap2 = config["minimap2"],

100

junc_bonus = config["junc_bonus"],

101

gap_open_cost = config["gap_open_cost"],

102

MAPQ_min = config["MAPQ"],

103

reference = "Minimap2_Index/GRCh38_p12.fasta",

104

junc_bed = "Minimap2_Index/junctions.bed"

105

input:

106

fastq = "Fastq/{sample}.fastq"

107

output:

108

sam = "Data/{sample}.sam",

109

stat = "Data/{sample}.mapping.stat"

110

resources:

111

memory = 50,

112

time = 4

113

threads: config["threads"]

114

shell:

115

"""

116

		{params.minimap2} -ax splice -ub -k14 --secondary=no -t {threads} --junc-bed {params.junc_bed} -O{params.gap_open_cost},32 --junc-bonus={params.junc_bonus} {params.reference} {input.fastq} > {wildcards.sample}.temp

117

    	cat <(grep -P "^@" {wildcards.sample}.temp ) <(grep -P -v "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | awk "{{ if ( \$5 > {params.MAPQ_min} ) print \$0 }}" ) > {output.sam}

118

    	grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l > {output.stat}

119

    	grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 == "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}

120

    	grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$5 > {params.MAPQ_min} ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}

121

"""

122

123

# Identify Mapping reads

124

rule filterReads:

125

input:

126

mapping = "Data/" + sampleID + ".sam",

127

fastq = "Fastq/" + sampleID + ".fastq"

128

output:

129

id = "Data/" + sampleID + ".mapped.id",

130

fastq = "Data/" + sampleID + ".mapped.fastq"

131

resources:

132

memory = 50,

133

time = 1

134

shell:

135

"""

136

		grep -P -v "^@" {input.mapping} | cut -f1,2 | grep -v -P "\t4$" | cut -f1 | sort | uniq > {output.id}

137

Rscript scripts/filterReads.r {input.fastq} {output.id} {output.fastq}

138

"""

139

140

# Identify Regions of the reads mapping the reference

141

rule getMappingBed:

142

input:

143

fastq = "Data/" + sampleID + ".mapped.fastq",

144

sam = "Data/" + sampleID + ".sam"

145

output:

146

length = "IGV/" + sampleID + ".len.txt",

147

cigar = "IGV/" + sampleID + ".cigar.txt",

148

bed = "IGV/" + sampleID + ".bed"

149

resources:

150

memory = 10,

151

time = 1

152

shell:

153

"""

154

Rscript scripts/getReadLength.r {input.fastq} {output.length}

155

		paste <(grep -P -v '^@' {input.sam} | cut -f6 | grep -v '\*' | sed 's/\([MIDNSHP]\)/\\1 /g') <(grep -P -v '^@' {input.sam} | cut -f1,2,3,4 | grep -v -P '\t4\t' ) > {output.cigar}

156

Rscript scripts/getBedFile.r {output.cigar} {output.length} {output.bed}

157

"""

158

159

################################################################################

160

################################################################################

161

######################################## Extract Adapter regions

162

163

# convert reads to fasta

164

rule fastqToFasta:

165

input:

166

fastq = "{X}.fastq"

167

output:

168

fasta = "{X}.fasta"

169

resources:

170

memory = 10,

171

time = 1

172

shell:

173

		"awk ' NR % 4 == 1 {{ print $0 ; }} NR % 4 == 2 {{print $0; }}' {input.fastq} |sed 's/@/>/g' | sed 's/ .*$//g' > {output.fasta}"

174

175

#first HMMER iteration

176

rule identifyAdapter_iteration1:

177

params:

178

barcodes = "deposit/barcodes.cDNA.fas",

179

hmmer = config["hmmer"],

180

prefix = "IdentifyAdapter/"

181

input:

182

fasta = "Fastq/" + sampleID + ".fasta"

183

output:

184

tab = "IdentifyAdapter/adapter.{X}.tab"

185

resources:

186

memory = 50,

187

time = 1

188

threads: config["threads"]

189

shell:

190

"""

191

Rscript scripts/getStockholmMSA.r {params.barcodes} {params.prefix}

192

for msa in {params.prefix}/*.msa

193

do

194

{params.hmmer}hmmbuild ${{msa%.msa}}.hmm $msa

195

{params.hmmer}hmmpress ${{msa%.msa}}.hmm

196

done

197

cat {params.prefix}/*.hmm > {params.prefix}adapter.hmm

198

{params.hmmer}hmmpress {params.prefix}adapter.hmm

199

200

		{params.hmmer}nhmmscan --noali --notextw --max -E {wildcards.X} --cpu {threads} --tblout {output.tab} {params.prefix}adapter.hmm {input.fasta} > "IdentifyAdapter/temp"

201

"""

202

203

#second HMMER iteration

204

rule identifyAdapter_iteration2:

205

params:

206

barcodes = "deposit/barcodes.cDNA.fas",

207

hmmer = config["hmmer"],

208

prefix = "IdentifyAdapter/{X}/"

209

input:

210

fasta = "Fastq/" + sampleID + ".fasta",

211

tab = "IdentifyAdapter/adapter.{X}.tab"

212

output:

213

tab = "IdentifyAdapter/{X}/adapter.optimized.tab"

214

resources:

215

memory = 50,

216

time = 1

217

threads: config["threads"]

218

shell:

219

"""

220

		grep -P -v "^#" {input.tab} | sed -r "s/[[:space:]]+/\t/g" | cut -f1,3,7,8,13 > {params.prefix}"firstHits"

221

		Rscript scripts/fetchFasta.r {params.prefix}"firstHits" {input.fasta} {params.prefix} {wildcards.X}

222

for fasta in {params.prefix}*.fasta

223

do

224

p=${{fasta%.fasta}}

225

name=${{p##*/}}

226

			{params.hmmer}hmmalign --trim IdentifyAdapter/${{name}}.hmm $fasta > ${{fasta%.fasta}}.optimized.msa

227

			{params.hmmer}hmmbuild ${{fasta%.fasta}}.optimized.hmm ${{fasta%.fasta}}.optimized.msa

228

{params.hmmer}hmmpress ${{fasta%.fasta}}.optimized.hmm

229

done

230

cat {params.prefix}*.optimized.hmm > {params.prefix}adapter.optimized.hmm

231

{params.hmmer}hmmpress {params.prefix}adapter.optimized.hmm

232

		{params.hmmer}nhmmscan --notextw --max -E 10 --cpu {threads} --tblout {output.tab} {params.prefix}adapter.optimized.hmm {input.fasta} > {params.prefix}"temp"

233

"""

234

235

#get adapter positions in reads

236

rule convert2Bed:

237

input:

238

tab = "{X}.tab"

239

output:

240

bed = "{X}.bed"

241

resources:

242

memory = 10,

243

time = 1

244

shell:

245

"""

246

		sed -r "s/[[:space:]]+/\t/g" {input.tab} | grep -v -P "^#" | cut -f1,3,7,8,12,13 | \

247

		awk '{{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$5}}' | sort -k1,1 -k2,2n | sed "s/\.optimized//g" > {wildcards.X}.temp

248

Rscript scripts/convertEvalue2Score.r {wildcards.X}.temp {output.bed} 1

249

rm {wildcards.X}.temp

250

"""

251

252

################################################################################

253

################################################################################

254

######################################## Classify Reads

255

256

# join adapter and mapping information

257

rule joinBed:

258

input:

259

adapter = "IdentifyAdapter/{X}/adapter.optimized.bed",

260

mapping = "IGV/" + sampleID + ".bed"

261

output:

262

bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"

263

resources:

264

memory = 10,

265

time = 1

266

shell:

267

        "cat <(grep -P -v '\t0\t[\+\-]+$' {input.adapter}) {input.mapping} | sort -k1,1 -k2,2n > {output.bed}"

268

269

rule mergeBed:

270

params:

271

overlap = 15

272

input:

273

bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"

274

output:

275

        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",

276

        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed"

277

resources:

278

memory = 10,

279

time = 1

280

shell:

281

"""

282

        sort -k1,1 -k2,2n {input.bed} | grep -v supplementary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_primary}

283

        sort -k1,1 -k2,2n {input.bed} | grep -v primary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_supplementary}

284

"""

285

286

# classification

287

rule classifyReads:

288

input:

289

        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",

290

        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed",

291

len = "IGV/" + sampleID + ".len.txt"

292

output:

293

classification_temp = "Classifier_{X}/" + sampleID + ".classification.temp",

294

classification = "Classifier_{X}/" + sampleID + ".classification.txt",

295

stat = "Classifier_{X}/" + sampleID + ".classification.stat"

296

resources:

297

memory = 10,

298

time = 1

299

shell:

300

"""

301

        Rscript scripts/classifyReads.r {input.bed_primary} {input.bed_supplementary} {input.len} {output.classification_temp} {output.stat}

302

        paste {output.classification_temp} <(grep -o -P "primary_.*_[0-9]+|supplementary_.*_[0-9]+" {output.classification_temp} | sed "s/\_/\t/g") | sort -k1,1n > {output.classification}

303

"""

304

305

# sort

306

rule sortByName:

307

input:

308

sam = "{X}.sam",

309

output:

310

sam_sorted = "{X}.sortedName.sam"

311

resources:

312

memory = 10,

313

time = 1

314

shell:

315

        """cat <(samtools view -H {input.sam}) <(grep -v -P "^@" {input.sam} | sort -k1,1n ) > {output.sam_sorted}"""

316

317

# write classification into sam file

318

rule modifySam:

319

input:

320

sam = "Data/" + sampleID + ".sortedName.sam",

321

classification = "Classifier_{X}/" + sampleID + ".classification.txt"

322

output:

323

sam = "Classifier_{X}/" + sampleID + ".classified.sam"

324

resources:

325

memory = 50,

326

time = 1

327

shell:

328

"Rscript scripts/modifySam.r {input.sam} {input.classification} {output.sam}"

329

330

################################################################################

331

################################################################################

332

######################################## Get statistics

333

334

# quantify suspicious species

335

rule informativeReads:

336

params:

337

featureCounts = config["featureCounts"],

338

maskRrna = "deposit/maskRrna.txt",

339

maskRegions = "deposit/maskRegion.txt"

340

input:

341

bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"

342

output:

343

tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",

344

tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv"

345

resources:

346

memory = 10,

347

time = 1

348

shell:

349

"""

350

        {params.featureCounts} -F SAF -L -a {params.maskRrna} -o {output.tsv} {input.bam}

351

        {params.featureCounts} -F SAF -L -a {params.maskRegions} -o {output.tsv2} {input.bam}

352

"""

353

354

# convert to bam

355

rule convertIGV:

356

input:

357

sam = "{path}.sam"

358

output:

359

bam = "{path}.sorted.bam",

360

bai = "{path}.sorted.bam.bai"

361

resources:

362

memory = 10,

363

time = 1

364

shell:

365

"""

366

samtools view -b {input.sam} > {wildcards.path}.bam

367

samtools sort {wildcards.path}.bam -o {output.bam}

368

samtools index {output.bam}

369

"""

370

371

# masked files

372

rule maskeBam:

373

params:

374

maskRrna = "deposit/maskRrna.txt",

375

maskRegions = "deposit/maskRegion.txt"

376

input:

377

bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"

378

output:

379

bam = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam",

380

bai = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam.bai"

381

resources:

382

memory = 10,

383

time = 1

384

shell:

385

"""

386

        cat <(tail -n +2 {params.maskRrna} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}') <(tail -n +2 {params.maskRegions} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}' ) | sort -k1,1 -k2,2n > Classifier_{wildcards.X}/maskRegions.bed

387

        bedtools intersect -v -abam {input.bam} -b Classifier_{wildcards.X}/maskRegions.bed > {output.bam}

388

samtools index {output.bam}

389

"""

390

391

# get stats

392

rule compareMinimap2:

393

input:

394

sam = "Classifier_{X}/" + sampleID + ".classified.sam"

395

output:

396

stat = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"

397

resources:

398

memory = 10,

399

time = 1

400

shell:

401

"""

402

        paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f1) <(paste <(paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f2) <(grep -P -o "ts:A:[+-]+" {input.sam} )) <(grep -P "ts:A:[+-]+" {input.sam} | grep -o -P "ST:A:[+-\.]+")) > "Classifier_{wildcards.X}/temp"

403

        Rscript scripts/compareMinimapResults.r "Classifier_{wildcards.X}/temp" {output.stat}

404

"""

405

406

# get stats

407

rule getStats:

408

input:

409

tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",

410

tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv",

411

stat = "Classifier_{X}/" + sampleID + ".classification.stat",

412

stat2 = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"

413

output:

414

pdf = "" + sampleID + "_{X}.pdf"

415

resources:

416

memory = 10,

417

time = 1

418

shell:

419

"""

420

        cat <(cat <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv}.summary | awk '{{sum+=$2}} END {{print sum}}') <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv2}.summary | awk '{{sum+=$2}} END {{print sum}}')) \

421

        <(awk '{{sum+=$2}} END {{print sum}}' {input.tsv2}.summary) > Classifier_{wildcards.X}/stat.txt

422

        Rscript scripts/plotStatistics.r Classifier_{wildcards.X}/stat.txt {input.stat} {input.stat2} {output.pdf}

423

"""

424

425

#################################################################################