Cluster.split issue (again, sorry)

Hello!

This is not my first time asking about this, but this time around I am out of options. I need to run cluster.split because when I run normal clustering I lose sequences in my positive control (namely my Salmonella) unless I cluster at 0.005. So I want to use cluster.split to make sure that my positive control stays as it should be and cluster everything at 0.03.

But, when I run it, I get many error messages: "Your group file contains 390863 sequences and list file contains 390790 sequences. Please correct.
"

This does not make sense to me, since my batch file uses “current” for all files, so the correct files should be used at every step. I am using 32 threads; could this be a problem caused by using multiple processors or too much parallelization? I need to run around 700 samples together. Thanks for your input. Below are the job specifications I am using to run the samples (a run takes 23 hours with version v.1.44.3, which is the latest available on the computing servers I use), followed by my batch file.

Kind regards,

mem=128000M
cpus-per-task=32

# Preprocessing stage of the mothur batch file: build contigs, screen,
# dereplicate, align, filter, pre-cluster, remove chimeras, classify and
# remove unwanted lineages. Every step reads its inputs through "current",
# so the order of the commands matters.

# Use 32 processors for the commands that follow.
set.current(processors=32)

# Write the log to a file with this name.
set.logfile(name=megacampy_logFile_clustersplit)

# Build contigs from the inputs listed in megacampy.files, using the primers
# in primers.oligo.txt.
make.contigs(file=megacampy.files, oligos=primers.oligo.txt, checkorient=t, pdiffs=0, maxee=2,deltaq=6)
summary.seqs(fasta=current)

# First screen: maxambig=0 and maxhomop=70.
# NOTE(review): maxhomop=70 looks very permissive and no maxlen is given --
# confirm this is intended.
screen.seqs(fasta=current, group=current, summary=current, maxambig=0, maxhomop=70)
count.groups(group=current)
summary.seqs(fasta=current)

# Dereplicate, then build a count table from the name and group files.
# From here on most commands take count=current instead of group=current.
unique.seqs(fasta=current)
count.seqs(name=current, group=current)
summary.seqs(fasta=current, count=current)

# Align against the SILVA v132 reference alignment.
align.seqs(fasta=current, reference=silva.nr_v132.pcr.align, flip=t)
summary.seqs(fasta=current, count=current)
# NOTE(review): this call still uses group=current although a count table
# exists since count.seqs; the later count.groups calls use count=current --
# confirm which one is intended here.
count.groups(group=current)

# Second screen: keep sequences according to start=1968 and end=11550.
screen.seqs(fasta=current, count=current, summary=current, start=1968, end=11550)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Filter alignment columns (vertical=T, trump=.) and dereplicate again.
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, count=current)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Pre-cluster with diffs=4.
pre.cluster(fasta=current, count=current, diffs=4)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Chimera detection with vsearch, then removal of the flagged sequences from
# the fasta file via the current accnos file.
# NOTE(review): remove.seqs is not given the count table; presumably
# chimera.vsearch already wrote an updated one -- verify in the log.
chimera.vsearch(fasta=current, count=current, dereplicate=t)
remove.seqs(fasta=current, accnos=current)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Classify against SILVA v138 with cutoff=80.
# NOTE(review): alignment above uses silva.nr_v132 while classification uses
# silva.nr_v138 -- confirm this mix of releases is intended.
classify.seqs(fasta=current, count=current, iters=1000, reference=silva.nr_v138.align, taxonomy=silva.nr_v138.tax, cutoff=80)

# Remove the listed lineages, then summarize what is left.
remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Eukaryota)
summary.tax(taxonomy=current, count=current)
summary.seqs(fasta=current, count=current)
count.groups(count=current)

# Clustering stage. This is the version of the script that produced the
# "group file contains ... list file contains ..." error quoted in the post.

# Pairwise distances with cutoff=0.04, slightly above the clustering cutoff.
dist.seqs(fasta=current,cutoff=0.04)

# Cluster from the column-format distance file, split by classification
# (splitmethod=classify, taxlevel=6), with cutoff=0.03.
cluster.split(column=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=6, delta=0, iters=500, cutoff=0.03)

# Build the shared file for the 0.03 label from the current list file.
make.shared(list=current,count=current,label=0.03)

# Assign a taxonomy to each OTU in the current list file.
classify.otu(list=current, count=current, taxonomy=current)

# Per-control error check. The same three commands run for each control:
#   get.groups   - select the named group from the count/fasta/taxonomy files
#   summary.seqs - summarize the selected subset
#   seq.error    - compare the subset against zymostd_ref.txt (aligned=F)
# Between controls, summary.seqs is called with explicit file names,
# presumably to point the "current" fasta and count files back at the full
# data set before the next get.groups.
# NOTE(review): no command resets taxonomy=current between controls, so the
# later get.groups calls may receive the taxonomy subset written for the
# previous control -- verify in the log.

# Control: Positif_PCR_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Positif_PCR_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Zymo_Ctrl_PCR_carvacrol
get.groups(count=current, fasta=current,taxonomy=current, groups=Zymo_Ctrl_PCR_carvacrol)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: zymo_extraction_carvacrol
get.groups(count=current, fasta=current,taxonomy=current, groups=zymo_extraction_carvacrol)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Zymo_community_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Zymo_community_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Positif_PCR2_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Positif_PCR2_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Commart_probio
get.groups(count=current, fasta=current,taxonomy=current, groups=Commart_probio)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: campy_protease_Zymo_extraction (last one, so no reset afterwards)
get.groups(count=current, fasta=current,taxonomy=current, groups=campy_protease_Zymo_extraction)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# End of the batch file.
quit()

Could you please email your log file and a link to this post to mothur.bugs@gmail.com?

Thanks,
Pat

thanks!

I was able to get 1.46.1 and am rerunning it now.

But, while checking around the forum, I wondered whether this could be a problem with sequence names that contain the character “-”, such as “M06648_144_000000000-JKPRF_1_2105_10757_18173”?
I spotted this sequence in my server output (I am at the classification step and should have final results by the end of tomorrow).

Cheers!

Alright, it ran through.

It took 17 hours. I am now analyzing my run, and I hope my positive controls are OK. For those who are interested, here is how I did it:

DNA extraction: lysis buffer, bead beating, heat, and phenol/chloroform (lots of DNA, but I also have lots of chimeras, probably due to DNA fragmentation; I will work on a less brutal extraction in the coming year)
region: V4, Caporaso primers, so almost fully overlapping
number of samples: around 700
number of sequences after cleaning: 45 818 071
number of uniques: 390 779
number of OTUs (cluster.split, cutoff at 0.03): 71 770
error rate: e-4 to e-6 depending on the positive controls.
samples: chicken, ileal and caecal, from different experiments done in our level 2 animal facility, with one parameter in common whose impact I want to check.

Batch file:
# Preprocessing stage of the batch file that ran to completion: build
# contigs, screen, dereplicate, align, filter, pre-cluster, remove chimeras,
# classify and remove unwanted lineages. Every step reads its inputs through
# "current", so the order of the commands matters.

# Use 32 processors for the commands that follow.
set.current(processors=32)

# Write the log to a file with this name.
set.logfile(name=megacampy_logFile_clustersplit)

# Build contigs from the inputs listed in megacampy.files, using the primers
# in primers.oligo.txt.
make.contigs(file=megacampy.files, oligos=primers.oligo.txt, checkorient=t, pdiffs=0, maxee=2,deltaq=6)
summary.seqs(fasta=current)

# First screen: maxambig=0 and maxhomop=70.
# NOTE(review): maxhomop=70 looks very permissive and no maxlen is given --
# confirm this is intended.
screen.seqs(fasta=current, group=current, summary=current, maxambig=0, maxhomop=70)
count.groups(group=current)
summary.seqs(fasta=current)

# Dereplicate, then build a count table from the name and group files.
# From here on most commands take count=current instead of group=current.
unique.seqs(fasta=current)
count.seqs(name=current, group=current)
summary.seqs(fasta=current, count=current)

# Align against the SILVA v132 reference alignment.
align.seqs(fasta=current, reference=silva.nr_v132.pcr.align, flip=t)
summary.seqs(fasta=current, count=current)
# NOTE(review): this call still uses group=current although a count table
# exists since count.seqs; the later count.groups calls use count=current --
# confirm which one is intended here.
count.groups(group=current)

# Second screen: keep sequences according to start=1968 and end=11550.
screen.seqs(fasta=current, count=current, summary=current, start=1968, end=11550)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Filter alignment columns (vertical=T, trump=.) and dereplicate again.
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, count=current)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Pre-cluster with diffs=4.
pre.cluster(fasta=current, count=current, diffs=4)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Chimera detection with vsearch, then removal of the flagged sequences from
# the fasta file via the current accnos file.
# NOTE(review): remove.seqs is not given the count table; presumably
# chimera.vsearch already wrote an updated one -- verify in the log.
chimera.vsearch(fasta=current, count=current, dereplicate=t)
remove.seqs(fasta=current, accnos=current)
count.groups(count=current)
summary.seqs(fasta=current, count=current)

# Classify against SILVA v138 with cutoff=80.
# NOTE(review): alignment above uses silva.nr_v132 while classification uses
# silva.nr_v138 -- confirm this mix of releases is intended.
classify.seqs(fasta=current, count=current, iters=1000, reference=silva.nr_v138.align, taxonomy=silva.nr_v138.tax, cutoff=80)

# Remove the listed lineages, then summarize what is left.
remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Eukaryota)
summary.tax(taxonomy=current, count=current)
summary.seqs(fasta=current, count=current)
count.groups(count=current)

# Clustering stage. The two commented-out commands are the earlier approach
# (a separate dist.seqs followed by a column-based cluster.split). The active
# cluster.split call is given the fasta file directly, drops
# splitmethod=classify and adds precision=1000; taxlevel, delta, iters and
# cutoff are unchanged.
#dist.seqs(fasta=current,cutoff=0.04)

#cluster.split(column=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=6, delta=0, iters=500, cutoff=0.03)
cluster.split(fasta=current, count=current, taxonomy=current, taxlevel=6, delta=0, iters=500, precision=1000, cutoff=0.03)

# Build the shared file for the 0.03 label from the current list file.
make.shared(list=current,count=current,label=0.03)

# Assign a taxonomy to each OTU in the current list file.
classify.otu(list=current, count=current, taxonomy=current)

# Per-control error check. The same three commands run for each control:
#   get.groups   - select the named group from the count/fasta/taxonomy files
#   summary.seqs - summarize the selected subset
#   seq.error    - compare the subset against zymostd_ref.txt (aligned=F)
# Between controls, summary.seqs is called with explicit file names,
# presumably to point the "current" fasta and count files back at the full
# data set before the next get.groups.
# NOTE(review): no command resets taxonomy=current between controls, so the
# later get.groups calls may receive the taxonomy subset written for the
# previous control -- verify in the log.

# Control: Positif_PCR_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Positif_PCR_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Zymo_Ctrl_PCR_carvacrol
get.groups(count=current, fasta=current,taxonomy=current, groups=Zymo_Ctrl_PCR_carvacrol)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: zymo_extraction_carvacrol
get.groups(count=current, fasta=current,taxonomy=current, groups=zymo_extraction_carvacrol)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Zymo_community_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Zymo_community_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Positif_PCR2_sophie
get.groups(count=current, fasta=current,taxonomy=current, groups=Positif_PCR2_sophie)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: Commart_probio
get.groups(count=current, fasta=current,taxonomy=current, groups=Commart_probio)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# Back to the full fasta/count files.
summary.seqs(fasta=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.fasta, count=megacampy.trim.contigs.good.unique.good.filter.unique.precluster.denovo.vsearch.pick.pick.count_table)
count.groups(count=current)

# Control: campy_protease_Zymo_extraction (last one, so no reset afterwards)
get.groups(count=current, fasta=current,taxonomy=current, groups=campy_protease_Zymo_extraction)
summary.seqs(fasta=current, count=current)
seq.error(count=current, fasta=current, reference=zymostd_ref.txt, aligned=F)

# End of the batch file.
quit()

Kind regards and always use the latest version available…

This topic was automatically closed 10 days after the last reply. New replies are no longer allowed.