I have a v4 dataset from ~8 miseq runs (I expect fairly low diversity, qiime results in tens of OTUs per sample). I’m following the miseq sop with minor tweaks, but including pre.cluster diffs=2 which reduces my original 14M sequences to ~100k pre.clustered uniques. However with cluster.split (I’ve tried taxlevel=4 and taxlevel=5) I’m still running into memory issues because one of the split distance matrices is still huge (77gb). I’ve pulled a couple of the sequences that are in that dist, they’re blasting to enterobacter which is what I expect to be dominant based on qiime. So, should I try pre.clustering to diffs=3? Or should I switch to mpi? I dont’ understand the RAM implications of MPI vs calling processors in mothur.
summary.seqs(fasta=mapsJuly.contigs.fasta)
screen.seqs(fasta=current, group=mapsJuly.contigs.groups, summary=current, maxambig=0, maxlength=300)
#Summary.seqs counts the number of sequences you are currently dealing with, running this after every step gives you a good starting point for troubleshooting if the process fails or if it produces something other than what you expect.
summary.seqs(fasta=current)
#Cleaning up obviously bad sequences
screen.seqs(fasta=current, group=NAME.groups, summary=current, maxambig=0, maxlength=300)
summary.seqs(fasta=current)
#reduce fasta size by only keeping one of each sequence, this generates a names file
unique.seqs(fasta=current)
summary.seqs(fasta=current, name=current, group=current)
#replaces both the names and group file (which contain the name of every sequence) with a count table
count.seqs(name=current, group=current)
summary.seqs(count=current, fasta=current)
#align to a custom silva db that was trimmed to v4 using "pcr.seqs(fasta=silva.bacteria.fasta, start=13862, end=23444, keepdots=F)"
align.seqs(fasta=current, reference=silva.v4.fasta)
summary.seqs(fasta=current, count=current)
#remove the seqs that just didn't align (using the nubmers from the previous summmary.seqs
screen.seqs(fasta=current, count=current, summary=current, start=8, end=9582, maxhomop=8)
#remove columns from alignment that only contain -
filter.seqs(fasta=current, vertical=T, processors=8)
summary.seqs(fasta=current, count=current)
#pre.cluster to 1% difference to reduce computation time
pre.cluster(fasta=current, diffs=2, count=current)
summary.seqs(fasta=current, count=current)
#calls chimeras only from the samples that they are called chimeras, if you want to remove from all samples change dereplicate=f
chimera.uchime(fasta=current, count=current, dereplicate=t)
#removes chimeras
remove.seqs(fasta=current, accnos=current, count=current)
summary.seqs(fasta=current, count=current)
#RDP classifier
classify.seqs(fasta=current, count=current, reference=trainset10_082014.pds.fasta, taxonomy=trainset10_082014.pds.tax, cutoff=60)
#remove all non-target sequences
remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Eukaryota)
#make otus for each Order individually
cluster.split(fasta=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=5, cutoff=0.15, processors=1)