splitting merging III

Hi all,

after encountering problems with splitting and re-merging count_table files (see my previous post), I wanted to make sure that this is a generic problem and not dependant on a particular dataset of mine.

So I tried splitting and re-merging groups with various datasets that I had and again encountered the same error ([ERROR]: Your count table contains more than 1 sequence named xxxxxxxxxxxxxxxxxxxxx, sequence names must be unique. Please correct.)

Finally I downloaded the mothur MiSeq SOP example dataset and tried the routine with that, same problem.

I am curious if I have made a conceptual error of what users can do with “get.groups, remove.groups & split.groups in conjunction with merge.count” and what they can’t - i.e. that non-unique sequences that are shared between groups are split out with a common sequence name during get.groups and split.groups and thus can-not be re-merged with groups they shared sequences with?

I would be grateful if someone could look into this.

Cheers,
Tom


In brief, from the MiSeq SOP, I ran the provided stability.batch file up until after chimera.uchime and remove.seqs,
then changed the current names stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.fasta &
stability.trim.contigs.good.unique.good.filter.unique.precluster.denovo.uchime.pick.count_table
to
sop.fasta & sop.count_table

I then tested for uniqueness and integrity, chose two random groups, picked with get.groups and removed with remove.groups. Checked the output for uniqueness and integrity and attempted to re-merge with merge.count= error.
I then repeated the exercise with split.groups, chose the same groups, checked the groups for uniqueness and integrity and attempted to re-merge the two groups = error. Finally, I checked how many sequences were still shared between the two split groups.


#change the name of the file from stability.files to whatever suits your study
make.contigs(file=stability.files, processors=32)
screen.seqs(fasta=current, group=current, maxambig=0, maxlength=275)
unique.seqs()
count.seqs(name=current, group=current)
align.seqs(fasta=current, reference=/mnt/shared/scratch/tf40403/bact/NSIS/NSIS_new/joint_runs/silva.bacteria/silva.bacteria/silva.v4.fasta)
screen.seqs(fasta=current, count=current, start=1968, end=11550, maxhomop=8)
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, count=current)
pre.cluster(fasta=current, count=current, diffs=2)
chimera.uchime(fasta=current, count=current, dereplicate=t)
remove.seqs(fasta=current, accnos=current)

stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.fasta
stability.trim.contigs.good.unique.good.filter.unique.precluster.denovo.uchime.pick.count_table

Rename to sop.fasta & sop.count_table


unique.seqs(fasta=sop.fasta, count=sop.count_table) 2668 2668

Output File Names:
sop.unique.count_table
sop.unique.fasta


summary.seqs(count=sop.unique.count_table, fasta=sop.unique.fasta, processors=32)

Using 32 processors.

Start End NBases Ambigs Polymer NumSeqs
Minimum: 1 376 249 0 3 1
2.5%-tile: 1 376 252 0 3 2984
25%-tile: 1 376 252 0 4 29838
Median: 1 376 252 0 4 59676
75%-tile: 1 376 253 0 5 89514
97.5%-tile: 1 376 253 0 6 116368
Maximum: 1 376 256 0 8 119351
Mean: 1 376 252.467 0 4.37276

of unique seqs: 2668

total # of seqs: 119351

Output File Names:
sop.unique.summary

It took 1 secs to summarize 119351 sequences.

count.groups(count=sop.unique.count_table)
F3D0 contains 6227.
F3D1 contains 4696.
F3D141 contains 4698.
F3D142 contains 2455.
F3D143 contains 2436.
F3D144 contains 3504.
F3D145 contains 5597.
F3D146 contains 3873.
F3D147 contains 12617.
F3D148 contains 9607.
F3D149 contains 10109.
F3D150 contains 4168.
F3D2 contains 15812.
F3D3 contains 5320.
F3D5 contains 3485.
F3D6 contains 6469.
F3D7 contains 4095.
F3D8 contains 4322.
F3D9 contains 5801.
Mock contains 4060.

Total seqs: 119351.

Output File Names:
sop.unique.count.summary


get.groups(count=sop.unique.count_table, fasta=sop.unique.fasta, groups=F3D5-F3D6)

system(mv sop.unique.pick.fasta sop.rem.groups.pick.fasta)
system(mv sop.unique.pick.count_table sop.rem.groups.pick.count_table)

unique.seqs(count=sop.rem.groups.pick.count_table, fasta=sop.rem.groups.pick.fasta)
419 419
No new uniques

remove.groups(count=sop.unique.count_table, fasta=sop.unique.fasta, groups=F3D5-F3D6)

system(mv sop.unique.pick.fasta sop.remain.groups.pick.fasta)
system(mv sop.unique.pick.count_table sop.remain.groups.pick.count_table)

unique.seqs(count=sop.remain.groups.pick.count_table, fasta=sop.remain.groups.pick.fasta)
2492 2492
No new uniques

merge.count(count=sop.rem.groups.pick.count_table-sop.remain.groups.pick.count_table, output=sop.remerge.count_table)

[ERROR]: Your count table contains more than 1 sequence named M00967_43_000000000-A3JHG_1_1101_10133_8460, sequence names must be unique. Please correct.

split.groups(count=sop.unique.count_table, fasta=sop.unique.fasta)

sop.unique.F3D6.fasta
sop.unique.F3D6.count_table
sop.unique.F3D5.fasta
sop.unique.F3D5.count_table

unique.seqs(count=sop.unique.F3D6.count_table, fasta=sop.unique.F3D6.fasta)
298 298

unique.seqs(count=sop.unique.F3D5.count_table, fasta=sop.unique.F3D5.fasta)
246 246

merge.count(count=sop.unique.F3D6.count_table-sop.unique.F3D5.count_table, output=sop.remerge.split.count_table)

[ERROR]: Your count table contains more than 1 sequence named M00967_43_000000000-A3JHG_1_1101_10133_8460, sequence names must be unique. Please correct.

list.seqs(count=sop.unique.F3D5.count_table)
sop.unique.F3D5.accnos

remove.seqs(count=sop.unique.F3D6.count_table, fasta=sop.unique.F3D6.fasta, accnos=sop.unique.F3D5.accnos)
sop.unique.F3D6.pick.fasta
sop.unique.F3D6.pick.count_table

summary.seqs(fasta=sop.unique.F3D6.fasta, count=sop.unique.F3D6.count_table, processors=32)

Start End NBases Ambigs Polymer NumSeqs
Minimum: 1 376 250 0 3 1
2.5%-tile: 1 376 252 0 4 162
25%-tile: 1 376 252 0 4 1618
Median: 1 376 252 0 4 3235
75%-tile: 1 376 253 0 5 4852
97.5%-tile: 1 376 253 0 6 6308
Maximum: 1 376 255 0 8 6469
Mean: 1 376 252.484 0 4.40145

of unique seqs: 298

total # of seqs: 6469


summary.seqs(fasta=sop.unique.F3D6.pick.fasta, count=sop.unique.F3D6.pick.count_table, processors=32)

Start End NBases Ambigs Polymer NumSeqs
Minimum: 1 376 250 0 3 1
2.5%-tile: 1 376 251 0 4 12
25%-tile: 1 376 253 0 4 114
Median: 1 376 253 0 5 228
75%-tile: 1 376 253 0 5 341
97.5%-tile: 1 376 254 0 8 443
Maximum: 1 376 254 0 8 454
Mean: 1 376 252.815 0 4.84802

of unique seqs: 173

total # of seqs: 454

Output File Names:
sop.unique.F3D6.pick.summary

I do not believe you can do that so far in the analysis.

Just look where this sequence is after you split and before you split.

When I did that, I saw that this sequence was in both of my split count_table, but was only in one sample in my no plit table. Turns out that I did a stupid mistake when splitting: one sample was in both split table, resulting in a duplication of the sequence.

What do you have?

Hi all,

as this is vexing me and to get to the bottom of things, I did another, hopefully final test using the MiSeq SOP data:

Again, based on the stability.batch file that was provided, I tested splitting and re-merging groups: I) using the names and group files and II) the count_table provided by count.seqs.

The outcome is pretty conclusive: when using a name & group file to split the groups and not a count_table, merging the split groups with count.seqs instead of merge.count is a bit of a faff but works fine, when using the count_table it does not and results in error.

Hmm, so far, and to my own shame, I have not looked into the differences between the formats of the index files used by mothur; I have always regarded the count_table as a more convenient format to the joint use of group and name file - my guess is that some information of what sequences are duplicated with what other sequence is not maintained in the count_table format and thus prohibits re-merging - I don’t think its the merge.count process as I’ve tested that before.

In summary, if you want to split and re-merge groups at any point during your pipeline, you’d have to do that by maintaining the group and names index files.


Cheers, Tom


#test using name and group files as index files - not count_table
make.contigs(file=stability.files, processors=32)
screen.seqs(fasta=current, group=current, maxambig=0, maxlength=275)
unique.seqs(fasta=current)
#count.seqs(name=current, group=current)
align.seqs(fasta=current, reference=/mnt/shared/scratch/tf40403/bact/NSIS/NSIS_new/joint_runs/silva.bacteria/silva.bacteria/silva.v4.fasta)
screen.seqs(fasta=current, name=current, group=current, start=1968, end=11550, maxhomop=8)
filter.seqs(fasta=current, vertical=T, trump=.)
unique.seqs(fasta=current, name=current)
pre.cluster(fasta=current, name=current, group=current, diffs=2)
chimera.uchime(fasta=current, name=current, group=current, dereplicate=t)
remove.seqs(fasta=current, name=current, group=current, accnos=current)
quit()

#output from batch file:
stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.names
stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.fasta
stability.contigs.good.good.pick.groups

#renamed to
system(mv stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.names sop3.names)
system(mv stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.fasta sop3.fasta)
system(mv stability.contigs.good.good.pick.groups sop3.groups)

sop3.groups
sop3.names
sop3.fasta

split.groups(fasta=sop3.fasta, group=sop3.groups, name=sop3.names)

chose
sop3.F3D6.names
sop3.F3D5.names

make.group(fasta=sop3.F3D6.fasta-sop3.F3D5.fasta, groups=F3D6-F3D5)
sop3.F3D6.sop3.F3D5.groups

merge.files(input=sop3.F3D6.fasta-sop3.F3D5.fasta, output=sop3.F3D6.F3D5.merged.fasta)

sop3.F3D6.F3D5.merged.fasta

unique.seqs(fasta=sop3.F3D6.F3D5.merged.fasta)

sop3.F3D6.F3D5.merged.unique.fasta
sop3.F3D6.F3D5.merged.names

count.seqs(name=sop3.F3D6.F3D5.merged.names, group=sop3.F3D6.sop3.F3D5.groups)

sop3.F3D6.F3D5.merged.count_table

test with count_file

count.seqs(group=sop3.groups, name=sop3.names)
sop3.count_table

split.groups(fasta=sop3.fasta, count=sop3.count_table)
sop3.F3D6.fasta
sop3.F3D6.count_table
sop3.F3D5.fasta
sop3.F3D5.count_table
unique.seqs(fasta=sop3.F3D6.fasta, count=sop3.F3D6.count_table)
unique.seqs(fasta=sop3.F3D5.fasta, count=sop3.F3D5.count_table)

merge.count(count=sop3.F3D6.count_table-sop3.F3D5.count_table, output=sop3.merge.count_table)
[ERROR]: Your count table contains more than 1 sequence named M00967_43_000000000-A3JHG_1_1101_10133_8460, sequence names must be unique. Please correct.