Skip to content
代码片段 群组 项目
提交 a3bc5392 编辑于 作者: Hussein Elgridly's avatar Hussein Elgridly
浏览文件

better sorting

上级 d4d9524e
分支 he_YossiSort
无相关合并请求
package picard.sam;
public class CheckScram {
}
......@@ -25,13 +25,15 @@ package picard.sam;
import htsjdk.samtools.*;
import htsjdk.samtools.util.*;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import picard.sam.util.YossiBAMRecordSet;
import picard.sam.util.YossiBAMRecordSetCodec;
import picard.util.QueryNameBatchIterator;
import java.io.File;
......@@ -59,29 +61,39 @@ public class YossiSort extends CommandLineProgram {
final SamReader yossiQueryReader = srfactory.referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
SAMFileHeader header = reader.getFileHeader();
if( header.getSortOrder() != SAMFileHeader.SortOrder.queryname ) {
throw new RuntimeException("input file must be queryname sorted");
}
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
header.setAttribute("SO", "yossiSort");
SortingCollection<SAMRecord> alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
new BAMRecordCodec(header), new YossiSortComparator(srfactory, REFERENCE_SEQUENCE, INPUT), maxRecordsInRam);
SortingCollection<YossiBAMRecordSet> readsSorter = SortingCollection.newInstance(YossiBAMRecordSet.class,
new YossiBAMRecordSetCodec(header), new YossiSortComparator(), maxRecordsInRam);
final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, OUTPUT);
writer.setProgressLogger(
new ProgressLogger(log, (int) 1e7, "Wrote", "records from a sorting collection"));
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Read");
for (SAMRecord rec : reader) {
alignmentSorter.add(rec);
QueryNameBatchIterator qnbs = new QueryNameBatchIterator(reader.iterator());
while(qnbs.hasNext()) {
readsSorter.add(qnbs.next());
}
alignmentSorter.doneAdding();
readsSorter.doneAdding();
for (final SAMRecord alignment : alignmentSorter) {
writer.addAlignment(alignment);
progress.record(alignment);
for (final YossiBAMRecordSet recSet : readsSorter) {
for(final SAMRecord rec: recSet) {
writer.addAlignment(rec);
progress.record(rec);
}
}
alignmentSorter.cleanup();
readsSorter.cleanup();
writer.close();
CloserUtil.close(reader);
......
......@@ -23,143 +23,23 @@
*/
package picard.sam;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import htsjdk.samtools.*;
import picard.sam.util.YossiBAMRecordSet;
import java.io.File;
import java.io.Serializable;
import java.util.concurrent.ExecutionException;
import java.util.Comparator;
/**
* Comparator for sorting SAMRecords by chr + location + queryname.
* Comparator for sorting sets of SAMRecords by chr + location.
*
*/
public class YossiSortComparator implements SAMRecordComparator, Serializable {
protected class Coordinate {
public String chr;
public int pos;
public Coordinate(String chr, int pos) {
this.chr = chr; this.pos = pos;
}
}
private static final long serialVersionUID = 1L;
protected Cache<String, Coordinate> primaryCache = CacheBuilder.newBuilder()
.maximumSize(1000)
.build();
protected ThreadLocal<SamReader> queryReader;
public YossiSortComparator(SamReaderFactory readerFactory, File refSeq, File bam) {
queryReader = ThreadLocal.withInitial(() -> readerFactory.referenceSequence(refSeq).open(bam));
}
protected String getRefFromSATag(String saTag) {
//SA:Z:11,6482131,-,44S107M,60,0;<next>
return saTag.split(";")[0].split(",")[0];
}
protected int getPosFromSATag(String saTag) {
return Integer.parseInt(saTag.split(";")[0].split(",")[1]);
}
protected boolean recIsWeird(SAMRecord rec) {
return rec.isSecondaryOrSupplementary()
&& rec.getMateUnmappedFlag()
&& rec.getReferenceName().equals(rec.getMateReferenceName())
&& rec.getAlignmentStart() == rec.getMateAlignmentStart();
}
//SA:Z:11,6482131,-,44S107M,60,0
protected String getFirstRefName(final SAMRecord rec) {
if (rec.getFirstOfPairFlag()) {
if(rec.isSecondaryOrSupplementary()) {
return getRefFromSATag(rec.getStringAttribute("SA"));
} else {
return rec.getReferenceName();
}
} else {
return rec.getMateReferenceName();
}
}
protected int getFirstPos(final SAMRecord rec) {
if (rec.getFirstOfPairFlag()) {
if(rec.isSecondaryOrSupplementary()) {
return getPosFromSATag(rec.getStringAttribute("SA"));
}
return rec.getAlignmentStart();
} else {
return rec.getMateAlignmentStart();
}
}
protected Coordinate getPrimary1(SAMRecord rec) {
try {
return primaryCache.get(rec.getReadName(), () -> getPrimary1Internal(rec));
} catch (ExecutionException e) {
e.printStackTrace();
System.exit(1);
}
assert(false); //shouldn't get here
return null;
}
protected Coordinate getPrimary1Internal(SAMRecord rec) {
/*
* I found mysterious pair2 records with mate unmapped.
* Its RNEXT and PNEXT pointed to itself, though it did have an SA tag.
* Detect this, and switcheroo us with
*/
//its mate info points to itself(?!), though it does have an SA tag.
//look it up with a query
if(recIsWeird(rec)) {
String primaryRef = getRefFromSATag(rec.getStringAttribute("SA"));
long primaryPos = getPosFromSATag(rec.getStringAttribute("SA"));
try (
SAMRecordIterator iter = queryReader.get().queryAlignmentStart(primaryRef, (int) primaryPos)
) {
while(iter.hasNext()) {
SAMRecord pRec = iter.next();
if (pRec.getReadName().equals(rec.getReadName())) {
return new Coordinate(getFirstRefName(pRec), getFirstPos(pRec));
}
}
}
throw new RuntimeException("weird record couldn't find its primary mate");
} else {
return new Coordinate(getFirstRefName(rec), getFirstPos(rec));
}
}
public class YossiSortComparator implements Comparator<YossiBAMRecordSet>, Serializable {
@Override
public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
Coordinate s1Coord = getPrimary1(samRecord1);
Coordinate s2Coord = getPrimary1(samRecord2);
if (!s1Coord.chr.equals(s2Coord.chr)) {
return s1Coord.chr.compareTo(s2Coord.chr);
} else if (s1Coord.pos != s2Coord.pos) {
return s1Coord.pos - s2Coord.pos;
public int compare(final YossiBAMRecordSet recs1, final YossiBAMRecordSet recs2) {
if (!recs1.getChr().equals(recs2.getChr())) {
return recs1.getChr().compareTo(recs2.getChr());
} else {
return samRecord1.getReadName().compareTo(samRecord2.getReadName());
return recs1.getPos() - recs2.getPos();
}
}
/**
* Less stringent compare method than the regular compare. If the two records
* are equal enough that their ordering in a sorted SAM file would be arbitrary,
* this method returns 0.
*
* @return negative if samRecord1 < samRecord2, 0 if equal, else positive
*/
@Override
public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
return compare(samRecord1, samRecord2);
}
}
package picard.sam.util;
import htsjdk.samtools.SAMRecord;
import java.util.HashSet;
public class YossiBAMRecordSet extends HashSet<SAMRecord> {
private SAMRecord primary1 = null;
public YossiBAMRecordSet(int initialSize) {
super(initialSize);
}
@Override
public boolean add(SAMRecord samRecord) {
return super.add(samRecord);
}
private void verifyPrimary1() {
if(primary1 != null) {
return;
}
SAMRecord unmapped1 = null;
for(SAMRecord rec: this) {
if(rec.getFirstOfPairFlag()) {
if(!rec.isSecondaryOrSupplementary()) {
primary1 = rec;
return;
}
if(rec.getReadUnmappedFlag()) {
unmapped1 = rec;
}
}
}
//we couldn't find a primary, first-of-pair record, so it must be unmapped. return that instead
primary1 = unmapped1;
}
public String getChr() {
verifyPrimary1();
return primary1.getReferenceName();
}
public int getPos() {
verifyPrimary1();
return primary1.getAlignmentStart();
}
}
package picard.sam.util;
import htsjdk.samtools.*;
import htsjdk.samtools.util.SortingCollection;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
public class YossiBAMRecordSetCodec implements SortingCollection.Codec<YossiBAMRecordSet> {
private final BAMRecordCodec bamRecordCodec;
private InputStream is;
private OutputStream os;
private SAMFileHeader header;
public YossiBAMRecordSetCodec(final SAMFileHeader header) {
this.header = header;
this.bamRecordCodec = new BAMRecordCodec(header);
}
@Override
public void setOutputStream(OutputStream os) {
this.os = os;
bamRecordCodec.setOutputStream(os);
}
@Override
public void setInputStream(InputStream is) {
this.is = is;
bamRecordCodec.setInputStream(is);
}
@Override
public void encode(YossiBAMRecordSet val) {
try {
os.write(val.size());
os.flush();
for(SAMRecord rec: val) {
bamRecordCodec.encode(rec);
}
os.flush();
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public YossiBAMRecordSet decode() {
try {
int numRecords = is.read();
if(numRecords == -1) {
return null;
}
YossiBAMRecordSet recs = new YossiBAMRecordSet(numRecords);
for(int i = 0; i < numRecords; ++i) {
recs.add(bamRecordCodec.decode());
}
return recs;
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
@Override
public SortingCollection.Codec<YossiBAMRecordSet> clone() {
return new YossiBAMRecordSetCodec(this.header);
}
}
package picard.util;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.PeekableIterator;
import picard.sam.util.YossiBAMRecordSet;
public class QueryNameBatchIterator implements CloseableIterator<YossiBAMRecordSet> {
final PeekableIterator<SAMRecord> peekit;
public QueryNameBatchIterator(SAMRecordIterator iter) {
this.peekit = new PeekableIterator<SAMRecord>(iter);
}
@Override
public void close() {
peekit.close();
}
@Override
public boolean hasNext() {
return peekit.hasNext();
}
@Override
public YossiBAMRecordSet next() {
String qname = peekit.peek().getReadName();
YossiBAMRecordSet records = new YossiBAMRecordSet(0);
while (peekit.peek() != null && peekit.peek().getReadName().equals(qname)) {
records.add(peekit.next());
}
return records;
}
}
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册