Skip to content

Commit 9b3c768

Browse files
committed
edlib genotyping
1 parent 0498577 commit 9b3c768

2 files changed

Lines changed: 40 additions & 86 deletions

File tree

src/coverage.h

Lines changed: 40 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -66,23 +66,15 @@ namespace torali {
6666
std::vector<uint8_t> alt;
6767
};
6868

69-
template<typename TAlign, typename TQualities>
70-
inline uint32_t
71-
_getAlignmentQual(TAlign const& align, TQualities const& qual) {
72-
typedef typename TAlign::index TAIndex;
73-
uint32_t baseQualSum = 0;
74-
uint32_t seqPtr = 0;
75-
uint32_t alignedBases = 0;
76-
for(TAIndex j = 0; j < (TAIndex) align.shape()[1]; ++j) {
77-
if (align[1][j] != '-') {
78-
if (align[0][j] != '-') {
79-
++alignedBases;
80-
baseQualSum += qual[seqPtr];
81-
}
82-
++seqPtr;
83-
}
84-
}
85-
return (baseQualSum / alignedBases);
69+
70+
template<typename TConfig>
71+
inline double
72+
_editDistanceHW(TConfig const& c, std::string const& query, std::string const& target) {
73+
double score = 0;
74+
EdlibAlignResult align = edlibAlign(query.c_str(), query.size(), target.c_str(), target.size(), edlibNewAlignConfig(2 * c.flankQuality * query.size(), EDLIB_MODE_HW, EDLIB_TASK_DISTANCE, NULL, 0));
75+
if (align.editDistance != -1) score = ((1.0 - c.flankQuality) * (double) query.size()) / (double) (align.editDistance + 1);
76+
edlibFreeAlignResult(align);
77+
return score;
8678
}
8779

8880
template<typename TPos>
@@ -239,7 +231,6 @@ namespace torali {
239231
{
240232
typedef typename TSpanMap::value_type::value_type TSpanPair;
241233
typedef typename TCountMap::value_type::value_type TCountPair;
242-
typedef std::vector<uint8_t> TQuality;
243234

244235
// Open file handles
245236
typedef std::vector<samFile*> TSamFile;
@@ -316,14 +307,14 @@ namespace torali {
316307
for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
317308
// Pair qualities and features
318309
typedef boost::unordered_map<std::size_t, uint8_t> TQualities;
319-
TQualities qualities;
320310
TQualities qualitiestra;
321311
typedef boost::unordered_map<std::size_t, bool> TClip;
322-
TClip clip;
323312
TClip cliptra;
324313

325314
// Iterate chromosomes
326315
for(int32_t refIndex=0; refIndex < (int32_t) hdr[file_c]->n_targets; ++refIndex) {
316+
TQualities qualities;
317+
TClip clip;
327318

328319
// Any SV breakpoints on this chromosome?
329320
if (!svOnChr[refIndex]) continue;
@@ -430,8 +421,8 @@ namespace torali {
430421
if ((countMap[file_c][itBp->id].ref.size() + countMap[file_c][itBp->id].alt.size()) >= c.maxGenoReadCount) continue;
431422
// Read spans breakpoint?
432423
if ((hasSoftClip) || ((!hasClip) && (rec->core.pos + c.minimumFlankSize + itBp->homLeft <= itBp->bppos) && (rec->core.pos + rec->core.l_qseq >= itBp->bppos + c.minimumFlankSize + itBp->homRight))) {
433-
std::string consProbe = consProbeArr[itBp->bpPoint][itBp->id];
434-
std::string refProbe = refProbeArr[itBp->bpPoint][itBp->id];
424+
auto& consProbe = consProbeArr[itBp->bpPoint][itBp->id];
425+
auto& refProbe = refProbeArr[itBp->bpPoint][itBp->id];
435426

436427
// Get sequence
437428
std::string sequence;
@@ -441,65 +432,36 @@ namespace torali {
441432
_adjustOrientation(sequence, itBp->bpPoint, itBp->svt);
442433

443434
// Compute alignment to alternative haplotype
444-
typedef boost::multi_array<char, 2> TAlign;
445-
TAlign alignAlt;
446-
DnaScore<int> simple(5, -4, -4, -4);
447-
AlignConfig<true, false> semiglobal;
448-
int32_t scoreA = needle(consProbe, sequence, alignAlt, semiglobal, simple);
449-
int32_t scoreAltThreshold = (int32_t) (c.flankQuality * consProbe.size() * simple.match + (1.0 - c.flankQuality) * consProbe.size() * simple.mismatch);
450-
double scoreAlt = (double) scoreA / (double) scoreAltThreshold;
451-
//Using edlib and score threshold of >0.7 below
452-
//double scoreAlt = ((1.0 - c.flankQuality) * (double) consProbe.size()) / (double) (_editDistanceHW(consProbe, sequence) + 1);
453-
454-
// Compute alignment to reference haplotype
455-
TAlign alignRef;
456-
int32_t scoreR = needle(refProbe, sequence, alignRef, semiglobal, simple);
457-
int32_t scoreRefThreshold = (int32_t) (c.flankQuality * refProbe.size() * simple.match + (1.0 - c.flankQuality) * refProbe.size() * simple.mismatch);
458-
double scoreRef = (double) scoreR / (double) scoreRefThreshold;
459-
//Using edlib and score threshold of >0.7 below
460-
//double scoreRef = ((1.0 - c.flankQuality) * (double) refProbe.size()) / (double) (_editDistanceHW(refProbe, sequence) + 1);
461-
// Any confident alignment?
462-
if ((scoreRef > 1) || (scoreAlt > 1)) {
463-
// Debug alignment to REF and ALT
464-
//std::cerr << "Alt:\t" << scoreAlt << "\tRef:\t" << scoreRef << std::endl;
465-
//for(TAIndex i = 0; i< (TAIndex) alignAlt.shape()[0]; ++i) {
466-
//for(TAIndex j = 0; j< (TAIndex) alignAlt.shape()[1]; ++j) std::cerr << alignAlt[i][j];
467-
//std::cerr << std::endl;
468-
//}
469-
//for(TAIndex i = 0; i< (TAIndex) alignRef.shape()[0]; ++i) {
470-
//for(TAIndex j = 0; j< (TAIndex) alignRef.shape()[1]; ++j) std::cerr << alignRef[i][j];
471-
//std::cerr << std::endl;
472-
//}
473-
474-
if (scoreRef > scoreAlt) {
475-
// Account for reference bias
476-
if (++refAlignedReadCount[file_c][itBp->id] % 2) {
477-
TQuality quality;
478-
quality.resize(rec->core.l_qseq);
479-
const uint8_t* qualptr = bam_get_qual(rec);
480-
for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i];
481-
uint32_t rq = _getAlignmentQual(alignRef, quality);
482-
//uint32_t rq = scoreRef * 35;
483-
if (rq >= c.minGenoQual) {
484-
countMap[file_c][itBp->id].ref.push_back((uint8_t) std::min(rq, (uint32_t) rec->core.qual));
435+
if (c.hasDumpFile) {
436+
double scoreAlt = _editDistanceHW(c, consProbe, sequence);
437+
double scoreRef = _editDistanceHW(c, refProbe, sequence);
438+
if ((scoreRef > 0.7) || (scoreAlt > 0.7)) {
439+
if (scoreRef > scoreAlt) {
440+
// Account for reference bias
441+
if (++refAlignedReadCount[file_c][itBp->id] % 2) {
442+
countMap[file_c][itBp->id].ref.push_back((uint8_t) std::min(255, std::min((int) (scoreRef * 35), (int) rec->core.qual)));
485443
}
444+
} else {
445+
countMap[file_c][itBp->id].alt.push_back((uint8_t) std::min(255, std::min((int) (scoreAlt * 35), (int) rec->core.qual)));
446+
447+
std::string svid(_addID(itBp->svt));
448+
std::string padNumber = boost::lexical_cast<std::string>(itBp->id);
449+
padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0');
450+
svid += padNumber;
451+
dumpOut << svid << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tSR" << std::endl;
486452
}
487-
} else {
488-
TQuality quality;
489-
quality.resize(rec->core.l_qseq);
490-
const uint8_t* qualptr = bam_get_qual(rec);
491-
for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i];
492-
uint32_t aq = _getAlignmentQual(alignAlt, quality);
493-
//uint32_t aq = scoreAlt * 35;
494-
if (aq >= c.minGenoQual) {
495-
if (c.hasDumpFile) {
496-
std::string svid(_addID(itBp->svt));
497-
std::string padNumber = boost::lexical_cast<std::string>(itBp->id);
498-
padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0');
499-
svid += padNumber;
500-
dumpOut << svid << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tSR" << std::endl;
453+
}
454+
} else {
455+
double scoreAlt = _editDistanceHW(c, consProbe, sequence);
456+
double scoreRef = _editDistanceHW(c, refProbe, sequence);
457+
if ((scoreRef > 0.7) || (scoreAlt > 0.7)) {
458+
if (scoreRef > scoreAlt) {
459+
// Account for reference bias
460+
if (++refAlignedReadCount[file_c][itBp->id] % 2) {
461+
countMap[file_c][itBp->id].ref.push_back((uint8_t) std::min(255, std::min((int) (scoreRef * 35), (int) rec->core.qual)));
501462
}
502-
countMap[file_c][itBp->id].alt.push_back((uint8_t) std::min(aq, (uint32_t) rec->core.qual));
463+
} else {
464+
countMap[file_c][itBp->id].alt.push_back((uint8_t) std::min(255, std::min((int) (scoreAlt * 35), (int) rec->core.qual)));
503465
}
504466
}
505467
}

src/util.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -178,14 +178,6 @@ namespace torali
178178
std::cerr << std::endl;
179179
}
180180

181-
inline int32_t
182-
_editDistanceHW(std::string const& query, std::string const& target) {
183-
EdlibAlignResult align = edlibAlign(query.c_str(), query.size(), target.c_str(), target.size(), edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE, NULL, 0));
184-
int32_t ed = align.editDistance;
185-
edlibFreeAlignResult(align);
186-
return ed;
187-
}
188-
189181
template<typename TConfig>
190182
inline void
191183
checkSampleNames(TConfig& c) {

0 commit comments

Comments
 (0)