libStatGen Software 1
Loading...
Searching...
No Matches
Cigar.cpp
1/*
2 * Copyright (C) 2010-2011 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <stdio.h>
19#include <stdlib.h>
20#include "Cigar.h"
21#include "STLUtilities.h"
22
23// Initialize INDEX_NA.
24const int32_t Cigar::INDEX_NA = -1;
25
26
27////////////////////////////////////////////////////////////////////////
28//
29// Cigar Class
30//
31
32//
33// Set the passed in string to the string reprentation of the Cigar operations
34// in this object.
35//
36void Cigar::getCigarString(std::string& cigarString) const
37{
38 using namespace STLUtilities;
39
40 std::vector<CigarOperator>::const_iterator i;
41
42 cigarString.clear(); // clear result string
43
44 // Progressively append the character representations of the operations to
45 // the cigar string.
46 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
47 {
48 cigarString << (*i).count << (*i).getChar();
49 }
50}
51
52void Cigar::getCigarString(String& cigarString) const
53{
54 std::string cigar;
55
56 getCigarString(cigar);
57
58 cigarString = cigar.c_str();
59
60 return;
61}
62
63void Cigar::getExpandedString(std::string &s) const
64{
65 s = "";
66
67 std::vector<CigarOperator>::const_iterator i;
68
69 // Progressively append the character representations of the operations to
70 // the string passed in
71
72 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
73 {
74 for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar();
75 }
76 return;
77}
78
79
80bool Cigar::operator == (Cigar &rhs) const
81{
82
83 if (this->size() != rhs.size()) return false;
84
85 for (int i = 0; i < this->size(); i++)
86 {
87 if (cigarOperations[i]!=rhs.cigarOperations[i]) return false;
88 }
89 return true;
90}
91
92
93// return the length of the read that corresponds to
94// the current CIGAR string.
96{
97 int matchCount = 0;
98 std::vector<CigarOperator>::const_iterator i;
99 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
100 {
101 switch (i->operation)
102 {
103 case match:
104 case mismatch:
105 case softClip:
106 case insert:
107 matchCount += i->count;
108 break;
109 default:
110 // we only care about operations that are in the query sequence.
111 break;
112 }
113 }
114 return matchCount;
115}
116
117
118// return the number of bases in the reference that
119// this read "spans"
121{
122 int matchCount = 0;
123 std::vector<CigarOperator>::const_iterator i;
124 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
125 {
126 switch (i->operation)
127 {
128 case match:
129 case mismatch:
130 case del:
131 case skip:
132 matchCount += i->count;
133 break;
134 default:
135 // we only care about operations that are in the reference sequence.
136 break;
137 }
138 }
139 return matchCount;
140}
141
142
143// Return the number of clips that are at the beginning of the cigar.
145{
146 int numBeginClips = 0;
147 for (unsigned int i = 0; i != cigarOperations.size(); i++)
148 {
149 if ((cigarOperations[i].operation == softClip) ||
150 (cigarOperations[i].operation == hardClip))
151 {
152 // Clipping operator, increment the counter.
153 numBeginClips += cigarOperations[i].count;
154 }
155 else
156 {
157 // Break out of the loop since a non-clipping operator was found.
158 break;
159 }
160 }
161 return(numBeginClips);
162}
163
164
165// Return the number of clips that are at the end of the cigar.
167{
168 int numEndClips = 0;
169 for (int i = (cigarOperations.size() - 1); i >= 0; i--)
170 {
171 if ((cigarOperations[i].operation == softClip) ||
172 (cigarOperations[i].operation == hardClip))
173 {
174 // Clipping operator, increment the counter.
175 numEndClips += cigarOperations[i].count;
176 }
177 else
178 {
179 // Break out of the loop since a non-clipping operator was found.
180 break;
181 }
182 }
183 return(numEndClips);
184}
185
186
187int32_t Cigar::getRefOffset(int32_t queryIndex)
188{
189 // If the vectors aren't set, set them.
190 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
191 {
192 setQueryAndReferenceIndexes();
193 }
194 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
195 {
196 return(INDEX_NA);
197 }
198 return(queryToRef[queryIndex]);
199}
200
201
202int32_t Cigar::getQueryIndex(int32_t refOffset)
203{
204 // If the vectors aren't set, set them.
205 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
206 {
207 setQueryAndReferenceIndexes();
208 }
209 if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
210 {
211 return(INDEX_NA);
212 }
213 return(refToQuery[refOffset]);
214}
215
216
217int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos)
218{
219 // If the vectors aren't set, set them.
220 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
221 {
222 setQueryAndReferenceIndexes();
223 }
224 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
225 {
226 return(INDEX_NA);
227 }
228
229 if (queryToRef[queryIndex] != INDEX_NA)
230 {
231 return(queryToRef[queryIndex] + queryStartPos);
232 }
233 return(INDEX_NA);
234}
235
236
237// Return the query index associated with the specified reference position
238// when the query starts at the specified reference position based on
239// this cigar.
240int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos)
241{
242 // If the vectors aren't set, set them.
243 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
244 {
245 setQueryAndReferenceIndexes();
246 }
247
248 int32_t refOffset = refPosition - queryStartPos;
249 if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
250 {
251 return(INDEX_NA);
252 }
253
254 return(refToQuery[refOffset]);
255}
256
257
259{
260 // If the vectors aren't set, set them.
261 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
262 {
263 setQueryAndReferenceIndexes();
264 }
265 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToCigar.size()))
266 {
267 return(INDEX_NA);
268 }
269 return(queryToCigar[queryIndex]);
270}
271
272
274{
275 // If the vectors aren't set, set them.
276 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
277 {
278 setQueryAndReferenceIndexes();
279 }
280 if ((refOffset < 0) || ((uint32_t)refOffset >= refToCigar.size()))
281 {
282 return(INDEX_NA);
283 }
284 return(refToCigar[refOffset]);
285}
286
287
288int32_t Cigar::getExpandedCigarIndexFromRefPos(int32_t refPosition,
289 int32_t queryStartPos)
290{
291 return(getExpandedCigarIndexFromRefOffset(refPosition - queryStartPos));
292}
293
294
295char Cigar::getCigarCharOp(int32_t expandedCigarIndex)
296{
297 // Check if the expanded cigar has been set yet
298 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
299 {
300 // Set the expanded cigar.
301 setQueryAndReferenceIndexes();
302 }
303
304 // Check to see if the index is in range.
305 if((expandedCigarIndex < 0) ||
306 ((uint32_t)expandedCigarIndex >= myExpandedCigar.length()))
307 {
308 return('?');
309 }
310 return(myExpandedCigar[expandedCigarIndex]);
311}
312
313
315{
317}
318
319
321{
323}
324
325
326char Cigar::getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
327{
328 return(getCigarCharOp(getExpandedCigarIndexFromRefPos(refPosition, queryStartPos)));
329}
330
331
332// Return the number of bases that overlap the reference and the
333// read associated with this cigar that falls within the specified region.
334uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end,
335 int32_t queryStartPos)
336{
337 // Get the overlap info.
338 if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
339 {
340 setQueryAndReferenceIndexes();
341 }
342
343 // Get the start and end offsets.
344 int32_t startRefOffset = 0;
345 // If the specified start is more than the queryStartPos, set
346 // the startRefOffset to the appropriate non-zero value.
347 // (if start is <= queryStartPos, than startRefOffset is 0 - it should
348 // not be set to a negative value.)
349 if (start > queryStartPos)
350 {
351 startRefOffset = start - queryStartPos;
352 }
353
354 int32_t endRefOffset = end - queryStartPos;
355 if (end == -1)
356 {
357 // -1 means that the region goes to the end of the refrerence.
358 // So set endRefOffset to the max refOffset + 1 which is the
359 // size of the refToQuery vector.
360 endRefOffset = refToQuery.size();
361 }
362
363
364 // if endRefOffset is less than 0, then this read does not fall within
365 // the specified region, so return 0.
366 if (endRefOffset < 0)
367 {
368 return(0);
369 }
370
371 // Get the overlaps for these offsets.
372 // Loop through the read counting positions that match the reference
373 // within this region.
374 int32_t refOffset = 0;
375 int32_t numOverlaps = 0;
376 for (unsigned int queryIndex = 0; queryIndex < queryToRef.size();
377 queryIndex++)
378 {
379 refOffset = getRefOffset(queryIndex);
380 if (refOffset > endRefOffset)
381 {
382 // Past the end of the specified region, so stop checking
383 // for overlaps since there will be no more.
384 break;
385 }
386 else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset))
387 {
388 // within the region, increment the counter.
389 ++numOverlaps;
390 }
391 }
392
393 return(numOverlaps);
394}
395
396
397// Return whether or not the cigar has an indel
399{
400 for(unsigned int i = 0; i < cigarOperations.size(); i++)
401 {
402 if((cigarOperations[i].operation == insert) ||
403 (cigarOperations[i].operation == del))
404 {
405 // Found an indel, so return true.
406 return(true);
407 }
408 }
409 // Went through all the operations, and found no indel, so return false.
410 return(false);
411}
412
413
414// Clear the query index/reference offset index vectors.
415void Cigar::clearQueryAndReferenceIndexes()
416{
417 queryToRef.clear();
418 refToQuery.clear();
419 refToCigar.clear();
420 queryToCigar.clear();
421 myExpandedCigar.clear();
422}
423
424
425///////////////////////////////////////////////////////
426// Set the query index/reference offset index vectors.
427//
428// For Cigar: 3M2I2M1D1M
429// That total count of cigar elements is 9 (3+2+2+1+1)
430//
431// The entries that are valid in the query/reference contain the index/offset
432// where they are found in the query/reference. N/A are marked by 'x':
433// query indexes: 0123456x7
434// ---------
435// reference offsets: 012xx3456
436//
437// This shows what query index is associated with which reference offset and
438// vice versa.
439// For ones where an x appears, -1 would be returned.
440//
441void Cigar::setQueryAndReferenceIndexes()
442{
443 // First ensure that the vectors are clear by clearing them.
444 clearQueryAndReferenceIndexes();
445
446 int extPos = 0;
447
448 // Process each cigar index.
449 for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++)
450 {
451 // Process the cigar operation.
452 switch (cigarOperations[cigarIndex].operation)
453 {
454 case match:
455 case mismatch:
456 // For match/mismatch, update the maps between query
457 // and reference for the number of matches/mismatches.
458 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
459 {
460 // The associated indexes are the next location in
461 // each array, which is equal to the current size.
462 int32_t queryToRefLen = queryToRef.size();
463 int32_t refToQueryLen = refToQuery.size();
464 queryToRef.push_back(refToQueryLen);
465 refToQuery.push_back(queryToRefLen);
466 refToCigar.push_back(extPos);
467 queryToCigar.push_back(extPos++);
468 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
469 }
470 break;
471 case insert:
472 case softClip:
473 // Add N/A reference offset for each query index that this
474 // insert covers.
475 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
476 {
477 queryToRef.push_back(INDEX_NA);
478 queryToCigar.push_back(extPos++);
479 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
480 }
481 break;
482 case del:
483 case skip:
484 // Add N/A query index for each reference offset that this
485 // deletion/skip covers.
486 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
487 {
488 refToQuery.push_back(INDEX_NA);
489 refToCigar.push_back(extPos++);
490 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
491 }
492 break;
493 case hardClip:
494 case pad:
495 case none:
496 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
497 {
498 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
499 ++extPos;
500 }
501 break;
502 };
503 }
504}
505
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition Cigar.h:84
int size() const
Return the number of cigar operations.
Definition Cigar.h:364
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition Cigar.cpp:95
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition Cigar.h:492
char getCigarCharOp(int32_t expandedCigarIndex)
Return the character code of the cigar operator associated with the specified expanded CIGAR index.
Definition Cigar.cpp:295
char getCigarCharOpFromQueryIndex(int32_t queryIndex)
Return the character code of the cigar operator associated with the specified queryIndex.
Definition Cigar.cpp:314
int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex)
Returns the index into the expanded cigar for the cigar associated with the specified queryIndex.
Definition Cigar.cpp:258
int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos)
Returns the index into the expanded cigar for the cigar associated with the specified reference posit...
Definition Cigar.cpp:288
void getExpandedString(std::string &s) const
Sets the specified string to a valid CIGAR string of characters that represent the cigar with no digi...
Definition Cigar.cpp:63
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition Cigar.cpp:144
bool hasIndel()
Return whether or not the cigar has indels (insertions or delections)
Definition Cigar.cpp:398
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition Cigar.cpp:166
char getCigarCharOpFromRefOffset(int32_t refOffset)
Return the character code of the cigar operator associated with the specified reference offset.
Definition Cigar.cpp:320
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition Cigar.cpp:120
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition Cigar.cpp:217
int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset)
Returns the index into the expanded cigar for the cigar associated with the specified reference offse...
Definition Cigar.cpp:273
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition Cigar.h:90
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition Cigar.h:89
@ pad
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition Cigar.h:96
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition Cigar.h:94
@ none
no operation has been set.
Definition Cigar.h:88
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition Cigar.cpp:187
bool operator==(Cigar &rhs) const
Return true if the 2 Cigars are the same (the same operations of the same sizes).
Definition Cigar.cpp:80
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition Cigar.cpp:334
char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
Return the character code of the cigar operator associated with the specified reference position.
Definition Cigar.cpp:326
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object.
Definition Cigar.cpp:52
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition Cigar.cpp:202
This file is inspired by the poor quality of string support in STL for what should be trivial capabil...