libStatGen Software 1
Loading...
Searching...
No Matches
SamInterface.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "SamInterface.h"
19#include "SamRecordHelper.h"
20
21#include <limits>
22#include <stdint.h>
23
24SamInterface::SamInterface()
25 : myFirstRecord("")
26{
27}
28
29
30SamInterface::~SamInterface()
31{
32}
33
34
35// Read a SAM file's header.
36bool SamInterface::readHeader(IFILE filePtr, SamFileHeader& header,
37 SamStatus& status)
38{
39 if(filePtr == NULL)
40 {
41 // File is not open.
43 "Cannot read header since the file pointer is null");
44 return(false);
45 }
46
47 // Clear the passed in header.
48 header.resetHeader();
49
50 int numValid = 0;
51 int numInvalid = 0;
52 std::string errorMessages = "";
53
54 do {
55 StringIntHash tags;
56 StringArray values;
57 buffer.ReadLine(filePtr);
58
59 // Stop reading header lines if at the end of the file or
60 // if the line is not blank and does not start with an @.
61 if ( ifeof(filePtr) ||
62 ((buffer.Length() != 0) && (buffer[0] != '@')) )
63 {
64 break;
65 }
66
67 // This is a header line, so add it to header.
68 if(header.addHeaderLine(buffer.c_str()))
69 {
70 if(buffer.Length() != 0)
71 {
72 ++numValid;
73 }
74 }
75 else
76 {
77 ++numInvalid;
78 // Failed reading the header.
79 errorMessages += header.getErrorMessage();
80 // Skip further processing on this line since it was an error.
81 continue;
82 }
83 } while (1);
84
85 // Store the first record since it was read.
86 myFirstRecord = buffer;
87
88 if(numInvalid > 0)
89 {
90 if(numValid == 0)
91 {
92 std::cerr << "Failed to parse " << numInvalid << " header lines";
93 std::cerr << ". No valid header lines.\n";
94 status.setStatus(SamStatus::FAIL_PARSE, errorMessages.c_str());
95 return(false);
96 }
97 }
98
99 // Successfully read.
100 return(true);
101}
102
103bool SamInterface::writeHeader(IFILE filePtr, SamFileHeader& header,
104 SamStatus& status)
105{
106 if((filePtr == NULL) || (filePtr->isOpen() == false))
107 {
108 // File is not open, return failure.
110 "Cannot write header since the file pointer is null");
111 return(false);
112 }
113
114 ////////////////////////////////
115 // Write the header to the file.
116 ////////////////////////////////
117 // Construct a string containing the entire header.
118 std::string headerString = "";
119 header.getHeaderString(headerString);
120
121 int32_t headerLen = headerString.length();
122 int numWrite = 0;
123
124 // Write the header to the file.
125 numWrite = ifwrite(filePtr, headerString.c_str(), headerLen);
126 if(numWrite != headerLen)
127 {
129 "Failed to write the SAM header.");
130 return(false);
131 }
132 return(true);
133}
134
135
136void SamInterface::readRecord(IFILE filePtr, SamFileHeader& header,
137 SamRecord& record,
138 SamStatus& samStatus)
139{
140 // Initialize the status to success - will be set to false on failure.
141 samStatus = SamStatus::SUCCESS;
142
143 if((filePtr == NULL) || (filePtr->isOpen() == false))
144 {
145 // File is not open.
147 "filePtr does not point to an open file.");
148 return;
149 }
150
151 // If the first record has been set, use that and clear it,
152 // otherwise read the record from the file.
153 if(myFirstRecord.Length() != 0)
154 {
155 buffer = myFirstRecord;
156 myFirstRecord.Clear();
157 }
158 else
159 {
160 // Read the next record.
161 buffer.Clear();
162 buffer.ReadLine(filePtr);
163 // If the end of the file and nothing was read, return false.
164 if ((ifeof(filePtr)) && (buffer.Length() == 0))
165 {
166 // end of the file and nothing to process.
168 "No more records in the file.");
169 return;
170 }
171 }
172
173 tokens.ReplaceColumns(buffer, '\t');
174
175
176 // Error string for reporting a parsing failure.
177 String errorString = "";
178
179 if (tokens.Length() < 11)
180 {
181 errorString = "Too few columns (";
182 errorString += tokens.Length();
183 errorString += ") in the Record, expected at least 11.";
185 errorString.c_str());
186 return;
187 }
188
189 // Reset the record before setting any fields.
190 record.resetRecord();
191
192 if(!record.setReadName(tokens[0]))
193 {
194 samStatus.addError(record.getStatus());
195 }
196
197 long flagInt = 0;
198 if(!tokens[1].AsInteger(flagInt))
199 {
200 errorString = "flag, ";
201 errorString += tokens[1].c_str();
202 errorString += ", is not an integer.";
204 errorString.c_str());
205 }
206 else if((flagInt < 0) || (flagInt > UINT16_MAX))
207 {
208 errorString = "flag, ";
209 errorString += tokens[1].c_str();
210 errorString += ", is not between 0 and (2^16)-1 = 65535.";
212 errorString.c_str());
213 }
214 else if(!record.setFlag(flagInt))
215 {
216 samStatus.addError(record.getStatus().getStatus(),
217 record.getStatus().getStatusMessage());
218 }
219
220 if(!record.setReferenceName(header, tokens[2]))
221 {
222 samStatus.addError(record.getStatus().getStatus(),
223 record.getStatus().getStatusMessage());
224 }
225
226 long posInt = 0;
227 if(!tokens[3].AsInteger(posInt))
228 {
229 errorString = "position, ";
230 errorString += tokens[3].c_str();
231 errorString += ", is not an integer.";
233 errorString.c_str());
234 }
235 else if((posInt < INT32_MIN) || (posInt > INT32_MAX))
236 {
237 // If it is not in this range, it cannot fit into a 32 bit int.
238 errorString = "position, ";
239 errorString += tokens[3].c_str();
240 errorString += ", does not fit in a 32 bit signed int.";
242 errorString.c_str());
243 }
244 else if(!record.set1BasedPosition(posInt))
245 {
246 samStatus.addError(record.getStatus().getStatus(),
247 record.getStatus().getStatusMessage());
248 }
249
250 long mapInt = 0;
251 if(!tokens[4].AsInteger(mapInt))
252 {
253 errorString = "map quality, ";
254 errorString += tokens[4].c_str();
255 errorString += ", is not an integer.";
257 errorString.c_str());
258 }
259 else if((mapInt < 0) || (mapInt > UINT8_MAX))
260 {
261 errorString = "map quality, ";
262 errorString += tokens[4].c_str();
263 errorString += ", is not between 0 and (2^8)-1 = 255.";
265 errorString.c_str());
266 }
267 else if(!record.setMapQuality(mapInt))
268 {
269 samStatus.addError(record.getStatus().getStatus(),
270 record.getStatus().getStatusMessage());
271 }
272
273 if(!record.setCigar(tokens[5]))
274 {
275 samStatus.addError(record.getStatus().getStatus(),
276 record.getStatus().getStatusMessage());
277 }
278
279 if(!record.setMateReferenceName(header, tokens[6]))
280 {
281 samStatus.addError(record.getStatus().getStatus(),
282 record.getStatus().getStatusMessage());
283 }
284
285 long matePosInt = 0;
286 if(!tokens[7].AsInteger(matePosInt))
287 {
288 errorString = "mate position, ";
289 errorString += tokens[7].c_str();
290 errorString += ", is not an integer.";
292 errorString.c_str());
293 }
294 else if(!record.set1BasedMatePosition(matePosInt))
295 {
296 samStatus.addError(record.getStatus().getStatus(),
297 record.getStatus().getStatusMessage());
298 }
299
300 long insertInt = 0;
301 if(!tokens[8].AsInteger(insertInt))
302 {
303 errorString = "insert size, ";
304 errorString += tokens[8].c_str();
305 errorString += ", is not an integer.";
307 errorString.c_str());
308 }
309 else if(!record.setInsertSize(insertInt))
310 {
311 samStatus.addError(record.getStatus().getStatus(),
312 record.getStatus().getStatusMessage());
313 }
314
315 if(!record.setSequence(tokens[9]))
316 {
317 samStatus.addError(record.getStatus().getStatus(),
318 record.getStatus().getStatusMessage());
319 }
320
321 if(!record.setQuality(tokens[10]))
322 {
323 samStatus.addError(record.getStatus().getStatus(),
324 record.getStatus().getStatusMessage());
325 }
326
327 // Clear the tag fields.
328 record.clearTags();
329
330 // Add the tags to the record.
331 for (int i = 11; i < tokens.Length(); i++)
332 {
333 String & nugget = tokens[i];
334
335 if (nugget.Length() < 6 || nugget[2] != ':' || nugget[4] != ':')
336 {
337 // invalid tag format.
338 errorString = "Invalid Tag Format: ";
339 errorString += nugget.c_str();
340 errorString += ", should be cc:c:x*.";
342 errorString.c_str());
343 continue;
344 }
345
346 // Valid tag format.
347 // Add the tag.
348 if(!record.addTag((const char *)nugget, nugget[3],
349 (const char *)nugget + 5))
350 {
351 samStatus.addError(record.getStatus().getStatus(),
352 record.getStatus().getStatusMessage());
353 }
354 }
355
356 return;
357}
358
359
360SamStatus::Status SamInterface::writeRecord(IFILE filePtr,
361 SamFileHeader& header,
362 SamRecord& record,
364{
365 // Store all the fields into a string, then write the string.
366 String recordString = record.getReadName();
367 recordString += "\t";
368 recordString += record.getFlag();
369 recordString += "\t";
370 recordString += record.getReferenceName();
371 recordString += "\t";
372 recordString += record.get1BasedPosition();
373 recordString += "\t";
374 recordString += record.getMapQuality();
375 recordString += "\t";
376 recordString += record.getCigar();
377 recordString += "\t";
378 recordString += record.getMateReferenceNameOrEqual();
379 recordString += "\t";
380 recordString += record.get1BasedMatePosition();
381 recordString += "\t";
382 recordString += record.getInsertSize();
383 recordString += "\t";
384 recordString += record.getSequence(translation);
385 recordString += "\t";
386 recordString += record.getQuality();
387
388 // If there are any tags, add a preceding tab.
389 if(record.getTagLength() != 0)
390 {
391 recordString += "\t";
392 SamRecordHelper::genSamTagsString(record, recordString);
393 }
394
395 recordString += "\n";
396
397
398 // Write the record.
399 ifwrite(filePtr, recordString.c_str(), recordString.Length());
400 return(SamStatus::SUCCESS);
401}
402
403
404void SamInterface::ParseHeaderLine(StringIntHash & tags, StringArray & values)
405{
406 tags.Clear();
407 values.Clear();
408
409 tokens.AddColumns(buffer, '\t');
410
411 for (int i = 1; i < tokens.Length(); i++)
412 {
413 tags.Add(tokens[i].Left(2), i - 1);
414 values.Push(tokens[i].SubStr(3));
415 }
416}
417
418
419bool SamInterface::isEOF(IFILE filePtr)
420{
421
422 if(myFirstRecord.Length() != 0)
423 {
424 // First record is set, so return false, not EOF, since we
425 // know the record still needs to be processed.
426 return(false);
427 }
428 return(GenericSamInterface::isEOF(filePtr));
429}
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition InputFile.h:654
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition InputFile.h:669
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition InputFile.h:37
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition InputFile.h:423
This class allows a user to get/set the fields in a SAM/BAM Header.
const char * getErrorMessage()
Get the failure message if a method returned failure.
bool addHeaderLine(const char *type, const char *tag, const char *value)
Add a header line that is just one tag with a const char* value.
bool getHeaderString(std::string &header) const
Set the passed in string to the entire header string, clearing its current contents.
void resetHeader()
Initialize the header.
static bool genSamTagsString(SamRecord &record, String &returnString, char delim='\t')
Helper to append the SAM string representation of all the tags to the specified string.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition SamRecord.h:57
bool setReadName(const char *readName)
Set QNAME to the passed in name.
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
void clearTags()
Clear the tags in this record.
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
void resetRecord()
Reset the fields of the record to a default value.
Definition SamRecord.cpp:91
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
uint16_t getFlag()
Get the flag (FLAG).
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
const char * getCigar()
Returns the SAM formatted CIGAR string.
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
This class is used to track the status results of some methods in the BAM classes.
const char * getStatusMessage() const
Return the status message for this object.
Status
Return value enum for StatGenFile methods.
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
@ SUCCESS
method completed successfully.
@ FAIL_IO
method failed due to an I/O issue.
@ FAIL_PARSE
failed to parse a record/header - invalid format.
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
Status getStatus() const
Return the enum for this status object.
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...