libStatGen Software 1
|
Create/Access/Modify/Load Genome Sequences stored as binary mapped files. More...
#include <GenomeSequence.h>
Public Member Functions | |
GenomeSequence () | |
Simple constructor - no implicit file open. | |
void | constructorClear () |
GenomeSequence (std::string &referenceFilename) | |
attempt to open an existing sequence | |
GenomeSequence (const char *referenceFilename) | |
Smarter constructor - attempt to open an existing sequence. | |
~GenomeSequence () | |
Close the file if open and destroy the object. | |
bool | open (bool isColorSpace=false, int flags=O_RDONLY) |
open the reference specified using GenomeSequence::setReferenceName | |
bool | open (const char *filename, int flags=O_RDONLY) |
open the given file as the genome (no filename munging occurs). | |
bool | create (bool isColor=false) |
void | setProgressStream (std::ostream &progressStream) |
if set, then show progress when creating and pre-fetching | |
void | setColorSpace (bool colorSpace) |
void | setSearchCommonFileSuffix (bool searchCommonFileSuffix) |
void | setCreateOverwrite (bool createOverwrite) |
bool | loadFastaData (const char *filename) |
bool | setReferenceName (std::string referenceFilename) |
set the reference name that will be used in open() | |
void | setApplication (std::string application) |
set the application name in the binary file header | |
const std::string & | getFastaName () const |
const std::string & | getReferenceName () const |
bool | isColorSpace () const |
tell us if we are a color space reference or not | |
genomeIndex_t | getNumberBases () const |
return the number of bases represented in this reference | |
int | getChromosome (genomeIndex_t position) const |
given a whole genome index, get the chromosome it is located in | |
int | getChromosome (const char *chromosomeName) const |
given a chromosome name, return the chromosome index | |
int | getChromosomeCount () const |
Return the number of chromosomes in the genome. | |
genomeIndex_t | getChromosomeStart (int chromosomeIndex) const |
given a chromosome, return the genome base it starts in | |
genomeIndex_t | getChromosomeSize (int chromosomeIndex) const |
given a chromosome, return its size in bases | |
genomeIndex_t | getGenomePosition (const char *chromosomeName, unsigned int chromosomeIndex) const |
given a chromosome name and position, return the genome position | |
genomeIndex_t | getGenomePosition (int chromosome, unsigned int chromosomeIndex) const |
given a chromosome index and position, return the genome position | |
genomeIndex_t | getGenomePosition (const char *chromosomeName) const |
given the chromosome name, get the corresponding 0-based genome index for the start of that chromosome | |
genomeIndex_t | getGenomePosition (int chromosomeIndex) const |
const std::string & | getBaseFilename () const |
const char * | getChromosomeName (int chromosomeIndex) const |
void | setDebugFlag (bool d) |
genomeIndex_t | sequenceLength () const |
const char * | chromosomeName (int chr) const |
void | sanityCheck (MemoryMap &fasta) const |
std::string | IntegerToSeq (unsigned int n, unsigned int wordsize) const |
bool | wordMatch (unsigned int index, std::string &word) const |
bool | printNearbyWords (unsigned int index, unsigned int variance, std::string &word) const |
char | BasePair (char c) const |
void | dumpSequenceSAMDictionary (std::ostream &) const |
void | dumpHeaderTSV (std::ostream &) const |
char | operator[] (genomeIndex_t index) const |
Return the base, in base space or color space, for a within-range index. | |
char | getBase (const char *chromosomeName, unsigned int chromosomeIndex) const |
given a chromosome name and 1-based position, return the reference base. | |
uint8_t | getInteger (genomeIndex_t index) const |
void | set (genomeIndex_t index, char value) |
uint8_t * | getDataPtr (genomeIndex_t index) |
obtain the pointer to the raw data for other access methods | |
void | getReverseRead (std::string &read) |
void | getReverseRead (String &read) |
int | debugPrintReadValidation (std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse=true) |
void | getString (std::string &str, int chromosome, uint32_t index, int baseCount) const |
void | getString (String &str, int chromosome, uint32_t index, int baseCount) const |
void | getString (std::string &str, genomeIndex_t index, int baseCount) const |
void | getString (String &str, genomeIndex_t index, int baseCount) const |
void | getHighLightedString (std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const |
void | print30 (genomeIndex_t) const |
genomeIndex_t | simpleLocalAligner (std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const |
int | getMismatchCount (std::string &read, genomeIndex_t location, char exclude='\0') const |
Return the mismatch count, disregarding CIGAR strings. | |
int | getSumQ (std::string &read, std::string &qualities, genomeIndex_t location) const |
brute force sumQ - no sanity checking | |
void | getMismatchHatString (std::string &result, const std::string &read, genomeIndex_t location) const |
void | getMismatchString (std::string &result, const std::string &read, genomeIndex_t location) const |
void | getChromosomeAndIndex (std::string &, genomeIndex_t) const |
void | getChromosomeAndIndex (String &, genomeIndex_t) const |
bool | checkRead (std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const |
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct. | |
bool | populateDBSNP (mmapArrayBool_t &dbSNP, IFILE inputFile) const |
bool | loadDBSNP (mmapArrayBool_t &dbSNP, const char *inputFileName) const |
user friendly dbSNP loader. | |
![]() | |
void | constructorClear () |
const std::string & | getErrorString () |
arrayHeaderClass & | getHeader () |
void | setContentCookie (uint32_t c) |
void | setContentVersion (uint32_t v) |
elementT | operator[] (indexT i) |
void | set (indexT i, elementT v) |
int | create (const char *file, indexT elementCount, int optionalHeaderCount=0) |
Create a vector with elementCount members. | |
int | create (indexT elementCount, int optionalHeaderCount=0) |
allow anonymous (malloc) create. | |
bool | open (const char *file, int flags=O_RDONLY) |
open a previously created mapped vector | |
bool | close () |
void | debugPrint (FILE *f) |
size_t | getElementCount () const |
![]() | |
void | debug_print () |
void | constructor_clear () |
void | destructor_clear () |
virtual bool | allocate () |
virtual bool | create (const char *file, size_t size) |
create the memory mapped file on disk | |
virtual bool | create (size_t size) |
store in allocated memory (malloc), not mmap: | |
bool | close () |
void | test () |
size_t | length () |
char | operator[] (unsigned int index) |
int | prefetch () |
void | useMemoryMap (bool flag=true) |
Additional Inherited Members | |
![]() | |
void * | data |
![]() | |
arrayHeaderClass * | header |
char * | data |
std::string | errorStr |
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
GenomeSequence is designed to be a high performance shared access reference object.
It is implemented as a MemoryMapArray template object with unsigned 8 bit ints, each of which stores two bases. Although 2 bits could be used, most references have more than four symbols (usually at least including 'N', indicating an unknown or masked out base).
Normal use of this class follows these steps:
Sharing is accomplished using the mmap() function via the MemoryMap base class. This allows a potentially large genome reference to be shared among a number of simultaneously executing instances of one or more programs sharing the same reference.
Definition at line 99 of file GenomeSequence.h.
GenomeSequence::GenomeSequence | ( | ) |
Simple constructor - no implicit file open.
Definition at line 139 of file GenomeSequence.cpp.
|
inline |
attempt to open an existing sequence
referenceFilename | the name of the reference fasta file to open |
debug | if true, additional debug information is printed |
Definition at line 128 of file GenomeSequence.h.
|
inline |
Smarter constructor - attempt to open an existing sequence.
referenceFilename | the name of the reference fasta file to open |
debug | if true, additional debug information is printed |
Definition at line 138 of file GenomeSequence.h.
GenomeSequence::~GenomeSequence | ( | ) |
Close the file if open and destroy the object.
Definition at line 169 of file GenomeSequence.cpp.
|
inline |
Definition at line 319 of file GenomeSequence.h.
bool GenomeSequence::checkRead | ( | std::string & | read, |
std::string & | qualities, | ||
std::string & | cigar, | ||
int & | sumQ, | ||
int & | gapOpenCount, | ||
int & | gapExtendCount, | ||
int & | gapDeleteCount, | ||
std::string & | result | ||
) | const |
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct.
read | the read in base space |
qualities | the phred encoded qualities (Sanger, not Illumina) |
cigar | the SAM file CIGAR column |
sumQ | if >0 on entry, is checked against the computed sumQ |
insertions | count of insertions found in |
|
inline |
Definition at line 305 of file GenomeSequence.h.
void GenomeSequence::constructorClear | ( | ) |
Definition at line 144 of file GenomeSequence.cpp.
bool GenomeSequence::create | ( | bool | isColor = false | ) |
Definition at line 488 of file GenomeSequence.cpp.
int GenomeSequence::debugPrintReadValidation | ( | std::string & | read, |
std::string & | quality, | ||
char | direction, | ||
genomeIndex_t | readLocation, | ||
int | sumQuality, | ||
int | mismatchCount, | ||
bool | recurse = true |
||
) |
Definition at line 855 of file GenomeSequence.cpp.
void GenomeSequence::dumpHeaderTSV | ( | std::ostream & | file | ) | const |
Definition at line 976 of file GenomeSequence.cpp.
void GenomeSequence::dumpSequenceSAMDictionary | ( | std::ostream & | file | ) | const |
Definition at line 960 of file GenomeSequence.cpp.
|
inline |
given a chromosome name and 1-based position, return the reference base.
chromosomeName | name of the chromosome - exact match only |
chromosomeIndex | 1-based chromosome position |
Definition at line 388 of file GenomeSequence.h.
References getGenomePosition().
Referenced by PileupElement::getRefBase().
|
inline |
Definition at line 285 of file GenomeSequence.h.
int GenomeSequence::getChromosome | ( | const char * | chromosomeName | ) | const |
given a chromosome name, return the chromosome index
This is done via a linear search of the chromosome table in the header of the mapped file, so it is O(N)
chromosomeName | the name of the chromosome - exact match only |
Definition at line 814 of file GenomeSequence.cpp.
int GenomeSequence::getChromosome | ( | genomeIndex_t | position | ) | const |
given a whole genome index, get the chromosome it is located in
This is done via a binary search of the chromosome table in the header of the mapped file, so it is O(log(N))
position | 0-based position of the base in the genome |
Definition at line 737 of file GenomeSequence.cpp.
Referenced by getGenomePosition().
void GenomeSequence::getChromosomeAndIndex | ( | std::string & | s, |
genomeIndex_t | i | ||
) | const |
Definition at line 1165 of file GenomeSequence.cpp.
void GenomeSequence::getChromosomeAndIndex | ( | String & | s, |
genomeIndex_t | i | ||
) | const |
Definition at line 1189 of file GenomeSequence.cpp.
int GenomeSequence::getChromosomeCount | ( | ) | const |
Return the number of chromosomes in the genome.
Definition at line 731 of file GenomeSequence.cpp.
|
inline |
Definition at line 290 of file GenomeSequence.h.
|
inline |
given a chromosome, return its size in bases
chromosomeIndex | 0-based chromosome index |
Definition at line 256 of file GenomeSequence.h.
|
inline |
given a chromosome, return the genome base it starts in
chromosomeIndex | 0-based chromosome index |
Definition at line 246 of file GenomeSequence.h.
|
inline |
obtain the pointer to the raw data for other access methods
This is a fairly ugly hack to reach into the raw genome vector, get the byte that encodes two bases, and return a pointer to it. This is used by karma ReadIndexer::getSumQ to compare genome matches by byte (two bases at a time) to speed it up.
Definition at line 422 of file GenomeSequence.h.
|
inline |
Definition at line 198 of file GenomeSequence.h.
genomeIndex_t GenomeSequence::getGenomePosition | ( | const char * | chromosomeName | ) | const |
given the chromosome name, get the corresponding 0-based genome index for the start of that chromosome
Definition at line 807 of file GenomeSequence.cpp.
References getChromosome().
genomeIndex_t GenomeSequence::getGenomePosition | ( | const char * | chromosomeName, |
unsigned int | chromosomeIndex | ||
) | const |
given a chromosome name and position, return the genome position
chromosomeName | name of the chromosome - exact match only |
chromosomeIndex | 1-based chromosome position |
Definition at line 779 of file GenomeSequence.cpp.
References getGenomePosition().
Referenced by SamTags::createMDTag(), getBase(), getGenomePosition(), SamQuerySeqWithRefIter::reset(), SamQuerySeqWithRef::seqWithEquals(), and SamQuerySeqWithRef::seqWithoutEquals().
genomeIndex_t GenomeSequence::getGenomePosition | ( | int | chromosome, |
unsigned int | chromosomeIndex | ||
) | const |
given a chromosome index and position, return the genome position
chromosome | index of the chromosome |
chromosomeIndex | 1-based chromosome position |
Definition at line 788 of file GenomeSequence.cpp.
void GenomeSequence::getHighLightedString | ( | std::string & | str, |
genomeIndex_t | index, | ||
int | baseCount, | ||
genomeIndex_t | highLightStart, | ||
genomeIndex_t | highLightEnd | ||
) | const |
Definition at line 1046 of file GenomeSequence.cpp.
|
inline |
Definition at line 402 of file GenomeSequence.h.
|
inline |
Return the mismatch count, disregarding CIGAR strings.
read | is the sequence we're counting mismatches in |
location | is where in the genome we start comparing |
exclude | is a wildcard character (e.g. '.' or 'N') |
Definition at line 488 of file GenomeSequence.h.
void GenomeSequence::getMismatchHatString | ( | std::string & | result, |
const std::string & | read, | ||
genomeIndex_t | location | ||
) | const |
Definition at line 1085 of file GenomeSequence.cpp.
void GenomeSequence::getMismatchString | ( | std::string & | result, |
const std::string & | read, | ||
genomeIndex_t | location | ||
) | const |
Definition at line 1097 of file GenomeSequence.cpp.
|
inline |
return the number of bases represented in this reference
Definition at line 216 of file GenomeSequence.h.
Referenced by loadDBSNP(), and operator[]().
|
inline |
Definition at line 202 of file GenomeSequence.h.
void GenomeSequence::getReverseRead | ( | std::string & | read | ) |
Definition at line 831 of file GenomeSequence.cpp.
void GenomeSequence::getReverseRead | ( | String & | read | ) |
Definition at line 841 of file GenomeSequence.cpp.
void GenomeSequence::getString | ( | std::string & | str, |
genomeIndex_t | index, | ||
int | baseCount | ||
) | const |
Definition at line 1018 of file GenomeSequence.cpp.
void GenomeSequence::getString | ( | std::string & | str, |
int | chromosome, | ||
uint32_t | index, | ||
int | baseCount | ||
) | const |
Definition at line 1001 of file GenomeSequence.cpp.
void GenomeSequence::getString | ( | String & | str, |
genomeIndex_t | index, | ||
int | baseCount | ||
) | const |
Definition at line 1039 of file GenomeSequence.cpp.
void GenomeSequence::getString | ( | String & | str, |
int | chromosome, | ||
uint32_t | index, | ||
int | baseCount | ||
) | const |
Definition at line 1011 of file GenomeSequence.cpp.
|
inline |
brute force sumQ - no sanity checking
read | shotgun sequencer read string |
qualities | phred quality string of same length |
location | the alignment location to check sumQ |
Definition at line 501 of file GenomeSequence.h.
std::string GenomeSequence::IntegerToSeq | ( | unsigned int | n, |
unsigned int | wordsize | ||
) | const |
Definition at line 118 of file GenomeSequence.cpp.
|
inline |
tell us if we are a color space reference or not
Definition at line 209 of file GenomeSequence.h.
Referenced by open(), and operator[]().
bool GenomeSequence::loadDBSNP | ( | mmapArrayBool_t & | dbSNP, |
const char * | inputFileName | ||
) | const |
user friendly dbSNP loader.
inputFileName | may be empty, point to a text file or a dbSNP vector file |
In all cases, dbSNP is returned the same length as this genome.
When no SNPs are loaded, all values are false.
When a text file is given, the file is parsed as two space-separated columns - the first column is the chromosome name, and the second is the 1-based chromosome position of the SNP.
Definition at line 1301 of file GenomeSequence.cpp.
References MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::create(), getNumberBases(), ifclose(), ifopen(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().
bool GenomeSequence::open | ( | bool | isColorSpace = false , |
int | flags = O_RDONLY |
||
) |
open the reference specified using GenomeSequence::setReferenceName
isColorSpace | open the color space reference |
flags | pass through to the open() call (O_RDWR lets you modify the contents) |
Definition at line 182 of file GenomeSequence.cpp.
References isColorSpace(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().
|
inlinevirtual |
open the given file as the genome (no filename munging occurs).
filename | the name of the file to open |
flags | pass through to the open() call (O_RDWR lets you modify the contents) |
Reimplemented from MemoryMap.
Definition at line 159 of file GenomeSequence.h.
|
inline |
Return the base, in base space or color space, for a within-range index.
index | the array-like index (0 based). |
NB: bounds checking here needs to be deprecated - do not assume it will exist - the caller must clip reads so that this routine is never called with an index value larger than the genome.
The reason for this is simply that this routine gets called hundreds of billions of times in one run of karma, so bounds checking here will absolutely kill performance. Every single instruction here matters a great, great deal.
Definition at line 361 of file GenomeSequence.h.
References BaseAsciiMap::baseNIndex, getNumberBases(), BaseAsciiMap::int2base, BaseAsciiMap::int2colorSpace, and isColorSpace().
bool GenomeSequence::populateDBSNP | ( | mmapArrayBool_t & | dbSNP, |
IFILE | inputFile | ||
) | const |
Definition at line 1213 of file GenomeSequence.cpp.
void GenomeSequence::print30 | ( | genomeIndex_t | index | ) | const |
Definition at line 1073 of file GenomeSequence.cpp.
bool GenomeSequence::printNearbyWords | ( | unsigned int | index, |
unsigned int | variance, | ||
std::string & | word | ||
) | const |
Definition at line 941 of file GenomeSequence.cpp.
void GenomeSequence::sanityCheck | ( | MemoryMap & | fasta | ) | const |
Definition at line 223 of file GenomeSequence.cpp.
|
inline |
Definition at line 300 of file GenomeSequence.h.
|
inline |
Definition at line 407 of file GenomeSequence.h.
|
inline |
set the application name in the binary file header
application | name of the application |
Definition at line 194 of file GenomeSequence.h.
|
inline |
Definition at line 176 of file GenomeSequence.h.
|
inline |
Definition at line 180 of file GenomeSequence.h.
|
inline |
Definition at line 295 of file GenomeSequence.h.
|
inline |
if set, then show progress when creating and pre-fetching
Definition at line 175 of file GenomeSequence.h.
bool GenomeSequence::setReferenceName | ( | std::string | referenceFilename | ) |
set the reference name that will be used in open()
referenceFilename | the name of the reference fasta file to open |
Definition at line 254 of file GenomeSequence.cpp.
|
inline |
Definition at line 177 of file GenomeSequence.h.
genomeIndex_t GenomeSequence::simpleLocalAligner | ( | std::string & | read, |
std::string & | quality, | ||
genomeIndex_t | index, | ||
int | windowSize | ||
) | const |
Definition at line 1109 of file GenomeSequence.cpp.
bool GenomeSequence::wordMatch | ( | unsigned int | index, |
std::string & | word | ||
) | const |
Definition at line 932 of file GenomeSequence.cpp.