// read the header of a bam file SR_BamHeader* SR_BamInStreamLoadHeader(SR_BamInStream* pBamInStream) { bam_header_t* pOrigHeader = bam_header_read(pBamInStream->fpBamInput); if (pOrigHeader == NULL) return NULL; SR_BamHeader* pBamHeader = SR_BamHeaderAlloc(); pBamHeader->pOrigHeader = pOrigHeader; pBamHeader->pMD5s = (const char**) calloc(pOrigHeader->n_targets, sizeof(char*)); if (pBamHeader->pMD5s == NULL) SR_ErrQuit("ERROR: Not enough memory for md5 string"); unsigned int numMD5 = 0; for (const char* md5Pos = pOrigHeader->text; numMD5 <= pOrigHeader->n_targets && (md5Pos = strstr(md5Pos, "M5:")) != NULL; ++numMD5, ++md5Pos) { pBamHeader->pMD5s[numMD5] = md5Pos + 3; } if (numMD5 != pOrigHeader->n_targets) { free(pBamHeader->pMD5s); pBamHeader->pMD5s = NULL; if (numMD5 != 0) SR_ErrMsg("WARNING: Number of MD5 string is not consistent with number of chromosomes."); } return pBamHeader; }
// Register the reference sequences and every read group ("@RG" line) of a
// bam header in the library information table.
//
// pTable     : table to update
// oldSize    : receives the table size as it was before this call
// pBamHeader : header whose text is scanned for "@RG" tags
//
// Returns SR_ERR only when the reference sequences cannot be registered.
// A read group that lacks a required field is reported with an error
// message and skipped; the scan then continues with the next "@RG" tag.
// If the read-group capacity grew during the scan, pLibInfo is regrown to
// the new capacity.
SR_Status SR_LibInfoTableSetRG(SR_LibInfoTable* pTable, unsigned int* oldSize, const SR_BamHeader* pBamHeader)
{
    *oldSize = pTable->size;
    const unsigned int oldCapacity = pTable->capacity;

    if (SR_LibInfoTableAddAnchor(pTable, pBamHeader) != SR_OK)
        return SR_ERR;

    // "++tagPos" steps past the current tag so the next strstr() finds the following one
    for (const char* tagPos = pBamHeader->pOrigHeader->text; (tagPos = strstr(tagPos, "@RG")) != NULL; ++tagPos)
    {
        // The last header line may lack a trailing newline. Use the end of
        // the text in that case so the helpers never receive a NULL bound.
        const char* lineEnd = strchr(tagPos, '\n');
        if (lineEnd == NULL)
            lineEnd = tagPos + strlen(tagPos);

        int32_t sampleID = 0;
        if (SR_LibInfoTableAddSample(&sampleID, pTable, tagPos, lineEnd) != SR_OK)
        {
            SR_ErrMsg("ERROR: the \"SM\" field is not found under the read group tag in the bam header.\n");
            continue;
        }

        if (SR_LibInfoTableAddReadGrp(pTable, tagPos, lineEnd, sampleID) != SR_OK)
            SR_ErrMsg("ERROR: the \"ID\" field is required under the read group tag.\n");
    }

    if (pTable->capacity > oldCapacity)
    {
        pTable->pLibInfo = (SR_LibInfo*) realloc(pTable->pLibInfo, sizeof(SR_LibInfo) * pTable->capacity);
        if (pTable->pLibInfo == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of library summary in the library information object.\n");
    }

    return SR_OK;
}
SR_Status SR_LibInfoTableGetRGIndex(int32_t* pReadGrpIndex, const SR_LibInfoTable* pTable, const char* pReadGrpName) { *pReadGrpIndex = 0; khash_t(name)* pRgHash = pTable->pReadGrpHash; khiter_t khIter = kh_get(name, pRgHash, pReadGrpName); if (khIter != kh_end(pRgHash)) { *pReadGrpIndex = kh_value(pRgHash, khIter); } else { SR_ErrMsg("ERROR: Found a read group name that is not recorded in the library information table.\n"); return SR_ERR; } return SR_OK; }
// Append one read group to the library information table.
//
// pTable   : table to update
// tagPos   : start of the "@RG" line in the header text
// lineEnd  : end of that line; a field found at or after it is ignored
// sampleID : sample index stored for the new read group
//
// Both the "ID:" and the "PL:" field must be present on the line. The read
// group arrays are doubled in capacity when they are full. Returns SR_OK
// when the read group has been added; returns SR_ERR when a field is
// missing, the sequencing technology cannot be recorded, the ID is empty,
// or the ID is already in the table.
static SR_Status SR_LibInfoTableAddReadGrp(SR_LibInfoTable* pTable, const char* tagPos, const char* lineEnd, int sampleID)
{
    // get the name of the current read group
    const char* readGrpNamePos = strstr(tagPos, "ID:");
    if (readGrpNamePos == NULL || readGrpNamePos >= lineEnd)
        return SR_ERR;

    readGrpNamePos += 3;

    // get the sequencing platform of the current read group
    const char* platformPos = strstr(tagPos, "PL:");
    if (platformPos == NULL || platformPos >= lineEnd)
        return SR_ERR;

    platformPos += 3;

    // expand the array if necessary
    if (pTable->size == pTable->capacity)
    {
        pTable->capacity *= 2;

        pTable->pReadGrps = (char**) realloc(pTable->pReadGrps, pTable->capacity * sizeof(char*));
        if (pTable->pReadGrps == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of read group names in the library information object.\n");

        pTable->pSampleMap = (int32_t*) realloc(pTable->pSampleMap, pTable->capacity * sizeof(int32_t));
        if (pTable->pSampleMap == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sample ID map in the library information object.\n");

        pTable->pSeqTech = (int8_t*) realloc(pTable->pSeqTech, pTable->capacity * sizeof(int8_t));
        if (pTable->pSeqTech == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sequencing technology in the library information object.\n");
    }

    if (SR_LibInfoTableAddSeqTech(pTable, platformPos) != SR_OK)
        return SR_ERR;

    // The ID ends at the first blank, tab or newline, or at the end of the
    // header text when it is the very last token. strcspn() covers both
    // cases; strpbrk() would return NULL for the latter.
    const size_t readGrpNameLen = strcspn(readGrpNamePos, " \t\n");
    if (readGrpNameLen == 0)
        return SR_ERR;

    // calloc() leaves the terminating null character in place after the copy
    char* readGrpName = (char*) calloc(readGrpNameLen + 1, sizeof(char));
    pTable->pReadGrps[pTable->size] = readGrpName;
    if (readGrpName == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of read group name in the fragment length distribution object.\n");

    memcpy(readGrpName, readGrpNamePos, readGrpNameLen);

    int ret = 0;
    khash_t(name)* pReadGrpHash = pTable->pReadGrpHash;
    khiter_t khIter = kh_put(name, pReadGrpHash, readGrpName, &ret);

    // ret == 0 means the key was already in the hash
    if (ret == 0)
    {
        free(pTable->pReadGrps[pTable->size]);
        pTable->pReadGrps[pTable->size] = NULL;
        SR_ErrMsg("ERROR: Found a duplicated read group ID.\n");
        return SR_ERR;
    }

    pTable->pSampleMap[pTable->size] = sampleID;
    kh_value(pReadGrpHash, khIter) = pTable->size;
    ++(pTable->size);

    return SR_OK;
}
// Register the reference sequences of a bam header in the anchor
// information of the table, or, if references are already loaded, check
// that this header agrees with them.
//
// First header (anchor size is 0): the anchor storage is replaced when it
// is too small, then name, length and MD5 of every reference are recorded.
// References whose name starts with "GL0", "NC_", "NT_" or "hs" get the
// length -1.
//
// Later headers: the number of references, the index of every reference
// name, the lengths (only where the stored length is positive) and the MD5
// strings must match what is stored; otherwise an error message is printed
// and SR_ERR is returned.
//
// pBamHeader->pMD5s may be NULL (the header had no usable MD5 tags). In
// that case an empty MD5 is recorded for a first header, and the MD5
// comparison is skipped for later headers. The comparison is also skipped
// for references whose stored MD5 is empty.
static SR_Status SR_LibInfoTableAddAnchor(SR_LibInfoTable* pTable, const SR_BamHeader* pBamHeader)
{
    const SR_Bool isLoaded = pTable->pAnchorInfo->size == 0 ? FALSE : TRUE;

    if (isLoaded)
    {
        if (pBamHeader->pOrigHeader->n_targets != pTable->pAnchorInfo->size)
        {
            SR_ErrMsg("ERROR: The number of reference sequences in this bam file is inconsistent with that in previous bam files.\n");
            return SR_ERR;
        }

        khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;

        for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
        {
            // Pure lookup: kh_put() would insert an unknown name, leaving a
            // key in the hash that points into this header's memory.
            const khiter_t khIter = kh_get(name, pAnchorHash, pBamHeader->pOrigHeader->target_name[i]);
            if (khIter == kh_end(pAnchorHash))
            {
                SR_ErrMsg("ERROR: Found a reference sequence that is not in the previous bam headers.\n");
                return SR_ERR;
            }

            const unsigned int refIndex = kh_value(pAnchorHash, khIter);
            if (refIndex != i)
            {
                SR_ErrMsg("ERROR: Reference ID in this bam file is inconsistent with that in the previous bam files.\n");
                return SR_ERR;
            }

            // references with a non-positive stored length are not checked
            if (pTable->pAnchorInfo->pLength[i] > 0 && pTable->pAnchorInfo->pLength[i] != pBamHeader->pOrigHeader->target_len[i])
            {
                SR_ErrMsg("ERROR: The length of the reference sequence in this bam file is inconsistent with that in previous bam files.\n");
                return SR_ERR;
            }

            // compare the MD5 strings only when both sides actually have one
            const char* storedMd5 = pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i;
            if (pBamHeader->pMD5s != NULL && storedMd5[0] != '\0'
                && strncmp(storedMd5, pBamHeader->pMD5s[i], MD5_STR_LEN) != 0)
            {
                SR_ErrMsg("ERROR: MD5 string in this bam file is inconsistent with that in previous bam files.\n");
                return SR_ERR;
            }
        }

        return SR_OK;
    }

    if (pBamHeader->pOrigHeader->n_targets > pTable->pAnchorInfo->capacity)
    {
        SR_AnchorInfoFree(pTable->pAnchorInfo);
        pTable->pAnchorInfo = SR_AnchorInfoAlloc(pBamHeader->pOrigHeader->n_targets);
    }

    pTable->pAnchorInfo->size = pBamHeader->pOrigHeader->n_targets;

    // fetched after the possible re-allocation above
    khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;

    for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
    {
        // keep a private copy of the name; calloc() provides the terminator
        const size_t nameLen = strlen(pBamHeader->pOrigHeader->target_name[i]);
        pTable->pAnchorInfo->pAnchors[i] = (char*) calloc(nameLen + 1, sizeof(char));
        if (pTable->pAnchorInfo->pAnchors[i] == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of reference name.\n");

        memcpy(pTable->pAnchorInfo->pAnchors[i], pBamHeader->pOrigHeader->target_name[i], nameLen);

        int ret = 0;
        const khiter_t khIter = kh_put(name, pAnchorHash, pTable->pAnchorInfo->pAnchors[i], &ret);
        kh_value(pAnchorHash, khIter) = i;

        // set the length of those unused references to -1 so that
        // we will ignore any reads aligned to them
        pTable->pAnchorInfo->pLength[i] = pBamHeader->pOrigHeader->target_len[i];
        if (strncmp("GL0", pTable->pAnchorInfo->pAnchors[i], 3) == 0
            || strncmp("NC_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
            || strncmp("NT_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
            || strncmp("hs", pTable->pAnchorInfo->pAnchors[i], 2) == 0)
        {
            pTable->pAnchorInfo->pLength[i] = -1;
        }

        // record the MD5, or an empty one when the header provides none
        char* md5Slot = pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i;
        if (pBamHeader->pMD5s != NULL)
            strncpy(md5Slot, pBamHeader->pMD5s[i], MD5_STR_LEN);
        else
            memset(md5Slot, 0, MD5_STR_LEN);
    }

    return SR_OK;
}
SR_BamInStream* SR_BamInStreamAlloc(const char* bamFilename, uint32_t binLen, unsigned int numThreads, unsigned int buffCapacity, unsigned int reportSize, const SR_StreamMode* pStreamMode) { SR_BamInStream* pBamInStream = (SR_BamInStream*) calloc(1, sizeof(SR_BamInStream)); if (pBamInStream == NULL) SR_ErrQuit("ERROR: Not enough memory for a bam input stream object."); pBamInStream->bam_cur_status = -1; pBamInStream->fpBamInput = bam_open(bamFilename, "r"); if (pBamInStream->fpBamInput == NULL) SR_ErrQuit("ERROR: Cannot open bam file %s for reading.\n", bamFilename); if ((pStreamMode->controlFlag & SR_USE_BAM_INDEX) != 0) { pBamInStream->pBamIndex = bam_index_load(bamFilename); if (pBamInStream->pBamIndex == NULL) { SR_ErrMsg("WARNING: Cannot open bam index file for reading. Creating it......"); bam_index_build(bamFilename); SR_ErrMsg(" The bam index is created."); pBamInStream->pBamIndex = bam_index_load(bamFilename); } } pBamInStream->filterFunc = pStreamMode->filterFunc; pBamInStream->filterData = pStreamMode->filterData; pBamInStream->numThreads = numThreads; pBamInStream->reportSize = reportSize; pBamInStream->currRefID = NO_QUERY_YET; pBamInStream->currBinPos = NO_QUERY_YET; pBamInStream->binLen = binLen; pBamInStream->pNewNode = NULL; pBamInStream->pBamIterator = NULL; if (numThreads > 0) { pBamInStream->pRetLists = (SR_BamList*) calloc(numThreads, sizeof(SR_BamList)); if (pBamInStream->pRetLists == NULL) SR_ErrQuit("ERROR: Not enough memory for the storage of retrun alignment lists in the bam input stream object.\n"); pBamInStream->pAlgnTypes = (SR_AlgnType*) malloc(numThreads * reportSize * sizeof(SR_AlgnType)); if (pBamInStream->pAlgnTypes == NULL) SR_ErrQuit("ERROR: Not enough memory for the storage of pair alignment type in the bam input stream object.\n"); } else { pBamInStream->pRetLists = NULL; pBamInStream->pAlgnTypes = NULL; pBamInStream->reportSize = 0; } if ((pStreamMode->controlFlag & SR_PAIR_GENOMICALLY) == 0) { 
pBamInStream->pNameHashes[PREV_BIN] = kh_init(queryName); kh_resize(queryName, pBamInStream->pNameHashes[PREV_BIN], reportSize); } else { pBamInStream->pNameHashes[PREV_BIN] = NULL; pBamInStream->binLen = SR_MAX_BIN_LEN; } pBamInStream->pNameHashes[CURR_BIN] = kh_init(queryName); kh_resize(queryName, pBamInStream->pNameHashes[CURR_BIN], reportSize); pBamInStream->pMemPool = SR_BamMemPoolAlloc(buffCapacity); pBamInStream->bam_cur_status = 1; return pBamInStream; }