コード例 #1
0
ファイル: SR_BamInStream.c プロジェクト: monkollek/scissors
// Read the header of a bam file and locate the MD5 ("M5:") tags in its text.
//
// Returns NULL when bam_header_read() returns NULL. Otherwise returns a
// header object obtained from SR_BamHeaderAlloc() that holds the original
// header. pMD5s[i] points into pOrigHeader->text, just past the i-th "M5:"
// tag, so the entries are not separately allocated strings. pMD5s is set to
// NULL when the number of "M5:" tags differs from the number of reference
// sequences; callers must check it before indexing.
SR_BamHeader* SR_BamInStreamLoadHeader(SR_BamInStream* pBamInStream)
{
    bam_header_t* pOrigHeader = bam_header_read(pBamInStream->fpBamInput);
    if (pOrigHeader == NULL)
        return NULL;

    SR_BamHeader* pBamHeader = SR_BamHeaderAlloc();

    pBamHeader->pOrigHeader = pOrigHeader;

    pBamHeader->pMD5s = (const char**) calloc(pOrigHeader->n_targets, sizeof(char*));
    if (pBamHeader->pMD5s == NULL)
        SR_ErrQuit("ERROR: Not enough memory for md5 string");

    const unsigned int numTargets = (unsigned int) pOrigHeader->n_targets;
    unsigned int numMD5 = 0;
    for (const char* md5Pos = pOrigHeader->text; (md5Pos = strstr(md5Pos, "M5:")) != NULL; ++md5Pos)
    {
        if (numMD5 == numTargets)
        {
            // One tag more than there are references: the array is already
            // full, so only record the mismatch. Storing this tag would
            // write one element past the end of pMD5s.
            ++numMD5;
            break;
        }

        pBamHeader->pMD5s[numMD5] = md5Pos + 3;
        ++numMD5;
    }

    if (numMD5 != numTargets)
    {
        free(pBamHeader->pMD5s);
        pBamHeader->pMD5s = NULL;

        // a header without any MD5 tag is accepted silently
        if (numMD5 != 0)
            SR_ErrMsg("WARNING: Number of MD5 string is not consistent with number of chromosomes.");
    }

    return pBamHeader;
}
コード例 #2
0
ファイル: SR_LibInfo.c プロジェクト: jiantao/SplitRead
// Register the reference sequences and every read group ("@RG" header line)
// of a bam header in the library information table.
//
// pTable:     table to update
// oldSize:    output; receives pTable->size as it was on entry
// pBamHeader: header whose text is scanned
//
// Returns SR_ERR when SR_LibInfoTableAddAnchor() fails and SR_OK otherwise.
// A read group line that cannot be added is reported through SR_ErrMsg()
// and skipped; it does not change the return value.
SR_Status SR_LibInfoTableSetRG(SR_LibInfoTable* pTable, unsigned int* oldSize, const SR_BamHeader* pBamHeader)
{
    SR_Status status = SR_OK;
    *oldSize = pTable->size;
    unsigned int oldCapacity = pTable->capacity;

    const char* const headerText = pBamHeader->pOrigHeader->text;
    const char* tagPos = headerText;
    status = SR_LibInfoTableAddAnchor(pTable, pBamHeader);
    if (status != SR_OK)
        return SR_ERR;

    while ((tagPos = strstr(tagPos, "@RG")) != NULL)
    {
        // A read group record starts at the beginning of a header line.
        // "@RG" found anywhere else (for example inside a command line
        // recorded in another header record) is not a read group.
        if (tagPos != headerText && tagPos[-1] != '\n')
        {
            ++tagPos;
            continue;
        }

        // NOTE: lineEnd is NULL when the final header line has no trailing
        // newline; it is handed to the helpers unchanged.
        const char* lineEnd = strpbrk(tagPos, "\n");
        int32_t sampleID = 0;

        status = SR_LibInfoTableAddSample(&sampleID, pTable, tagPos, lineEnd);
        if (status != SR_OK)
        {
            SR_ErrMsg("ERROR: the \"SM\" field is not found under the read group tag in the bam header.\n");

            ++tagPos;
            continue;
        }

        status = SR_LibInfoTableAddReadGrp(pTable, tagPos, lineEnd, sampleID);
        if (status != SR_OK)
            SR_ErrMsg("ERROR: the \"ID\" field is required under the read group tag.\n");

        // step past the current match so the next search finds the next line
        ++tagPos;
    }

    // the table capacity grew while adding read groups, so resize pLibInfo
    // to the new capacity as well
    if (pTable->capacity > oldCapacity)
    {
        pTable->pLibInfo = (SR_LibInfo*) realloc(pTable->pLibInfo, sizeof(SR_LibInfo) * pTable->capacity);
        if (pTable->pLibInfo == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of library summary in the library information object.\n");
    }

    return SR_OK;
}
コード例 #3
0
ファイル: SR_LibInfo.c プロジェクト: jiantao/SplitRead
SR_Status SR_LibInfoTableGetRGIndex(int32_t* pReadGrpIndex, const SR_LibInfoTable* pTable, const char* pReadGrpName)
{
    *pReadGrpIndex = 0;

    khash_t(name)* pRgHash = pTable->pReadGrpHash;
    khiter_t khIter = kh_get(name, pRgHash, pReadGrpName);
    if (khIter != kh_end(pRgHash))
    {
        *pReadGrpIndex = kh_value(pRgHash, khIter);
    }
    else
    {
        SR_ErrMsg("ERROR: Found a read group name that is not recorded in the library information table.\n");
        return SR_ERR;
    }

    return SR_OK;
}
コード例 #4
0
ファイル: SR_LibInfo.c プロジェクト: jiantao/SplitRead
// Add the read group described by one "@RG" header line to the table.
//
// pTable:   table to update
// tagPos:   start of the "@RG" line inside the header text
// lineEnd:  end of that line; "ID:" and "PL:" must start before it
// sampleID: sample index stored for the new read group
//
// Returns SR_OK when the read group was added. Returns SR_ERR when the "ID:"
// or "PL:" field is missing, SR_LibInfoTableAddSeqTech() fails, the name is
// empty, or the name is already present in the read group hash. Memory
// exhaustion is reported through SR_ErrQuit().
static SR_Status SR_LibInfoTableAddReadGrp(SR_LibInfoTable* pTable, const char* tagPos, const char* lineEnd, int sampleID)
{
    // get the name of the current read group
    const char* readGrpNamePos = strstr(tagPos, "ID:");
    if (readGrpNamePos == NULL || readGrpNamePos >= lineEnd)
        return SR_ERR;

    readGrpNamePos += 3;

    const char* platformPos = strstr(tagPos, "PL:");
    if (platformPos == NULL || platformPos >= lineEnd)
        return SR_ERR;

    platformPos += 3;

    // expand the arrays if necessary
    if (pTable->size == pTable->capacity)
    {
        // doubling a capacity of zero would never grow the arrays
        pTable->capacity = pTable->capacity > 0 ? pTable->capacity * 2 : 1;

        pTable->pReadGrps = (char**) realloc(pTable->pReadGrps, pTable->capacity * sizeof(char*));
        if (pTable->pReadGrps == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of read group names in the library information object.\n");

        pTable->pSampleMap = (int32_t*) realloc(pTable->pSampleMap, pTable->capacity * sizeof(int32_t));
        if (pTable->pSampleMap == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sample ID map in the library information object.\n");

        pTable->pSeqTech = (int8_t*) realloc(pTable->pSeqTech, pTable->capacity * sizeof(int8_t));
        if (pTable->pSeqTech == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sequencing technology in the library information  object.\n");
    }

    SR_Status status = SR_LibInfoTableAddSeqTech(pTable, platformPos);
    if (status != SR_OK)
        return SR_ERR;

    // The name runs up to the next blank, tab, newline or the end of the
    // text. strcspn() cannot return NULL, unlike strpbrk(), so a name that
    // is the very last token of the header is handled as well.
    const size_t readGrpNameLen = strcspn(readGrpNamePos, " \t\n");
    if (readGrpNameLen == 0)
        return SR_ERR;

    char* readGrpName = (char*) calloc(readGrpNameLen + 1, sizeof(char));
    if (readGrpName == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of read group name in the fragment length distribution object.\n");

    // calloc() already supplied the terminating zero byte
    memcpy(readGrpName, readGrpNamePos, readGrpNameLen);
    pTable->pReadGrps[pTable->size] = readGrpName;

    int ret = 0;

    khash_t(name)* pReadGrpHash = pTable->pReadGrpHash;
    khiter_t khIter = kh_put(name, pReadGrpHash, readGrpName, &ret);

    if (ret > 0)
    {
        // the name is new: the hash keeps the pointer, so it must stay alive
        pTable->pSampleMap[pTable->size] = sampleID;
        kh_value(pReadGrpHash, khIter) = pTable->size;
        ++(pTable->size);

        return SR_OK;
    }

    // nothing was inserted, so the copy of the name is not referenced
    free(readGrpName);
    pTable->pReadGrps[pTable->size] = NULL;

    if (ret < 0)
    {
        // kh_put() failed; khIter is not a valid bucket and must not be used
        SR_ErrQuit("ERROR: Not enough memory for the read group hash in the library information object.\n");
    }
    else
    {
        SR_ErrMsg("ERROR: Found a duplicated read group ID.\n");
    }

    return SR_ERR;
}
コード例 #5
0
ファイル: SR_LibInfo.c プロジェクト: jiantao/SplitRead
// Record the reference sequences of a bam header in the table or, when the
// table already holds references, check the header against them.
//
// Empty table (pAnchorInfo->size == 0): the name, length and MD5 string of
// every reference are copied into pAnchorInfo, which is reallocated first
// when its capacity is too small. References whose name starts with "GL0",
// "NC_", "NT_" or "hs" get a length of -1.
//
// Loaded table: the number of references, the index of every reference name,
// the lengths (only where the stored length is positive) and the MD5 strings
// must agree with the stored ones.
//
// pBamHeader->pMD5s may be NULL when the header has no usable MD5 tags. In
// that case the MD5 slots of a new table are zero-filled, and the MD5
// comparison is skipped whenever either side has no MD5 string.
//
// Returns SR_OK on success, or SR_ERR after reporting the inconsistency
// through SR_ErrMsg().
static SR_Status SR_LibInfoTableAddAnchor(SR_LibInfoTable* pTable, const SR_BamHeader* pBamHeader)
{
    const SR_Bool isLoaded = pTable->pAnchorInfo->size == 0 ? FALSE : TRUE;
    const SR_Bool hasMD5 = pBamHeader->pMD5s != NULL ? TRUE : FALSE;

    if (isLoaded)
    {
        if (pBamHeader->pOrigHeader->n_targets != pTable->pAnchorInfo->size)
        {
            SR_ErrMsg("ERROR: The number of reference sequences in this bam file is inconsistent with that in previous bam files.\n");
            return SR_ERR;
        }

        khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;
        for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
        {
            // Pure lookup: kh_put() would insert the header's own name
            // pointer into the table hash whenever the name is unknown.
            khiter_t khIter = kh_get(name, pAnchorHash, pBamHeader->pOrigHeader->target_name[i]);
            if (khIter == kh_end(pAnchorHash))
            {
                SR_ErrMsg("ERROR: Found a reference sequence that is not in the previous bam headers.\n");
                return SR_ERR;
            }

            unsigned int refIndex = kh_value(pAnchorHash, khIter);
            if (refIndex != i)
            {
                SR_ErrMsg("ERROR: Reference ID in this bam file is inconsistent with that in the previous bam files.\n");
                return SR_ERR;
            }

            // non-positive stored lengths mark ignored references
            if (pTable->pAnchorInfo->pLength[i] > 0 && pTable->pAnchorInfo->pLength[i] != pBamHeader->pOrigHeader->target_len[i])
            {
                SR_ErrMsg("ERROR: The length of the reference sequence in this bam file is inconsistent with that in previous bam files.\n");
                return SR_ERR;
            }

            const char* storedMD5 = pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i;
            if (hasMD5 && storedMD5[0] != '\0'
                && strncmp(storedMD5, pBamHeader->pMD5s[i], MD5_STR_LEN) != 0)
            {
                SR_ErrMsg("ERROR: MD5 string in this bam file is inconsistent with that in previous bam files.\n");
                return SR_ERR;
            }
        }
    }
    else
    {
        if (pBamHeader->pOrigHeader->n_targets > pTable->pAnchorInfo->capacity)
        {
            SR_AnchorInfoFree(pTable->pAnchorInfo);
            pTable->pAnchorInfo = SR_AnchorInfoAlloc(pBamHeader->pOrigHeader->n_targets);
        }

        pTable->pAnchorInfo->size = pBamHeader->pOrigHeader->n_targets;

        int ret = 0;
        khiter_t khIter = 0;

        for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
        {
            unsigned int nameLen = strlen(pBamHeader->pOrigHeader->target_name[i]);

            // keep a private copy of the name: the hash stores the pointer
            pTable->pAnchorInfo->pAnchors[i] = (char*) calloc(nameLen + 1, sizeof(char));
            if (pTable->pAnchorInfo->pAnchors[i] == NULL)
                SR_ErrQuit("ERROR: Not enough memory for the storage of reference name.\n");

            strncpy(pTable->pAnchorInfo->pAnchors[i], pBamHeader->pOrigHeader->target_name[i], nameLen);
            khIter = kh_put(name, pTable->pAnchorInfo->pAnchorHash, pTable->pAnchorInfo->pAnchors[i], &ret);

            khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;
            kh_value(pAnchorHash, khIter) = i;

            // set the length of those unused references to -1 so that
            // we will ignore any reads aligned to them
            pTable->pAnchorInfo->pLength[i] = pBamHeader->pOrigHeader->target_len[i];
            if (strncmp("GL0", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("NC_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("NT_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("hs", pTable->pAnchorInfo->pAnchors[i], 2) == 0)
            {
                pTable->pAnchorInfo->pLength[i] = -1;
            }

            if (hasMD5)
                strncpy(pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i, pBamHeader->pMD5s[i], MD5_STR_LEN);
            else
                memset(pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i, 0, MD5_STR_LEN); // "no MD5 known"
        }
    }

    return SR_OK;
}
コード例 #6
0
ファイル: SR_BamInStream.c プロジェクト: monkollek/scissors
SR_BamInStream* SR_BamInStreamAlloc(const char* bamFilename, uint32_t binLen, unsigned int numThreads, unsigned int buffCapacity, 
                                    unsigned int reportSize, const SR_StreamMode* pStreamMode)
{
    SR_BamInStream* pBamInStream = (SR_BamInStream*) calloc(1, sizeof(SR_BamInStream));
    if (pBamInStream == NULL)
        SR_ErrQuit("ERROR: Not enough memory for a bam input stream object.");

    pBamInStream->bam_cur_status = -1;

    pBamInStream->fpBamInput = bam_open(bamFilename, "r");
    if (pBamInStream->fpBamInput == NULL)
        SR_ErrQuit("ERROR: Cannot open bam file %s for reading.\n", bamFilename);

    if ((pStreamMode->controlFlag & SR_USE_BAM_INDEX) != 0)
    {
        pBamInStream->pBamIndex = bam_index_load(bamFilename);
	if (pBamInStream->pBamIndex == NULL) {
            SR_ErrMsg("WARNING: Cannot open bam index file for reading. Creating it......");
	    bam_index_build(bamFilename);
	    SR_ErrMsg("         The bam index is created.");
	    pBamInStream->pBamIndex = bam_index_load(bamFilename);
	}
    }

    pBamInStream->filterFunc = pStreamMode->filterFunc;
    pBamInStream->filterData = pStreamMode->filterData;
    pBamInStream->numThreads = numThreads;
    pBamInStream->reportSize = reportSize;
    pBamInStream->currRefID = NO_QUERY_YET;
    pBamInStream->currBinPos = NO_QUERY_YET;
    pBamInStream->binLen = binLen;
    pBamInStream->pNewNode = NULL;
    pBamInStream->pBamIterator = NULL;

    if (numThreads > 0)
    {
        pBamInStream->pRetLists = (SR_BamList*) calloc(numThreads, sizeof(SR_BamList));
        if (pBamInStream->pRetLists == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of retrun alignment lists in the bam input stream object.\n");

        pBamInStream->pAlgnTypes = (SR_AlgnType*) malloc(numThreads * reportSize * sizeof(SR_AlgnType));
        if (pBamInStream->pAlgnTypes == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of pair alignment type in the bam input stream object.\n");
    }
    else
    {
        pBamInStream->pRetLists = NULL;
        pBamInStream->pAlgnTypes = NULL;
        pBamInStream->reportSize = 0;
    }

    if ((pStreamMode->controlFlag & SR_PAIR_GENOMICALLY) == 0)
    {
        pBamInStream->pNameHashes[PREV_BIN] = kh_init(queryName);
        kh_resize(queryName, pBamInStream->pNameHashes[PREV_BIN], reportSize);
    }
    else
    {
        pBamInStream->pNameHashes[PREV_BIN] = NULL;
        pBamInStream->binLen = SR_MAX_BIN_LEN;
    }

    pBamInStream->pNameHashes[CURR_BIN] = kh_init(queryName);
    kh_resize(queryName, pBamInStream->pNameHashes[CURR_BIN], reportSize);

    pBamInStream->pMemPool = SR_BamMemPoolAlloc(buffCapacity);

    pBamInStream->bam_cur_status = 1;

    return pBamInStream;
}