Esempio n. 1
0
File: sam.c Progetto: Annak17/partis
/*
 * Reads a single in-memory SAM record that carries one auxiliary tag of each
 * kind (A, i, f, d, Z, H, B), checks the decoded value of every tag, then
 * re-formats the record with sam_format1() and compares it with the expected
 * canonical text.  Every problem is reported through fail().
 *
 * Always returns 1.
 */
static int aux_fields1(void)
{
    static const char sam[] = "data:"
"@SQ\tSN:one\tLN:1000\n"
"@SQ\tSN:two\tLN:500\n"
"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n";

    // Canonical form of the alignment record above, as output by sam_format1()
    static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000";

    samFile *in = sam_open(sam, "r");
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    uint8_t *p;
    uint32_t n;
    kstring_t ks = { 0, 0, NULL };

    if (sam_read1(in, header, aln) >= 0) {
        if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
            fail("XA field is '%c', expected 'k'", bam_aux2A(p));

        if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
            fail("Xi field is %d, expected 37", bam_aux2i(p));

        if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6)
            fail("Xf field is %.12f, expected pi", bam_aux2f(p));

        // The message names the tag actually being checked (was "Xf").
        if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6)
            fail("Xd field is %.12f, expected e", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO);

        if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0)
            fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF);

        // TODO Invent and use bam_aux2B()
        // Raw layout checked here: 'B', subtype 'c', 4-byte count (3), then
        // the three int8 values -2, 0, 2.
        if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! (memcmp(p, "Bc", 2) == 0 && (memcpy(&n, p+2, 4), n) == 3 && memcmp(p+6, "\xfe\x00\x02", 3) == 0))
            fail("XB field is %c,..., expected c,-2,0,+2", p[1]);

        if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000)
            fail("ZZ field is %d, expected 1000000", bam_aux2i(p));

        // Only compare the text when formatting succeeded: after a failure
        // ks.s may still be NULL and must not be handed to strcmp().
        if (sam_format1(header, aln, &ks) < 0)
            fail("can't format record");
        else if (strcmp(ks.s, r1) != 0)
            fail("record formatted incorrectly: \"%s\"", ks.s);

        free(ks.s);
    }
    else fail("can't read record");

    bam_destroy1(aln);
    bam_hdr_destroy(header);
    sam_close(in);

    return 1;
}
Esempio n. 2
0
File: sam.c Progetto: atks/vt
/*
 * Replaces the array tag target_id with bam_aux_update_array() and verifies
 * the stored elements through CHECK_ARRAY_VALS.
 *
 *   aln        record to modify
 *   target_id  two-character tag to update
 *   type       element type code ('c','C','s','S','i','I' or 'f')
 *   nitems     number of elements in data
 *   data       element values to store
 *   next_id    tag expected to follow; an empty string skips this check
 *   next_val   integer value expected in next_id
 *   next_type  type byte expected for next_id
 *
 * Returns 0 on success and -1 after reporting a problem via fail().
 */
static int test_update_array(bam1_t *aln, const char target_id[2],
                             uint8_t type, uint32_t nitems, void *data,
                             const char next_id[2], int64_t next_val,
                             char next_type)
{
    uint8_t *p;

    // Try updating target
    if (bam_aux_update_array(aln, target_id, type, nitems, data) < 0) {
        // "%.2s" (precision 2) prints the two tag characters; the previous
        // "%2.s" was width 2 / precision 0 and printed only blanks.
        fail("update %.2s tag", target_id);
        return -1;
    }

    // Check values
    p = bam_aux_get(aln, target_id);
    if (!p) {
        fail("find  %.2s tag", target_id);
        return -1;
    }
    switch (type) {
        case 'c':
            CHECK_ARRAY_VALS(int8_t, bam_auxB2i, PRId64, PRId8); break;
        case 'C':
            CHECK_ARRAY_VALS(uint8_t, bam_auxB2i, PRId64, PRIu8); break;
        case 's':
            CHECK_ARRAY_VALS(int16_t, bam_auxB2i, PRId64, PRId16); break;
        case 'S':
            CHECK_ARRAY_VALS(uint16_t, bam_auxB2i, PRId64, PRIu16); break;
        case 'i':
            CHECK_ARRAY_VALS(int32_t, bam_auxB2i, PRId64, PRId32); break;
        case 'I':
            CHECK_ARRAY_VALS(uint32_t, bam_auxB2i, PRId64, PRIu32); break;
        case 'f':
            CHECK_ARRAY_VALS(float, bam_auxB2f, "e", "e"); break;
    }

    // If given, check that the next tag hasn't been clobbered by the
    // update above.
    if (!*next_id) return 0;
    p = bam_aux_get(aln, next_id);
    if (!p) {
        fail("find  %.2s tag after updating %.2s", next_id, target_id);
        return -1;
    }
    if (*p != next_type || bam_aux2i(p) != next_val) {
        fail("after updating %.2s:"
             " %.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, next_id, *p, bam_aux2i(p), next_type, next_val);
        return -1;
    }

    return 0;
}
Esempio n. 3
0
 /// Integer value of the "NM" tag, or NO_COLOR_MM when the record has no such tag.
 int32_t getMismatches() const
 {
     assert(m_dataPtr);
     uint8_t *const nmTag = bam_aux_get(m_dataPtr.get(), "NM");
     if (!nmTag) {
         return NO_COLOR_MM;
     }
     return bam_aux2i(nmTag);
 }
Esempio n. 4
0
 /// Integer value of the "XA" tag, or INDEL_NO_AMBIGUITY when the tag is missing.
 int getIndelAmbiguity() const
 {
    assert(m_dataPtr);
    uint8_t *const xaTag = bam_aux_get(m_dataPtr.get(), "XA");
    if (!xaTag) {
        return INDEL_NO_AMBIGUITY;
    }
    return bam_aux2i(xaTag);
 }
Esempio n. 5
0
 /// True only when an "NH" tag is present and its integer value equals 1.
 bool isMappedUnique() const
 {
     assert(m_dataPtr);
     uint8_t *const nhTag = bam_aux_get(m_dataPtr.get(), "NH");
     return nhTag != NULL && bam_aux2i(nhTag) == 1;
 }
Esempio n. 6
0
 /// Integer value of the "NH" tag, or NO_NH when the record has no such tag.
 int32_t getReportedAlignments() const
 {
     assert(m_dataPtr);
     uint8_t *const nhTag = bam_aux_get(m_dataPtr.get(), "NH");
     if (!nhTag) {
         return NO_NH;
     }
     return bam_aux2i(nhTag);
 }
Esempio n. 7
0
 /// Integer value of the "SM" tag, or NO_MAP_QUAL when the tag is missing.
 /// Per the original author: SM applies to this read if this read has a mate.
 int32_t getMapQual() const
 {
     assert(m_dataPtr);
     // A shouldHaveMate() assertion was deliberately left out by the original author.
     uint8_t *const smTag = bam_aux_get(m_dataPtr.get(), "SM");
     if (!smTag) {
         return NO_MAP_QUAL;
     }
     return bam_aux2i(smTag);
 }
Esempio n. 8
0
/* Returns the integer value of the record's ZF tag, or -1 when it is absent. */
int32_t
tmap_sam_get_fo_start_idx(tmap_sam_t *sam)
{
  uint8_t *zf = bam_aux_get(sam->b, "ZF");

  if(NULL == zf) {
      return -1;
  }
  return bam_aux2i(zf);
}
Esempio n. 9
0
/*
 * Returns the integer value of the record's ZA tag, or -1 when it is absent.
 * Calls tmap_bug() if the wrapper holds no BAM record.
 */
int32_t
tmap_sam_get_za(tmap_sam_t *sam)
{
  uint8_t *za = NULL;

  if(NULL == sam->b) {
      tmap_bug();
  }
  za = bam_aux_get(sam->b, "ZA");
  if(NULL == za) {
      return -1;
  }
  return bam_aux2i(za);
}
Esempio n. 10
0
File: sam.c Progetto: atks/vt
/*
 * Stores target_val in integer tag target_id via bam_aux_update_int() and
 * confirms that the tag now has type expected_type and the requested value.
 * When next_id is a non-empty string, additionally confirms that tag still
 * has type next_type and value next_val.
 *
 * Returns 0 when every check passes, -1 after reporting through fail().
 */
static int test_update_int(bam1_t *aln,
                           const char target_id[2], int64_t target_val,
                           char expected_type,
                           const char next_id[2], int64_t next_val,
                           char next_type) {
    uint8_t *tag;

    /* Step 1: perform the update. */
    if (bam_aux_update_int(aln, target_id, target_val) < 0) {
        fail("update %.2s tag", target_id);
        return -1;
    }

    /* Step 2: the tag must exist with the expected type byte and value. */
    tag = bam_aux_get(aln, target_id);
    if (tag == NULL) {
        fail("find  %.2s tag", target_id);
        return -1;
    }
    if (!(*tag == expected_type && bam_aux2i(tag) == target_val)) {
        fail("%.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, *tag, bam_aux2i(tag), expected_type, target_val);
        return -1;
    }

    /* Step 3 (optional): the neighbouring tag must be untouched. */
    if (*next_id == '\0')
        return 0;

    tag = bam_aux_get(aln, next_id);
    if (tag == NULL) {
        fail("find  %.2s tag after updating %.2s", next_id, target_id);
        return -1;
    }
    if (!(*tag == next_type && bam_aux2i(tag) == next_val)) {
        fail("after updating %.2s to %"PRId64":"
             " %.2s field is %c:%"PRId64"; expected %c:%"PRId64,
             target_id, target_val,
             next_id, *tag, bam_aux2i(tag), next_type, next_val);
        return -1;
    }
    return 0;
}
Esempio n. 11
0
/*
 * Returns the integer stored in the record's "ms" tag.  When the tag is
 * missing, prints an error to stderr and returns -1.
 */
static int64_t get_mate_score(bam1_t *b) {
    uint8_t *ms_tag = bam_aux_get(b, "ms");

    if (!ms_tag) {
        fprintf(stderr, "[markdup] error: no ms score tag.\n");
        return -1;
    }

    int64_t mate_score = bam_aux2i(ms_tag);
    return mate_score;
}
Esempio n. 12
0
/*
 * Reads records from the aux_t stream passed in data until one passes every
 * filter, leaving that record in b.  A record is skipped when it is flagged
 * unmapped, its mapping quality is below min_mapq, its bam_cigar2ulen()
 * length is below min_len, or it has an AS tag smaller than min_as.
 *
 * Returns the result of the last sam_read1() call (negative once reading
 * stops).
 */
int read_bam(void *data, bam1_t *b)
{
    aux_t *opts = (aux_t*)data;
    int status;

    while ((status = sam_read1(opts->fp, opts->hdr, b)) >= 0) {
        uint8_t *as_tag = 0;

        if (b->core.flag & (BAM_FUNMAP))
            continue;
        if ((int)b->core.qual < opts->min_mapq)
            continue;
        if (bam_cigar2ulen(b->core.n_cigar, bam_get_cigar(b)) < opts->min_len)
            continue;

        as_tag = bam_aux_get(b, "AS");
        if (as_tag && bam_aux2i(as_tag) < opts->min_as)
            continue;

        break; /* record accepted */
    }
    return status;
}
Esempio n. 13
0
int main(int argc, char *argv[])  
{  
  short out2stdout=0;
  hashtable ht=new_hashtable(HASHSIZE);
  bamFile in,in2; 
  bamFile out; 
  int paired;//1 if not paired or pair read 1, 2 otherwise
  index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2;

  if (argc != 3) {  
    fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n");  
    return 1;  
  }  
  // Open file and exit if error
  in = bam_open(argv[1], "rb");
  out2stdout = strcmp(argv[2], "-")? 0 : 1; 
  out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); 
  if (in == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  
  if (out == 0) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]);  
    return 1;  
  }  

  unsigned long num_alns=0;
  int ref;  

  // ***********
  // Copy header
  bam_header_t *header;
  header = bam_header_read(in);
  bam_header_write(out,header);

  // sorted by name?
  // Should not rely on the value in SO 
  bam1_t *aln=bam_init1();
  bam1_t *prev=bam_init1();

  if (!out2stdout) {
    fprintf(stderr,"bam_fix_NH version %s\n",VERSION);
    fprintf(stderr,"Processing %s\n",argv[1]);
    fprintf(stderr,"Hashing...\n");fflush(stderr);
  }

  while(bam_read1(in,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    if (aln->core.flag & BAM_FUNMAP) continue;
    if (aln->core.flag & BAM_FREAD2) paired=2;
    else paired=1;
    ++num_alns;
    new_read_aln(ht,fix_read_name(bam1_qname(aln),paired));
    if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns);
  }
  bam_close(in);  
  if(!out2stdout) {
    fprintf(stderr,"%s%lu\n",BACKLINE,num_alns);
    fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns);
    fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024);  
    fprintf(stderr,"Updating entries with NH and printing BAM...\n");
    fflush(stderr);
  }
  // reopen
  in2 = bam_open(argv[1], "rb");
  if (in2 == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  

  header = bam_header_read(in2);
  num_alns=0;
  while(bam_read1(in2,aln)>=0) { // read alignment
    paired=1;
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    if (aln->core.flag & BAM_FUNMAP) continue;
    if (aln->core.flag & BAM_FREAD2) paired=2;
    ++num_alns;
    READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired));

    assert(r!=NULL);
    // update the NH field
    uint8_t *old_nh = bam_aux_get(aln, "NH");    
    int32_t nh=r->ctr;
    if (old_nh) {
      if (nh!=bam_aux2i(old_nh)) {
	fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh);
      }
      bam_aux_del(aln, old_nh);
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      //      printf("!>%s %d\n",bam1_qname(aln),r->ctr);
#endif
    }
    if (!old_nh) { // add NH  
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh));
#endif
    }
    bam_write1(out,aln);
    if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns);
  }
  // 
  bam_destroy1(aln);
  bam_close(in2);  
  bam_close(out);  
  if(!out2stdout) {
    fprintf(stderr,"%s%lu\n",BACKLINE,num_alns);
    fprintf(stderr,"Done.\n");
  }
  return 0;  
}  
Esempio n. 14
0
/**
 * Converts the alignments of a BAM file into per-base signal tracks.
 *
 * Two tracks are written per strand ("Unique" and "UniqueMultiple"): 2 files
 * when P.outWigFlags.strand is false, 4 otherwise.  File names are
 * sigFileName + ".<track>.str<N>.out" + (".bg" for format 0, ".wig" otherwise).
 * Reads whose NH tag is 1 (or missing) count fully in both tracks; reads with
 * NH > 1 add 1/NH to the "UniqueMultiple" track only.
 *
 * @param bamFileName  input BAM; the per-chromosome buffering below relies on
 *                     reads of one reference being contiguous
 * @param sigFileName  prefix of the output file names
 * @param P            parameters; uses outWigFlags (norm, strand, format,
 *                     type) and outWigReferencesPrefix ("-" disables the
 *                     reference-name prefix filter)
 *
 * With P.outWigFlags.norm==1 the file is scanned once beforehand to count
 * unique and multi-mapping reads, and the signal is scaled to per-million.
 */
void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) {

    bam1_t *bamA;
    bamA=bam_init1();

    double nMult=0, nUniq=0;

    if (P.outWigFlags.norm==1) {//count reads in the BAM file
        BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r");
        bam_hdr_t *bamHeader=bam_hdr_read(bamIn);
        while ( true ) {//until the end of file
            int bamBytes1=bam_read1(bamIn, bamA);
            if (bamBytes1<0) break; //end of file
            if (bamA->core.tid<0) continue; //unmapped read
            if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references

            uint8_t* aNHp=bam_aux_get(bamA,"NH");
            if (aNHp!=NULL) {
                uint32_t aNH=bam_aux2i(aNHp);
                if (aNH==1) {//unique mappers
                    ++nUniq;
                } else if (aNH>1) {
                    nMult+=1.0/aNH;
                }
            }
        }
        bgzf_close(bamIn);
    }

    BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r");
    bam_hdr_t *bamHeader=bam_hdr_read(bamIn);

    int sigN=P.outWigFlags.strand ? 4 : 2;

    double *normFactor=new double[sigN];

    ofstream **sigOutAll=new ofstream* [sigN];

    string* sigOutFileName=new string[sigN];
    sigOutFileName[0]=sigFileName+".Unique.str1.out";
    sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out";
    if (P.outWigFlags.strand) {
        sigOutFileName[2]=sigFileName+".Unique.str2.out";
        sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out";
    }

    for (int ii=0; ii<sigN; ii++) {
        sigOutFileName[ii]+= (P.outWigFlags.format==0 ? ".bg" : ".wig");
        sigOutAll[ii]=new ofstream ( sigOutFileName[ii].c_str() );
    }

    // Raw counts by default, so the factors are never left uninitialised
    // when norm is neither 0 nor 1.
    normFactor[0]=1;
    normFactor[1]=1;
    if (P.outWigFlags.norm==1) {//normalized to reads per million
        normFactor[0]=1.0e6 / nUniq;
        normFactor[1]=1.0e6 / (nUniq+nMult);
        for (int is=0;is<sigN;is++) {//formatting double output
            *sigOutAll[is]<<setiosflags(ios::fixed) << setprecision(5);
        }
    }
    if (P.outWigFlags.strand) {
        normFactor[2]=normFactor[0];
        normFactor[3]=normFactor[1];
    }


    int iChr=-999;
    double *sigAll=NULL;
    uint32_t chrLen=0;
    while ( true ) {//until the end of file
        int bamBytes1=bam_read1(bamIn, bamA);
        if (bamA->core.tid!=iChr || bamBytes1<0) {
            //reference changed (or input ended): output the finished chromosome to file
            if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads
                for (int is=0;is<sigN;is++) {
                    if (P.outWigFlags.format==1) {
                        *sigOutAll[is] <<"variableStep chrom="<<bamHeader->target_name[iChr] <<"\n";
                    }
                    double prevSig=0;
                    for (uint32_t ig=0;ig<chrLen;ig++) {
                        double newSig=sigAll[sigN*ig+is];
                        if (P.outWigFlags.format==0) {//bedGraph
                            if (newSig!=prevSig) {
                                if (prevSig!=0) {//finish previous record
                                    *sigOutAll[is] <<ig<<"\t"<<prevSig*normFactor[is] <<"\n"; //1-based end
                                }
                                if (newSig!=0) {
                                    *sigOutAll[is] << bamHeader->target_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning
                                }
                                prevSig=newSig;
                            }
                        } else if (P.outWigFlags.format==1){//wiggle
                            if (newSig!=0) {
                                *sigOutAll[is] <<ig+1<<"\t"<<newSig*normFactor[is] <<"\n";
                            }
                        }
                    }
                }
            }
            if (bamBytes1<0) {//no more reads
                break;
            }

            iChr=bamA->core.tid;
            if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) {
                iChr=-999;
                continue; //reference does not match required references
            }

            chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which should always be 0
            delete [] sigAll;
            sigAll= new double[sigN*chrLen];
            memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen);
        }

        #define BAM_CIGAR_OperationShift 4
        #define BAM_CIGAR_LengthBits 28
        #define BAM_CIGAR_M 0
        #define BAM_CIGAR_I 1
        #define BAM_CIGAR_D 2
        #define BAM_CIGAR_N 3
        #define BAM_CIGAR_S 4
        #define BAM_CIGAR_H 5
        #define BAM_CIGAR_P 6
        #define BAM_CIGAR_EQ 7
        #define BAM_CIGAR_X 8

        //by default, alignments marked as duplicate are not processed
        if ( (bamA->core.flag & 0x400) > 0 ) continue;

        //NH attribute; a missing tag is treated as NH=1
        uint8_t* aNHp=bam_aux_get(bamA,"NH");
        uint32_t aNH = (aNHp==NULL) ? 1 : bam_aux2i(aNHp); //reuse the pointer instead of a second lookup
        if (aNH==0) continue; //do not process lines with NH=0
        uint32_t aG=bamA->core.pos;
        uint32_t iStrand=0;
        if (P.outWigFlags.strand) {//strand for stranded data from SAM flag
            iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/-
        }
        if (P.outWigFlags.type==1) {//5' of the 1st read signal only, RAMPAGE/CAGE
            if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate
            if (iStrand==0) {
                if (aNH==1) {//unique mappers
                    sigAll[aG*sigN+0+2*iStrand]++;
                }
                sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci
                continue; //record only the first position
            }
        }

        uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname);

        for (uint32_t ic=0; ic<bamA->core.n_cigar; ic++) {
            uint32_t cigOp=(cigar[ic]<<BAM_CIGAR_LengthBits)>>BAM_CIGAR_LengthBits;
            uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift;
            switch (cigOp) {
                case(BAM_CIGAR_D):
                case(BAM_CIGAR_N):
                    aG+=cigL;
                    break;
                case(BAM_CIGAR_M):
                    if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate only signal
                        for (uint32_t ig=0;ig<cigL;ig++) {
                            if (aG>=chrLen) {
                                cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n";
                                exit(-1);
                            }
                            if (aNH==1) {//unique mappers
                                sigAll[aG*sigN+0+2*iStrand]++;
                            }
                            sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci
                            aG++;
                        }
                    } else {
                        aG+=cigL;
                    }
            }
        }
        if (P.outWigFlags.type==1) {//type 1 reaching this point has iStrand!=0: record only the last reference position walked by the CIGAR
            --aG;
            if (aNH==1) {//unique mappers
                sigAll[aG*sigN+0+2*iStrand]++;
            }
            sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci
        }
    }
    delete [] sigAll;

    for (int is=0; is<sigN; is++) {// flush/close and release all signal files
        sigOutAll[is]->flush();
        sigOutAll[is]->close();
        delete sigOutAll[is];
    }
    delete [] sigOutAll;
    delete [] sigOutFileName;
    delete [] normFactor;
    bgzf_close(bamIn);
    // NOTE(review): bamA and both BAM headers are still not released here.
}
Esempio n. 15
0
int main(int argc, char *argv[])  
{  
  bamFile in; 
  sqlite3 * db;
  sqlite3_stmt * stmt;
  char * sErrMsg = NULL;
  char * tail = 0;
  int nRetCode;
  char sSQL [BUFFER_SIZE] = "\0";
  char database[BUFFER_SIZE];
  clock_t startClock,startClock2;

  if (argc != 2) {  
    fprintf(stderr, "Usage: bamRindex <in.bam>\n");  
    return 1;  
  }  

  // Open file and exit if error
  //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb");
  //fprintf(stderr,"Options ok\n");
  in = bam_open(argv[1], "rb");
  if (in == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  
  //fprintf(stderr,"BAM opened\n");
  assert(strcpy(database,argv[1])!=NULL);
  assert(strcat(database,".ridx")!=NULL);
  remove(database);
  // ***********
  // Read header
  bam_header_t *header;
  header = bam_header_read(in);
  // sorted by name?
  // Should not rely on the value in SO 
  bam1_t *aln=bam_init1();
  unsigned long num_alns=0;

  /*********************************************/
  /* Open the Database and create the Schema */
  // TODO: check the errors
  sqlite3_open(database, &db);
  sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg); // create the table
  SQLITE_CHECK_ERROR();
  startClock = clock();
  sqlite3_exec(db, "PRAGMA synchronous = 0;", NULL, NULL, &sErrMsg);
  SQLITE_CHECK_ERROR();
  sqlite3_exec(db, "PRAGMA journal_mode = OFF;", NULL, NULL, &sErrMsg);
  SQLITE_CHECK_ERROR();
  // Use up to 8GB of memory
  sqlite3_exec(db, "PRAGMA cache_size = -8000000;", NULL, NULL, &sErrMsg);
  SQLITE_CHECK_ERROR();
  sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &sErrMsg);
  SQLITE_CHECK_ERROR();
  while(bam_read1(in,aln)>=0) { // read alignment
    //aln->core.tid < 0 ? 
    uint8_t *nh = bam_aux_get(aln, "NH");
    uint8_t *nm = bam_aux_get(aln, "NM");
    uint8_t *xs = bam_aux_get(aln, "XS");
    
    BOOLEAN isPrimary;
    BOOLEAN isMapped;
    BOOLEAN notMapped;
    BOOLEAN isDuplicate;
    BOOLEAN isNotPassingQualityControls;
    BOOLEAN isPaired;
    BOOLEAN isSecondMateRead,isProperPair;
    //secondary alignment
    notMapped=(aln->core.flag & BAM_FUNMAP) ? TRUE: FALSE;
    //notMapped=((aln->core.flag & BAM_FUNMAP) || (aln->core.mtid ==0)) ? TRUE: FALSE;
    isMapped=!notMapped;
    isPrimary= (aln->core.flag & BAM_FSECONDARY) ? FALSE:TRUE;
    isProperPair=(aln->core.flag & BAM_FPROPER_PAIR) ? TRUE:FALSE;
    isPaired=(aln->core.flag & BAM_FPAIRED ) ? TRUE:FALSE;
    isSecondMateRead=(aln->core.flag & BAM_FREAD2 ) ? TRUE: FALSE;
    isNotPassingQualityControls=(aln->core.flag & BAM_FQCFAIL ) ? TRUE:FALSE;
    isDuplicate=(aln->core.flag & BAM_FDUP) ? TRUE: FALSE;

    BOOLEAN isSpliced=FALSE;
    BOOLEAN hasSimpleCigar=TRUE;
    int nSpliced=0;
    int i;
    if (aln->core.n_cigar != 0) {
      for (i = 0; i < aln->core.n_cigar; ++i) {
	char l="MIDNSHP=X"[bam1_cigar(aln)[i]&BAM_CIGAR_MASK];
	//fprintf(stderr,"%c",l);
	if ( l == 'N' ) { isSpliced=TRUE; hasSimpleCigar=FALSE;++nSpliced;}	  
	if ( l != 'M' && l!='=' ) {  hasSimpleCigar=FALSE;}	  
      }
    } 
    //fprintf(stderr,"read %ld\n",num_alns);
    // isDuplicate,isNotPassingQualityControls,
    // isSpliced,isPAired,isPrimary,hasSimpleCigar,isSecondMateRead,isProperPair,nh,nm,qual/mapq,xs
    sprintf(sSQL,"INSERT into bam_index values (%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,'%c')",
	   isDuplicate,isNotPassingQualityControls,
	   nSpliced,isPaired,isPrimary,isMapped,hasSimpleCigar,isSecondMateRead,isProperPair,
	   (nh==0?0:bam_aux2i(nh)),(nm==0?0:bam_aux2i(nm)),
	    aln->core.qual,
	    (xs==0?' ':(bam_aux2A(xs)==0?' ':bam_aux2A(xs))));
    sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg);
    SQLITE_CHECK_ERROR();
    ++num_alns;
    PRINT_ALNS_PROCESSED(num_alns);
  }
  bam_close(in);  
  sqlite3_exec(db, "END TRANSACTION;", NULL, NULL, &sErrMsg);
  SQLITE_CHECK_ERROR();
  printf("\nImported %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock))/CLOCKS_PER_SEC);
  // Create the indexes
  startClock2 = clock();
  // generating the indexes does not pay off
  //sqlite3_exec(db, INDEXES, NULL, NULL, &sErrMsg);
  //printf("Indexed %d records in %4.2f seconds\n", num_alns, ( (double) (clock() - startClock2))/CLOCKS_PER_SEC);
  printf("Total time: %4.2f seconds\n", ((double)(clock() - startClock))/CLOCKS_PER_SEC);
  sqlite3_close(db);
  return 0;  
}  
Esempio n. 16
0
/*
 * Recomputes the mismatch count (NM) and the MD string of alignment b against
 * the reference sequence and optionally rewrites the record.
 *
 *   b        alignment to examine and modify in place
 *   ref      reference bases, indexed by 0-based reference position
 *   ref_len  number of usable entries in ref; the walk also stops at a '\0'
 *   flag     bit set of:
 *              USE_EQUAL  matching read bases get their 4-bit code cleared to 0
 *              UPDATE_NM  add/replace the NM tag (mapped reads only)
 *              UPDATE_MD  add/replace the MD tag (mapped reads only)
 *              DROP_TAG   drop every tag except RG
 *              BIN_QUAL   qualities >= 3 become qual/10*10 + 7
 *   max_nm   when > 0 and the computed NM is >= max_nm, every matching base
 *            has its 4-bit code set to 15 and its quality set to 0
 *
 * A message is printed to stderr when an existing NM or MD tag differs from
 * the recomputed value.
 */
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    // Stack-allocated buffer: no separate allocation that could fail unchecked.
    kstring_t str = { 0, 0, NULL };
    int32_t old_nm_i = -1, nm = 0;

    // Walk the CIGAR: x is the reference position, y the read position and
    // u the current run of matching bases.
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                // Index through unsigned char: a negative plain char must not
                // become a negative table index.
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, &str);
                    kputc(ref[x+j], &str);
                    u = 0;
                    ++nm;
                }
            }
            if (j < l) break;
            x += l;
            y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, &str);
            kputc('^', &str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], &str);
            }
            u = 0;
            x += j;
            nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, &str);
    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l;
                y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }
    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }
    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
        else {
            // Case-insensitive comparison of the stored MD text (after the
            // type byte) with the recomputed one.
            int is_diff = 0;
            if (strlen((char*)old_md+1) == str.l) {
                for (i = 0; i < str.l; ++i)
                    if (toupper(old_md[i+1]) != toupper((unsigned char)str.s[i]))
                        break;
                if (i < str.l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }
    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str.s);
}
Esempio n. 17
0
File: sam.c Progetto: atks/vt
/*
 * Test of the auxiliary-tag API on a single alignment record.
 *
 * The record is supplied inline through a "data:," string handed to
 * sam_open().  After reading it, the function:
 *   - checks every scalar and B-array tag through the bam_aux* accessors,
 *   - deletes XA, rewrites XZ, appends N0..N5 / F1 and retypes F2,
 *   - formats the record with sam_format1() and compares it with the
 *     expected canonical text in r1.
 *
 * Problems are reported through fail(); the function itself always
 * returns 1.
 */
static int aux_fields1(void)
{
    static const char sam[] = "data:,"
"@SQ\tSN:one\tLN:1000\n"
"@SQ\tSN:two\tLN:500\n"
"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n";

    // Canonical form of the alignment record above, as output by sam_format1()
    // after the edits made below (XA removed, XZ replaced, F2 retyped, N* / F1 appended).
    static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242";

    samFile *in = sam_open(sam, "r");
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    uint8_t *p;
    kstring_t ks = { 0, 0, NULL };
    // Expected contents of the B0..B5 integer array tags in the input record.
    int64_t b0vals[5] = { -2147483648LL,-1,0,1,2147483647LL }; // i
    int64_t b1vals[4] = { 0,1,2147483648LL,4294967295LL };     // I
    int64_t b2vals[5] = { -32768,-1,0,1,32767 };           // s
    int64_t b3vals[4] = { 0,1,32768,65535 };               // S
    int64_t b4vals[5] = { -128,-1,0,1,127 };               // c
    int64_t b5vals[4] = { 0,1,127,255 };                   // C
    // NB: Floats not doubles below!
    // See https://randomascii.wordpress.com/2012/06/26/doubles-are-not-floats-so-dont-compare-them/
    float bfvals[2] = { -3.14159f, 2.71828f };

    // Successive contents written to the N4 array tag, one per element type.
    int8_t n4v1[] = { -128, -64, -32, -16, -8, -4, -2, -1,
                      0, 1, 2, 4, 8, 16, 32, 64, 127 };
    uint32_t n4v2[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1234, 5678, 1U << 31, 0 };
    int16_t n4v3[] = { -32768, -1, 0, 1, 32767 };
    float n4v4[] = { 0, 1, 2, 10, 20, 30, 1.5, -2.5 };
    uint8_t n4v5[] = { 0, 255 };
    int32_t n4v6[] = { -2147483647 - 1, 10, -1, 0, 1, 2147483647 };
    uint16_t n4v7[] = { 65535, 32768, 1, 0 };

    int32_t ival = -1234;
    uint32_t uval = 1234;
    float f1 = 4.5678;
    float f2 = 9.8765;

    size_t nvals, i;

    if (sam_read1(in, header, aln) >= 0) {
        if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
            fail("XA field is '%c', expected 'k'", bam_aux2A(p));

        // check_bam_aux_get() may have handed back NULL; only delete a tag
        // that was actually located.
        if (p)
            bam_aux_del(aln,p);
        if (bam_aux_get(aln,"XA"))
            fail("XA field was not deleted");

        if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
            fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));

        if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6)
            fail("Xf field is %.12f, expected pi", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6)
            fail("Xd field is %.12f, expected e", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO);

        // Replace the string tag in place and read it back.
        bam_aux_update_str(aln,"XZ",strlen(NEW_HELLO)+1,NEW_HELLO);
        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), NEW_HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), NEW_HELLO);


        if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0)
            fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF);

        // Raw layout check of XB: 'B', subtype 'c', 4-byte count (3), then
        // the three int8 values -2, 0, 2.
        if ((p = check_bam_aux_get(aln, "XB", 'B'))
            && ! (memcmp(p, "Bc", 2) == 0
                  && memcmp(p + 2, "\x03\x00\x00\x00\xfe\x00\x02", 7) == 0))
            fail("XB field is %c,..., expected c,-2,0,+2", p[1]);

        check_int_B_array(aln, "B0", NELE(b0vals), b0vals);
        check_int_B_array(aln, "B1", NELE(b1vals), b1vals);
        check_int_B_array(aln, "B2", NELE(b2vals), b2vals);
        check_int_B_array(aln, "B3", NELE(b3vals), b3vals);
        check_int_B_array(aln, "B4", NELE(b4vals), b4vals);
        check_int_B_array(aln, "B5", NELE(b5vals), b5vals);

        nvals = NELE(bfvals);
        if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) {
            if (bam_auxB_len(p) != nvals)
                fail("Wrong length reported for Bf field, got %d, expected %zu\n",
                     bam_auxB_len(p), nvals);

            for (i = 0; i < nvals; i++) {
                if (bam_auxB2f(p, i) != bfvals[i]) {
                    fail("Wrong value from bam_auxB2f for Bf field index %zu, "
                         "got %f expected %f\n",
                         i, bam_auxB2f(p, i), bfvals[i]);
                }
            }
        }

        if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000)
            fail("ZZ field is %"PRId64", expected 1000000", bam_aux2i(p));

        // Integer boundary values Y1..Y8
        if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1)
            fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647)
            fail("Y2 field is %"PRId64", expected -2^31+1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1)
            fail("Y3 field is %"PRId64", expected -1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0)
            fail("Y4 field is %"PRId64", expected 0", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1)
            fail("Y5 field is %"PRId64", expected 1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647)
            fail("Y6 field is %"PRId64", expected 2^31-1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648LL)
            fail("Y7 field is %"PRId64", expected 2^31", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295LL)
            fail("Y8 field is %"PRId64", expected 2^32-1", bam_aux2i(p));

        // Try appending some new tags
        if (bam_aux_append(aln, "N0", 'i', sizeof(ival), (uint8_t *) &ival) != 0)
            fail("Failed to append N0:i tag");

        if ((p = bam_aux_get(aln, "N0")) && bam_aux2i(p) != ival)
            fail("N0 field is %"PRId64", expected %d", bam_aux2i(p), ival);

        if (bam_aux_append(aln, "N1", 'I', sizeof(uval), (uint8_t *) &uval) != 0)
            fail("failed to append N1:I tag");

        if ((p = bam_aux_get(aln, "N1")) && bam_aux2i(p) != uval)
            fail("N1 field is %"PRId64", expected %u", bam_aux2i(p), uval);

        // Append tags with bam_aux_update_int()
        if (bam_aux_update_int(aln, "N2", -2) < 0)
            fail("failed to append N2:c tag");

        if (bam_aux_update_int(aln, "N3", 3) < 0)
            fail("failed to append N3:C tag");

        p = bam_aux_get(aln, "N2");
        if (!p)
            fail("failed to retrieve N2 tag");
        else if (*p != 'c' || bam_aux2i(p) != -2)
            fail("N2 field is %c:%"PRId64", expected c:-2", *p, bam_aux2i(p));

        p = bam_aux_get(aln, "N3");
        if (!p)
            fail("failed to retrieve N3 tag");
        else if (*p != 'C' || bam_aux2i(p) != 3)
            fail("N3 field is %c:%"PRId64", expected C:3", *p, bam_aux2i(p));

        // Try changing values with bam_aux_update_int().
        // Each step records its result so the chain stops at the first failure.
        i = test_update_int(aln, "N2", 2, 'C', "N3", 3, 'C');
        if (i == 0) i = test_update_int(aln, "N2", 1234, 'S', "N3", 3, 'C');
        if (i == 0) i = test_update_int(aln, "N2", -1, 's', "N3", 3, 'C');
        if (i == 0) i = test_update_int(aln, "N2", 4294967295U, 'I', "N3", 3, 'C');
        if (i == 0) i = test_update_int(aln, "N2", -2, 'i', "N3", 3, 'C');

        // Append a value with bam_aux_update_float()
        if (bam_aux_update_float(aln, "F1", f1) < 0)
            fail("append F1:f tag");

        p = bam_aux_get(aln, "F1");
        if (!p)
            fail("retrieve F1 tag");
        else if (*p != 'f' || bam_aux2f(p) != f1)
            fail("F1 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f1);

        // Change a double tag to a float
        if (bam_aux_update_float(aln, "F2", f2) < 0)
            fail("update F2 tag");

        p = bam_aux_get(aln, "F2");
        if (!p)
            fail("retrieve F2 tag");
        else if (*p != 'f' || bam_aux2f(p) != f2)
            fail("F2 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f2);

        // Check the next one is intact too: either a wrong type or a wrong
        // value means the neighbouring tag was damaged by the resize.
        p = bam_aux_get(aln, "Y1");
        if (!p)
            fail("retrieve Y1 tag");
        else if (*p != 'i' || bam_aux2i(p) != -2147483647-1)
            fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p));

        // bam_aux_update_array tests
        // append a new array
        i = test_update_array(aln, "N4", 'c', NELE(n4v1), n4v1, "\0\0", 0, 0);

        // Add a sentinel to check resizes work
        if (i == 0) i = test_update_int(aln, "N5", 4242, 'S', "\0\0", 0, 0);

        // alter the array tag a few times
        if (i == 0)
            i = test_update_array(aln, "N4", 'I', NELE(n4v2), n4v2,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 's', NELE(n4v3), n4v3,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'f', NELE(n4v4), n4v4,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'c', NELE(n4v5), n4v5,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'i', NELE(n4v6), n4v6,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'S', NELE(n4v7), n4v7,
                                  "N5", 4242, 'S');

        // Final round trip: the edited record must format to exactly r1.
        if (sam_format1(header, aln, &ks) < 0)
            fail("can't format record");

        if (strcmp(ks.s, r1) != 0)
            fail("record formatted incorrectly: \"%s\"", ks.s);

        free(ks.s);
    }
    else fail("can't read record");

    bam_destroy1(aln);
    bam_hdr_destroy(header);
    sam_close(in);

    return 1;
}
Esempio n. 18
0
// Adapted from bam_md.c in SAMtools.
// Differences from the original: a missing NM tag is not added (an existing
// one is only corrected), and the reference is indexed from ref[0] rather
// than from c->pos.
//
// Rebuilds the MD string for alignment b by walking its CIGAR against ref,
// then replaces the MD tag when it is absent or differs (ignoring case), and
// replaces an existing NM tag whose value differs from the recomputed edit
// count.  ref is scanned until a 0 byte; the walk stops early if that
// terminator is reached inside a match or deletion.
static void 
tmap_sam_md1_core(bam1_t *b, char *ref)
{
  uint8_t *seq = bam1_seq(b);
  uint32_t *cigar = bam1_cigar(b);
  bam1_core_t *c = &b->core;
  int i, x, y, u = 0;  // x: offset in ref, y: offset in read, u: current run of matches
  // The string header lives on the stack, so there is no allocation result
  // to check; only the character buffer str.s is heap-managed.
  kstring_t str = { 0, 0, NULL };
  uint8_t *old_md, *old_nm;
  int32_t old_nm_i=-1, nm=0;
  size_t k;

  for (i = y = x = 0; i < c->n_cigar; ++i) {
      int j, l = cigar[i]>>4, op = cigar[i]&0xf;
      if (op == BAM_CMATCH) {
          for (j = 0; j < l; ++j) {
              int z = y + j;
              int c1, c2;
              if (ref[x+j] == 0) break; // out of boundary
              c1 = bam1_seqi(seq, z);
              // Index through unsigned char so that a byte >= 0x80 on a
              // signed-char platform cannot yield a negative table index.
              c2 = bam_nt16_table[(unsigned char)ref[x+j]];
              if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                  ++u;
              } else {
                  // mismatch: flush the match run, then the reference base
                  ksprintf(&str, "%d", u);
                  kputc(ref[x+j], &str);
                  u = 0; 
                  nm++;
              }
          }
          if (j < l) break;
          x += l; y += l;
      } else if (op == BAM_CDEL) {
          // deletion: "<run>^<deleted reference bases>"
          ksprintf(&str, "%d", u);
          kputc('^', &str);
          for (j = 0; j < l; ++j) {
              if (ref[x+j] == 0) break;
              kputc(ref[x+j], &str);
          }
          u = 0;
          if (j < l) break;
          x += l; 
          nm += l;
      } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
          y += l;
          if (op == BAM_CINS) nm += l;
      } else if (op == BAM_CREF_SKIP) {
          x += l;
      }
  }
  // trailing match run (always written, even when it is 0)
  ksprintf(&str, "%d", u);

  // update MD
  old_md = bam_aux_get(b, "MD");
  if(NULL == old_md) {
      bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
  }
  else {
      // old_md points at the type byte; the string itself starts at old_md+1
      int is_diff = 0;
      if(strlen((char*)old_md+1) == str.l) {
          for(k = 0; k < str.l; ++k) {
            // toupper() requires a value representable as unsigned char
            if(toupper(old_md[k+1]) != toupper((unsigned char)str.s[k])) {
              break;
            }
          }
          if(k < str.l) {
              is_diff = 1;
          }
      } 
      else {
          is_diff = 1;
      }
      if(1 == is_diff) {
          bam_aux_del(b, old_md);
          bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s);
      }
  }

  // update NM (only when the tag already exists and is wrong)
  old_nm = bam_aux_get(b, "NM");
  if(NULL != old_nm) {
      old_nm_i = bam_aux2i(old_nm);
      if(old_nm_i != nm) {
          bam_aux_del(b, old_nm);
          bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
      }
  }

  free(str.s);
}
Esempio n. 19
0
int main(int argc, char *argv[])  
{  
  hashtable ht=new_hashtable(HASHSIZE);
  bamFile in,in2; 
  bamFile out; 
  
  if (argc != 3) {  
    fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n");  
    return 1;  
  }  
  
  // Open file and exit if error
  //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb");
  in = bam_open(argv[1], "rb");
  out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); 
  if (in == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  
  if (out == 0) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]);  
    return 1;  
  }  

  unsigned long num_alns=0;
  int ref;  

  // ***********
  // Copy header
  bam_header_t *header;
  header = bam_header_read(in);
  bam_header_write(out,header);

  // sorted by name?
  // Should not rely on the value in SO 
  bam1_t *aln=bam_init1();
  bam1_t *prev=bam_init1();

  printf("Hashing...\n");flush(stdout);
  while(bam_read1(in,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    ++num_alns;
    new_read_aln(ht,bam1_qname(aln));
  }
  bam_close(in);  
  printf("Hashing complete (%lu alignments)\n",num_alns);
  printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024);  
  flush(stdout);
  // reopen
  in2 = bam_open(argv[1], "rb");
  if (in2 == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  

  header = bam_header_read(in2);
  
  while(bam_read1(in2,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    ++num_alns;
    
    READ_ALN *r=get_read_aln(ht,bam1_qname(aln));

    //assert(r!=NULL);
    // update the NH field
    uint8_t *old_nh = bam_aux_get(aln, "NH");    
    uint8_t nh=r->ctr;
    if (old_nh) {
      if (nh!=bam_aux2i(old_nh)) {
	fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh);
      }
      bam_aux_del(aln, old_nh);
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
    }
    if (!old_nh) { // add NH  
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh));
#endif
    }
    // in->header
    // Also fix the XS:A tag
    // BAM_FREAD1
    // BAM_FREAD2
    // BAM_FREVERSE the read is mapped to the reverse strand 
    //bam1_cigar(b) 
      //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment)
      //BAM_FREVERSE 16 the read is mapped to the reverse strand
    if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments
    if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired
    if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair
    if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped
    if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read
    if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once
    // core.strand == 0 (f/+) 1 r/-
    // flag
    // bam1_qname(b)
    bam_write1(out,aln);
  }
  // 
  bam_destroy1(aln);
  bam_close(in2);  
  bam_close(out);  
  return 0;  
/*
uint8_t *old_nm = bam_aux_get(b, "NM");
90 	if (c->flag & BAM_FUNMAP) return;
91 	if (old_nm) old_nm_i = bam_aux2i(old_nm);
92 	if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
93 	else if (nm != old_nm_i) {
94 	fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
95 	bam_aux_del(b, old_nm);
96 	bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
97 	}
*/
}  
bool ReadAlign::chimericDetection() {

    bool chimRecord=false;
    //output chains for out-of-STAR chimeric detection
    #ifdef OUTPUT_localChains
    {
        P->inOut->outLocalChains << readName <<"\t"<< Read0[0] <<"\t"<< Read0[1] << "\n";
        for (uint iw=0; iw<nW; iw++) {
            for (uint itr=0;itr<nWinTr[iw];itr++) {
                P->inOut->outLocalChains << trAll[iw][itr]->maxScore<<"\t"<< trAll[iw][itr]->Chr<<"\t"<<trAll[iw][itr]->Str<<"\t"<<trAll[iw][itr]->nExons;
                for (uint ib=0;ib<trAll[iw][itr]->nExons;ib++) {                    
                    P->inOut->outLocalChains <<"\t"<< trAll[iw][itr]->exons[ib][EX_G]-P->chrStart[trAll[iw][itr]->Chr] \
                                             <<"\t"<< trAll[iw][itr]->exons[ib][EX_R] <<"\t"<< trAll[iw][itr]->exons[ib][EX_L];
                };
                P->inOut->outLocalChains <<"\n";
            };
        };
    };
    #endif
    //////////////////// chimeras
    //stich windows => chimeras
    //stich only the best window with one of the lower score ones for now - do not stich 2 lower score windows
    //stitch only one window on each end of the read

    if (P->chimSegmentMin>0 && nW>1 && trBest->rLength >= P->chimSegmentMin \
            && ( trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] + P->chimSegmentMin <= Lread \
              || trBest->exons[0][EX_R] >= P->chimSegmentMin ) \
             && trBest->nextTrScore+P->outFilterMultimapScoreRange < trBest->maxScore \
             && trBest->intronMotifs[0]==0 && (trBest->intronMotifs[1]==0 || trBest->intronMotifs[2]==0) ) { 
            //there is unmapped space at the start/end, and the main window is not a multimapping window, and non non-canonical junctions, and consistend junction motif
        int chimScoreBest=0,chimScoreNext=0;
        trChim[0]=*trBest;

        uint roStart1=trBest->Str==0 ? trBest->exons[0][EX_R] : Lread - trBest->exons[trBest->nExons-1][EX_R] - trBest->exons[trBest->nExons-1][EX_L];
        uint roEnd1=trBest->Str==0 ? trBest->exons[trBest->nExons-1][EX_R] + trBest->exons[trBest->nExons-1][EX_L] - 1 : Lread - trBest->exons[0][EX_R] - 1;
        if (roStart1>readLength[0]) roStart1--;
        if (roEnd1>readLength[0]) roEnd1--;

        uint chimStrBest=0;
        if (trBest->intronMotifs[1]==0 && trBest->intronMotifs[2]==0) {//strand is undefined
            chimStr=0;
        } else if ( (trBest->Str==0) == (trBest->intronMotifs[1]>0)) {//strand the same as RNA
            chimStr=1;
        } else {//strand opposite to RNA
            chimStr=2;
        };

        for (uint iW=0; iW<nW; iW++) {//check all other windows for chimeras
            for (uint iWt=0; iWt<nWinTr[iW]; iWt++){//cycl over transcripts in the window    
                if (trBest!=trAll[iW][0] && iWt>0) break; //for all windows except that of the best transcript - hceck only iWt=0 (best trnascripts)
                if (trBest==trAll[iW][0] && iWt==0) continue;
    //                 {//same window
    //                     if (iWt==0) continue; //do not check the best transcript itself
    //                     if (trBest->exons[0][EX_R]<=trAll[iW][iWt]->exons[0][EX_R]) {
    //                         //start of the last Best exon is before end of the first Chim exon
    //                         if (trBest->exons[trBest->nExons-1][EX_G]<trAll[iW][iWt]->exons[0][EX_G]+trAll[iW][iWt]->exons[0][EX_L]) continue;
    //                     } else {
    //                         if (trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_G]<trBest->exons[0][EX_G]+trBest->exons[0][EX_L]) continue;                        
    //                     };
    //                 };

                if (trAll[iW][iWt]->intronMotifs[0]>0) continue; //do not stitch a window to itself, or to a window with non-canonical junctions
                uint chimStr1;
                if (trAll[iW][iWt]->intronMotifs[1]==0 && trAll[iW][iWt]->intronMotifs[2]==0) {//strand is undefined
                    chimStr1=0;
                } else if ( (trAll[iW][iWt]->Str==0) == (trAll[iW][iWt]->intronMotifs[1]>0)) {//strand the same as RNA
                    chimStr1=1;
                } else {//strand opposite to RNA
                    chimStr1=2;
                };            

                if (chimStr!=0 && chimStr1!=0 && chimStr!=chimStr1) continue; //chimeric segments have to have consitent strands

                uint roStart2=trAll[iW][iWt]->Str==0 ? trAll[iW][iWt]->exons[0][EX_R] : Lread - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] - trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L];
                uint roEnd2=trAll[iW][iWt]->Str==0 ? trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_R] + trAll[iW][iWt]->exons[trAll[iW][iWt]->nExons-1][EX_L] - 1 : Lread - trAll[iW][iWt]->exons[0][EX_R] - 1;
                if (roStart2>readLength[0]) roStart2--;
                if (roEnd2>readLength[0]) roEnd2--;          

                uint chimOverlap = roStart2>roStart1 ?  (roStart2>roEnd1 ? 0 : roEnd1-roStart2+1) : (roEnd2<roStart1 ? 0 : roEnd2-roStart1+1);
                bool diffMates=(roEnd1 < readLength[0] && roStart2 >= readLength[0]) || (roEnd2 < readLength[0] && roStart1 >= readLength[0]);

                //segment lengths && (different mates || small gap between segments)
                if (roEnd1 > P->chimSegmentMin + roStart1 + chimOverlap && roEnd2> P->chimSegmentMin + roStart2 + chimOverlap  \
                    && ( diffMates || ( (roEnd1 + P->maxChimReadGap + 1) >= roStart2 && (roEnd2 + P->maxChimReadGap + 1) >= roStart1 ) ) ) {
                                           //maxChimReadGap=0 in Parameters.cpp

                    int chimScore=trBest->maxScore + trAll[iW][iWt]->maxScore - (int)chimOverlap; //subtract overlap to avoid double counting

                    if (chimScore > chimScoreBest && chimScore >= P->chimScoreMin && chimScore+P->chimScoreDropMax >= (int) (readLength[0]+readLength[1]) ) {
                        trChim[1]=*trAll[iW][iWt];                                      
                        chimScoreNext=chimScoreBest;
                        chimScoreBest=chimScore;
                        trChim[1].roStart = trChim[1].roStr ==0 ? trChim[1].rStart : Lread - trChim[1].rStart - trChim[1].rLength;
                        trChim[1].cStart  = trChim[1].gStart - P->chrStart[trChim[1].Chr];      
                        chimStrBest=chimStr1;
                    } else if (chimScore>chimScoreNext) {//replace the nextscore if it's not the best one and is higher than the previous one
                        chimScoreNext=chimScore;              
                    };
                };
            };//cycle over window transcripts
        };//cyecl over windows

        if (chimStr==0) chimStr=chimStrBest;

        chimN=0;
        if (chimScoreNext + P->chimScoreSeparation < chimScoreBest) {//report only if chimera is unique

            if (trChim[0].roStart > trChim[1].roStart) swap (trChim[0],trChim[1]);

            uint e0 = trChim[0].Str==1 ? 0 : trChim[0].nExons-1;
            uint e1 = trChim[1].Str==0 ? 0 : trChim[1].nExons-1;

            uint chimRepeat0=0,chimRepeat1=0,chimJ0=0,chimJ1=0;
            int chimMotif=0;
            chimN=2;      
            if ( trChim[0].exons[e0][EX_iFrag] > trChim[1].exons[e1][EX_iFrag] ) {//strange configuration, rare, similar to the next one
                chimN=0;//reject such chimeras
                //good test example: 
                //CTTAGCTAGCAGCGTCTTCCCAGTGCCTGGAGGGCCAGTGAGAATGGCACCCTCTGGGATTTTTGCTCCTAGGTCT
                //TTGAGGTGAAGTTCAAAGATGTGGCTGGCTGTGAGGAGGCCGAGCTAGAGATCATGGAATTTGTGAATTTCTTGAA
            } else if ( trChim[0].exons[e0][EX_iFrag] < trChim[1].exons[e1][EX_iFrag] ) {//mates bracket the chimeric junction
                chimN=2;
                chimRepeat=0;
                chimMotif=-1;
                if (trChim[0].Str==1) {//negative strand                    
                    chimJ0=trChim[0].exons[e0][EX_G]-1;
                } else {
                    chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L];                            
                };            
                if (trChim[1].Str==0) {//positive strand                    
                    chimJ1=trChim[1].exons[e1][EX_G]-1;
                } else {
                    chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L];                            
                };                    
            } else {//chimeric junctions is within one of the mates, check and shift chimeric junction if necessary
                if (trChim[0].exons[e0][EX_L]>=P->chimJunctionOverhangMin && trChim[1].exons[e1][EX_L]>=P->chimJunctionOverhangMin ) {//large enough overhang required
                    uint roStart0 = trChim[0].Str==0 ? trChim[0].exons[e0][EX_R] : Lread - trChim[0].exons[e0][EX_R] - trChim[0].exons[e0][EX_L];
                    uint roStart1 = trChim[1].Str==0 ? trChim[1].exons[e1][EX_R] : Lread - trChim[1].exons[e1][EX_R] - trChim[1].exons[e1][EX_L];

                    uint jR, jRbest=0;
                    int jScore=0,jMotif=0,jScoreBest=-999999,jScoreJ=0;
                    for (jR=0; jR<roStart1+trChim[1].exons[e1][EX_L]-roStart0-1; jR++) {//scan through the exons to find a canonical junction, and check for mismatches

                        if (jR==readLength[0]) jR++; //skip the inter-mate base

                        char bR=Read1[0][roStart0+jR];

                        char b0,b1;
                        if (trChim[0].Str==0) {
                            b0=G[trChim[0].exons[e0][EX_G]+jR];
                        } else {
                            b0=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR];
                            if (b0<4) b0=3-b0;
                        };

                        if (trChim[1].Str==0) {
                            b1=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR];
                        } else {
                            b1=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR];
                            if (b1<4) b1=3-b1;
                        };

                        if (b0>3 || b1>3 || bR>3) {//chimera is not called if there are Ns in the genome or in the read
                            chimN=0;
                            break;
                        };

                        char b01,b02,b11,b12;
                        if (trChim[0].Str==0) {
                            b01=G[trChim[0].exons[e0][EX_G]+jR+1];
                            b02=G[trChim[0].exons[e0][EX_G]+jR+2];                                
                        } else {
                            b01=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-1];
                            if (b01<4) b01=3-b01;
                            b02=G[trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L]-1-jR-2];
                            if (b02<4) b02=3-b02;                                
                        };      
                        if (trChim[1].Str==0) {
                            b11=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR-1];
                            b12=G[trChim[1].exons[e1][EX_G]-roStart1+roStart0+jR];                                
                        } else {
                            b11=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR+1];
                            if (b11<4) b11=3-b11;
                            b12=G[trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L]-1+roStart1-roStart0-jR];
                            if (b12<4) b12=3-b12;                                
                        };        

                        jMotif=0;                        
                        if (b01==2 && b02==3 && b11==0 && b12==2) {//GTAG
                            if (chimStr!=2) {
                                jMotif=1;
                            };
                        } else if(b01==1 && b02==3 && b11==0 && b12==1) {//CTAC
                            if (chimStr!=1) {
                                jMotif=2;
                            };            
                        };  

                        if (bR==b0 && bR!=b1) {
                            jScore++;
                        } else if (bR!=b0 && bR==b1) {
                            jScore--;
                        };

                        jScoreJ =jMotif==0 ? jScore +  P->chimScoreJunctionNonGTAG : jScore ;

                        if ( jScoreJ > jScoreBest || (jScoreJ == jScoreBest && jMotif>0) ) {
                            chimMotif=jMotif;
                            jRbest=jR;
                            jScoreBest=jScoreJ;
                        };
                    };//jR cycle
                    if (chimN>0) {//else the chimera was rejected because of mismatches

                        //shift junction in trChim
                        if (trChim[0].Str==1) {
                            trChim[0].exons[e0][EX_R] +=trChim[0].exons[e0][EX_L]-jRbest-1;
                            trChim[0].exons[e0][EX_G] +=trChim[0].exons[e0][EX_L]-jRbest-1;
                            trChim[0].exons[e0][EX_L]=jRbest+1;                        
                            chimJ0=trChim[0].exons[e0][EX_G]-1;
                        } else {
                            trChim[0].exons[e0][EX_L]=jRbest+1;
                            chimJ0=trChim[0].exons[e0][EX_G]+trChim[0].exons[e0][EX_L];                            
                        };                            

                        if (trChim[1].Str==0) {
                            trChim[1].exons[e1][EX_R] +=roStart0+jRbest+1-roStart1;
                            trChim[1].exons[e1][EX_G] +=roStart0+jRbest+1-roStart1;
                            trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1;  
                            chimJ1=trChim[1].exons[e1][EX_G]-1;
                        } else {
                            trChim[1].exons[e1][EX_L]=roStart1+trChim[1].exons[e1][EX_L]-roStart0-jRbest-1;  
                            chimJ1=trChim[1].exons[e1][EX_G]+trChim[1].exons[e1][EX_L];  
                        };
                        //find repeats
                        char b0,b1;
                        uint jR;
                        for (jR=0;jR<100;jR++) {//forward check
                            if (trChim[0].Str==0) {
                                b0=G[chimJ0+jR];
                            } else {
                                b0=G[chimJ0-jR];
                                if (b0<4) b0=3-b0;
                            };

                            if (trChim[1].Str==0) {
                                b1=G[chimJ1+1+jR];
                            } else {
                                b1=G[chimJ1-1-jR];
                                if (b1<4) b1=3-b1;
                            };
                            if (b0!=b1) break;
                        };
                        chimRepeat1=jR;
                        for (jR=0;jR<100;jR++) {//reverse check
                            if (trChim[0].Str==0) {
                                b0=G[chimJ0-1-jR];
                            } else {
                                b0=G[chimJ0+1+jR];
                                if (b0<4) b0=3-b0;
                            };

                            if (trChim[1].Str==0) {
                                b1=G[chimJ1-jR];
                            } else {
                                b1=G[chimJ1+jR];
                                if (b1<4) b1=3-b1;
                            };
                            if (b0!=b1) break;
                        };                        
                        chimRepeat0=jR;
                    };//chimN>0
                };//large enough overhang
            };//chimeric junction is within a mate

            //debug
    //             cout << readName <<"\t"<< (trChim[0].Str==0 ? chimJ1-chimJ0 : chimJ0-chimJ1) << "\t"<< (chimMotif>=0 ? P->alignIntronMax :  P->alignMatesGapMax)<<"\n";
    //             cout <<  chimRepeat0 <<"\t"<<trChim[0].exons[e0][EX_L]<<"\n";
            //chimeric alignments output
            if ( chimN==2 && trChim[0].exons[e0][EX_L]>=P->chimJunctionOverhangMin+chimRepeat0 \
                    && trChim[1].exons[e1][EX_L]>=P->chimJunctionOverhangMin+chimRepeat1 \
                    && ( trChim[0].Str!=trChim[1].Str ||  trChim[0].Chr!=trChim[1].Chr \
                    || (trChim[0].Str==0 ? chimJ1-chimJ0+1LLU : chimJ0-chimJ1+1LLU) > (chimMotif>=0 ? P->alignIntronMax :  P->alignMatesGapMax) ) )
            {//unique chimeras only && minOverhang1 
             //&& minOverhang2
             //&& (diff str || diff chr || 
             //|| gap > (alignIntronMax,alignMatesGapMax) ) negative gap = very large # because of uint
                        
                chimRecord=true; //chimeric alignment was recorded

                //re-calculate the score for chimeric transcripts
                trChim[0].alignScore(Read1, G, P);
                trChim[1].alignScore(Read1, G, P);
                
                int chimRepresent=-999, chimType=0;        
                if (trChim[0].exons[0][EX_iFrag]!=trChim[0].exons[trChim[0].nExons-1][EX_iFrag]) {//tr0 has both mates
                    chimRepresent = 0;
                    chimType = 1;
                    trChim[0].primaryFlag=true;//paired portion is primary
                    trChim[1].primaryFlag=false;
                } else if (trChim[1].exons[0][EX_iFrag]!=trChim[1].exons[trChim[1].nExons-1][EX_iFrag]) {//tr1 has both mates
                    chimRepresent = 1;
                    chimType = 1;
                    trChim[1].primaryFlag=true;//paired portion is primary
                    trChim[0].primaryFlag=false;                    
                } else if (trChim[0].exons[0][EX_iFrag]!=trChim[1].exons[0][EX_iFrag]) {//tr0 and tr1 are single different mates 
                    chimRepresent = -1;
                    chimType = 2;
                    trChim[0].primaryFlag=true;
                    trChim[1].primaryFlag=true;
                } else  {//two chimeric segments are on the same mate - this can only happen for single-end reads
                    chimRepresent = (trChim[0].maxScore > trChim[1].maxScore) ? 0 : 1;
                    chimType = 3;
                    trChim[chimRepresent].primaryFlag=true;
                    trChim[1-chimRepresent].primaryFlag=false;                    
                };
                
                if (P->chimOutType=="WithinBAM") {//BAM output
                    int alignType, bamN=0, bamIsuppl=-1, bamIrepr=-1;
                    uint bamBytesTotal=0;//estimate of the total size of all bam records, for output buffering
                    uint mateChr,mateStart;
                    uint8_t mateStrand;
                    for (int itr=0;itr<(int)chimN;itr++) {//generate bam for all chimeric pieces
                        if (chimType==2) {//PE, encompassing
                            mateChr=trChim[1-itr].Chr;
                            mateStart=trChim[1-itr].exons[0][EX_G];
                            mateStrand=(uint8_t) (trChim[1-itr].Str!=trChim[1-itr].exons[0][EX_iFrag]);
                            alignType=-1;
                        } else {//spanning chimeric alignment, could be PE or SE
                            mateChr=-1;mateStart=-1;mateStrand=0;//no need fot mate info unless this is the supplementary alignment
                            if (chimRepresent==itr) {
                                alignType=-1; //this is representative part of chimeric alignment, record is as normal; if encompassing chimeric junction, both are recorded as normal
                                bamIrepr=( (itr%2)==(trChim[itr].Str) ) ? bamN+1 : bamN;//this is the mate that is chimerically split
                            } else {//"supplementary" chimeric segment
                                alignType=( (itr%2)==(trChim[itr].Str) ) ? -12 : -11; //right:left chimeric junction
                                bamIsuppl=bamN;
                                if (chimType==1) {//PE alignment, need mate info for the suppl
                                    uint iex=0;
                                    for (;iex<trChim[chimRepresent].nExons-1;iex++) {
                                        if (trChim[chimRepresent].exons[iex][EX_iFrag]!=trChim[itr].exons[0][EX_iFrag]) {
                                            break;
                                        };
                                    };
                                    mateChr=trChim[chimRepresent].Chr;
                                    mateStart=trChim[chimRepresent].exons[iex][EX_G];
                                    mateStrand=(uint8_t) (trChim[chimRepresent].Str!=trChim[chimRepresent].exons[iex][EX_iFrag]);
                                };
                            };
                            
                        };    
                        
                        bamN+=alignBAM(trChim[itr], 1, 1, P->chrStart[trChim[itr].Chr],  mateChr, mateStart, mateStrand, \
                                        alignType, NULL, P->outSAMattrOrder, outBAMoneAlign+bamN, outBAMoneAlignNbytes+bamN);
                        bamBytesTotal+=outBAMoneAlignNbytes[0]+outBAMoneAlignNbytes[1];//outBAMoneAlignNbytes[1] = 0 if SE is recorded
                    };

                    //write all bam lines
                    for (int ii=0; ii<bamN; ii++) {//output all pieces
                        int tagI=-1;
                        if (ii==bamIrepr) {
                          tagI=bamIsuppl;
                        } else if (ii==bamIsuppl) {
                          tagI=bamIrepr;
                        };
                        if (tagI>=0) {
                            bam1_t *b;
                            b=bam_init1();
                            bam_read1_fromArray(outBAMoneAlign[tagI], b);
                            uint8_t* auxp=bam_aux_get(b,"NM");
                            uint32_t auxv=bam_aux2i(auxp);
                            string tagSA1="SAZ"+P->chrName[b->core.tid]+','+to_string((uint)b->core.pos+1) +',' + ( (b->core.flag&0x10)==0 ? '+':'-') + \
                                    ',' + bam_cigarString(b) + ',' + to_string((uint)b->core.qual) + ',' + to_string((uint)auxv) + ';' ;

                             memcpy( (void*) (outBAMoneAlign[ii]+outBAMoneAlignNbytes[ii]), tagSA1.c_str(), tagSA1.size()+1);//copy string including \0 at the end
                             outBAMoneAlignNbytes[ii]+=tagSA1.size()+1;
                             * ( (uint32*) outBAMoneAlign[ii] ) = outBAMoneAlignNbytes[ii]-sizeof(uint32);
                        };

                        if (P->outBAMunsorted) outBAMunsorted->unsortedOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], ii>0 ? 0 : bamBytesTotal);
                        if (P->outBAMcoord)    outBAMcoord->coordOneAlign(outBAMoneAlign[ii], outBAMoneAlignNbytes[ii], (iReadAll<<32) );
                    };
                };        
                
                
                for (uint iTr=0;iTr<chimN;iTr++) {//write all chimeric pieces to Chimeric.out.sam/junction
                    if (P->readNmates==2) {
                        outputTranscriptSAM(trChim[iTr], chimN, iTr, trChim[1-iTr].Chr, trChim[1-iTr].exons[0][EX_G], (int) (trChim[1-iTr].Str!=trChim[1-iTr].exons[0][EX_iFrag]), -1, NULL, &chunkOutChimSAM);
                    } else {
                        outputTranscriptSAM(trChim[iTr], chimN, iTr, -1, -1, -1, -1, NULL, &chunkOutChimSAM);
                    };                        
                };        
                //junction + SAMp
                chunkOutChimJunction << P->chrName[trChim[0].Chr] <<"\t"<< chimJ0 - P->chrStart[trChim[0].Chr]+1 <<"\t"<< (trChim[0].Str==0 ? "+":"-") \
                        <<"\t"<< P->chrName[trChim[1].Chr] <<"\t"<< chimJ1 - P->chrStart[trChim[1].Chr]+1 <<"\t"<< (trChim[1].Str==0 ? "+":"-") \
                        <<"\t"<< chimMotif <<"\t"<< chimRepeat0  <<"\t"<< chimRepeat1 <<"\t"<< readName+1 \
                        <<"\t"<< trChim[0].exons[0][EX_G] - P->chrStart[trChim[0].Chr]+1 <<"\t"<< outputTranscriptCIGARp(trChim[0]) \
                        <<"\t"<< trChim[1].exons[0][EX_G] - P->chrStart[trChim[1].Chr]+1 <<"\t"<<  outputTranscriptCIGARp(trChim[1]) <<"\n"; //<<"\t"<< trChim[0].exons[0][EX_iFrag]+1 --- no need for that, since trChim[0] is always on the first mate
            };
        };//chimeric score
    };//chimeric search
    return chimRecord;
};//END
Esempio n. 21
0
/*
 * Folds one BAM record into the family-size statistics held in *s.
 *
 * Records are rejected, in this order, when they are read 2 (not counted at
 * all), secondary/supplementary (n_flag_fail), below settings->minmq
 * (n_mq_fail), or below settings->minFM (n_fm_fail). A record whose FP tag
 * reads as 0 bumps n_fp_fail and is dropped only if settings->skip_fp_fail
 * is set. Everything that survives updates the FM/RV/NP sums and the
 * s->fm, s->rc and s->np count tables, plus the duplex (DR) totals.
 *
 * A missing FM tag (read as 0) is reported through LOG_EXIT.
 */
static inline void famstats_fm_loop(famstats_t *s, bam1_t *b, famstats_fm_settings_t *settings)
{
    // Read 2 is dropped without touching any counter: it carries the same
    // FM values as its read 1.
    if(b->core.flag & BAM_FREAD2) {
        return;
    }
    if(b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) {
        ++s->n_flag_fail;
        return;
    }
    if(b->core.qual < settings->minmq) {
        ++s->n_mq_fail;
        return;
    }

    // Pull the three integer tags up front; an absent tag falls back to
    // 0 (FM) or -1 (NP, RV).
    uint8_t *tag;
    tag = bam_aux_get(b, "FM");
    const int FM = (tag != nullptr) ? bam_aux2i(tag) : 0;
    tag = bam_aux_get(b, "NP");
    const int NP = (tag != nullptr) ? bam_aux2i(tag) : -1;
    tag = bam_aux_get(b, "RV");
    const int RV = (tag != nullptr) ? bam_aux2i(tag) : -1;

    if(UNLIKELY(FM == 0)) {
        LOG_EXIT("Missing required FM tag. Abort!\n");
    }
    if(FM < settings->minFM) {
        ++s->n_fm_fail;
        return;
    }
    if(bam_itag(b, "FP") == 0) {
        ++s->n_fp_fail;
        if(settings->skip_fp_fail) {
            return;
        }
    }
    ++s->n_pass;

    // The sums never see a negative RV; the s->rc table below still uses the
    // raw value (so a missing RV is tallied under key -1).
    const int rv_for_sums = (RV < 0) ? 0 : RV;

    if(FM > 1) {
        ++s->realfm_counts;
        s->realfm_sum += FM;
        s->realrc_sum += rv_for_sums;
    }
    ++s->allfm_counts;
    s->allfm_sum += FM;
    s->allrc_sum += rv_for_sums;

    // Count one observation of `key` in `table`: insert with a count of 1 on
    // first sight, otherwise increment. s->ki is left pointing at the slot.
    int put_status;
    auto tally = [&](khash_t(fm) *table, int key) {
        s->ki = kh_get(fm, table, key);
        if(s->ki == kh_end(table)) {
            s->ki = kh_put(fm, table, key, &put_status);
            kh_val(table, s->ki) = 1;
        } else {
            ++kh_val(table, s->ki);
        }
    };

    tally(s->fm, FM);
    tally(s->rc, RV);
    if(NP > 0) {
        tally(s->np, NP);
    }

    // Duplex totals are only touched when the DR tag is present and non-zero.
    uint8_t *const dr_tag = bam_aux_get(b, "DR");
    if(dr_tag != nullptr && bam_aux2i(dr_tag)) {
        s->dr_sum += FM;
        ++s->dr_counts;
        s->dr_rc_sum += rv_for_sums;
        s->dr_rc_frac_sum += (double)rv_for_sums / FM;
    }
}
Esempio n. 22
0
/**
 * Builds a statistics record for a single BAM alignment.
 *
 * @param bam1  alignment to inspect
 * @param opts  options; when opts->region_table is set, only alignments
 *              for which find_region() succeeds are reported
 * @return a record obtained from bam_stats_new(), or NULL when a region
 *         table is configured and the alignment falls outside it.
 *
 * Unmapped alignments return immediately with only `mapped = 0` set, so the
 * rest of the function deals exclusively with mapped reads.
 */
bam_stats_t *bam1_stats(bam1_t *bam1, bam_stats_options_t *opts) {

  bam_stats_t *bam_stats = NULL;
  uint32_t bam_flag = (uint32_t) bam1->core.flag;

  if (bam_flag & BAM_FUNMAP) {
    // not mapped, then return
    bam_stats = bam_stats_new();
    bam_stats->mapped = 0;
    return bam_stats;
  }

  if (opts->region_table) {
    region_t region;
    region.chromosome = opts->sequence_labels[bam1->core.tid];
    region.start_position = bam1->core.pos;
    region.end_position = region.start_position + bam1->core.l_qseq;
    region.strand = NULL;
    region.type = NULL;

    if (!find_region(&region, opts->region_table)) {
      return NULL;
    }
  }
  bam_stats = bam_stats_new();

  // mapped !!
  bam_stats->mapped = 1;

  bam_stats->strand = (int) ((bam_flag & BAM_FREVERSE) > 0);

  // number of errors: NM is an optional tag, so never hand a NULL lookup
  // result to bam_aux2i(); a missing tag counts as zero errors
  uint8_t *nm_tag = bam_aux_get(bam1, "NM");
  bam_stats->num_errors = nm_tag ? bam_aux2i(nm_tag) : 0;

  // cigar handling: number of indels and length
  uint32_t cigar_int, *cigar = bam1_cigar(bam1);
  int num_cigar_ops = (int) bam1->core.n_cigar;
  for (int j = 0; j < num_cigar_ops; j++) {
    cigar_int = cigar[j];
    switch (cigar_int & BAM_CIGAR_MASK) {
    case BAM_CINS:  //I: insertion to the reference
    case BAM_CDEL:  //D: deletion from the reference
      bam_stats->num_indels++;
      bam_stats->indels_length += (cigar_int >> BAM_CIGAR_SHIFT);
      break;
    default:
      break;
    }
  }

  // quality
  bam_stats->quality = bam1->core.qual;

  // unique alignment
  if (!(bam_flag & BAM_FSECONDARY)) {
    bam_stats->unique_alignment = 1;
  }

  // handling pairs
  // BAM_FUNMAP is known to be clear here (see the early return above), so
  // only the mapped_pair_* counters can apply.
  bam_stats->single_end = 1;
  if (bam_flag & BAM_FPAIRED) {
    bam_stats->single_end = 0;
    if (bam_flag & BAM_FREAD1) {
      bam_stats->mapped_pair_1 = 1;
    } else {
      bam_stats->mapped_pair_2 = 1;
    }

    if (!(bam_flag & BAM_FMUNMAP) && (bam_flag & BAM_FPROPER_PAIR)) {
      bam_stats->isize = abs(bam1->core.isize);
    }
  }

  // mapping length
  // the packed sequence is raw bytes, so keep it unsigned
  uint8_t *bam_seq = bam1_seq(bam1);
  int seq_len = bam1->core.l_qseq;
  bam_stats->seq_length = seq_len;

  // nucleotide content (4-bit codes: 1=A, 2=C, 4=G, 8=T, 15=N)
  for (int i = 0; i < seq_len; i++) {
    switch (bam1_seqi(bam_seq, i)) {
    case 1:
      bam_stats->num_As++;
      break;
    case 2:
      bam_stats->num_Cs++;
      break;
    case 4:
      bam_stats->num_Gs++;
      break;
    case 8:
      bam_stats->num_Ts++;
      break;
    case 15:
      bam_stats->num_Ns++;
      break;
    default:
      break;
    }
  }
  bam_stats->num_GCs = bam_stats->num_Gs + bam_stats->num_Cs;

  return bam_stats;
}
Esempio n. 23
0
/**
 * Prints a per-read profile of a BAM file to stdout as a tab-separated table
 * (line, supp, secondary, mapQ, mismatches, matches, qLen, pcId, pcAln).
 *
 * @param bamFile                    path of the BAM file to read
 * @param ignoreSuppAlignments       non-zero: skip records flagged supplementary
 * @param ignoreSecondaryAlignments  non-zero: skip records flagged secondary
 *
 * Open/header/read failures are reported on stderr; the function returns
 * nothing.
 */
void profileReads(char* bamFile,
                  int ignoreSuppAlignments,
                  int ignoreSecondaryAlignments) {
    //
    int result = -1;

    // flag bits that cause a record to be skipped entirely
    int supp_check = 0x0;
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // helper variables
    BGZF* in = 0 ;
    bam1_t *b = bam_init1();
    bam_hdr_t *h;

    // open bam
    if ((in = bgzf_open(bamFile, "r")) == 0) {
        fprintf(stderr,
               "ERROR: Failed to open \"%s\" for reading.\n",
               bamFile);
    }
    else if ((h = bam_hdr_read(in)) == 0) { // read header
        fprintf(stderr,
                "ERROR: Failed to read BAM header of file \"%s\".\n",
                bamFile);
    }
    else {
        // the header only has to be consumed to reach the records
        bam_hdr_destroy(h);

        int line = 0;

        int supplementary, secondary;
        int mapQual;
        int matches, mismatches, qLen;
        float pcAln, pcId;
        int showStats = 0;
        uint8_t *aux_mismatches;

        // print header
        printf("line\tsupp\tsecondary\tmapQ\tmismatches\tmatches\tqLen\tpcId\tpcAln\n");

        // fetch alignments
        while ((result = bam_read1(in, b)) >= 0) {
            line += 1;

            // only primary mappings
            if ((b->core.flag & supp_check) != 0) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, non-primary\n", line);
                continue;
            }
            // Test exactly the supplementary / secondary bits. OR-ing in 0x1
            // (the "paired" bit) would report every paired read as both.
            supplementary = (b->core.flag & BAM_FSUPPLEMENTARY) != 0;
            secondary = (b->core.flag & BAM_FSECONDARY) != 0;
            // quality
            mapQual = b->core.qual;
            // bam_aux_get returns 0 if optional NM tag is missing
            if ((aux_mismatches = bam_aux_get(b, "NM")))
                mismatches = bam_aux2i(aux_mismatches);
            else
                mismatches = 0;
            // length
            qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b));
            // percent identity
            // (float division: a record with matches == 0 prints nan here)
            matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b));
            pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1
            // percent alignment
            pcAln = matches / (float)qLen; // percentage as float between 0 to 1

            // print read values
            printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\t%.4f\n",
                   line, supplementary, secondary, mapQual, mismatches, matches,
                   qLen, pcId, pcAln);
        }
        // the loop ends on any negative result; anything below -1 is
        // reported as a read error
        if (result < -1) {
            fprintf(stderr,
                    "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n",
                    line, bamFile, result);
        }
    }
    if (in) bgzf_close(in);
    bam_destroy1(b);
}
Esempio n. 24
0
/*
 * Callback invoked once per alignment: records the alignment as one
 * sub-structure of the transcript named by the read's query name.
 *
 * data is a fetch_data_t*. When d->requestedTranscripts is non-empty, reads
 * whose name is not in it are ignored. The first alignment seen for a name
 * creates a new YTranscript (with heap-allocated copies of the reference
 * name, read name, YG gene name and YT status) and registers it in
 * d->transcriptNames and d->transcripts; later alignments only append their
 * sub-structure.
 *
 * Always returns 0.
 */
static int fetch_func(const bam1_t *b, void *data)
{
    fetch_data_t *d = (fetch_data_t*) data;

    char *name = bam1_qname(b);

    //check if name is requested here
    if(!d->requestedTranscripts->empty()) {
        //we're doing transcript filtering
       if(d->requestedTranscripts->find(name) == d->requestedTranscripts->end()) {
           //transcript wasn't requested
           return 0;
       }
    }

    fprintf(stderr,"%s\n",name);
    //TODO Desparately need some error checking on flag retrieval
    //     (the YT / YG / HI / IH lookups below are used without a NULL check)
    char *status = bam_aux2Z(bam_aux_get(b,"YT"));
    int length = bam_aux2i(bam_aux_get(b,"HI"));

    YTranscript* transcript;

    YTranscriptSubStructure structure;
    structure.position = b->core.pos + 1;   // stored as pos + 1
    structure.length = b->core.l_qseq;
    structure.ordinal = bam_aux2i(bam_aux_get(b,"HI"));

    if(d->transcriptNames.find(name,&transcript)) {
        //then we've already found some part of this transcript
        transcript->orderedStructures.push_back(structure);
    }
    else {
        transcript = new YTranscript;

        char *tidName = d->in->header->target_name[b->core.tid];
        char *refName = new char[ strlen(tidName) + 1];
        strcpy(refName, tidName);

        char *transcriptName = new char[ strlen(name) + 1];
        strcpy(transcriptName, name);

        char *flagGeneName = bam_aux2Z(bam_aux_get(b,"YG"));
        char *geneName = new char[ strlen(flagGeneName) + 1];
        strcpy(geneName, flagGeneName);

        // Room for the string plus its terminator. The "+ 1" must sit outside
        // strlen(): strlen(status + 1) measures from the second character and
        // under-allocates by two bytes, so the strcpy below would overflow.
        char *statusName = new char[ strlen(status) + 1 ];
        strcpy(statusName, status);

        transcript->gene = geneName;
        transcript->name = transcriptName;
        transcript->refName = refName;
        transcript->status = statusName;
        transcript->orderedStructures.push_back(structure);
        transcript->strand = b->core.flag & BAM_FREVERSE ? -1 : 1;
        transcript->totalNumberOfStructures = bam_aux2i(bam_aux_get(b, "IH"));
        transcript->length = length;

        d->transcriptNames.insert(name,transcript);
        d->transcripts->push_back(transcript);

    }
    return 0;
}
Esempio n. 25
0
/**
 * Copies the records of inBamFile that pass every filter into outBamFile.
 *
 * @param inBamFile                  BAM file to read
 * @param outBamFile                 BAM file to create
 * @param minMapQual                 reject records with mapping quality below this
 * @param minLen                     reject records whose CIGAR query length is below this
 * @param maxMisMatches              reject records whose NM value exceeds this
 * @param minPcId                    reject records with (matches - NM) / matches below this
 * @param minPcAln                   reject records with matches / query length below this
 * @param ignoreSuppAlignments       non-zero: drop records flagged supplementary
 * @param ignoreSecondaryAlignments  non-zero: drop records flagged secondary
 *
 * A record without an NM tag is treated as having zero mismatches.
 * Open/header/read/write failures are reported on stderr; the function
 * returns nothing.
 */
void filterReads(char * inBamFile,
                 char * outBamFile,
                 int minMapQual,
                 int minLen,
                 int maxMisMatches,
                 float minPcId,
                 float minPcAln,
                 int ignoreSuppAlignments,
                 int ignoreSecondaryAlignments) {
    //
    int result = -1;
    int outResult = -1;

    // flag bits that cause a record to be dropped
    int supp_check = 0x0;
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // helper variables
    BGZF* in = 0;
    BGZF* out = 0;
    bam1_t *b = bam_init1();
    bam_hdr_t *h;

    // open bam
    if ((in = bgzf_open(inBamFile, "r")) == 0) {
        fprintf(stderr,
               "ERROR: Failed to open \"%s\" for reading.\n",
               inBamFile);
    }
    else if ((h = bam_hdr_read(in)) == 0) { // read header
        fprintf(stderr,
                "ERROR: Failed to read BAM header of file \"%s\".\n",
                inBamFile);
    }
    else if ((out = bgzf_open(outBamFile, "w")) == 0) {
        fprintf(stderr,
               "ERROR: Failed to open \"%s\" for writing.\n",
               outBamFile);
        // the header was read successfully above; release it on this path too
        bam_hdr_destroy(h);
    }
    else {
        // write and destroy header
        bam_hdr_write(out, h);
        bam_hdr_destroy(h);

        int line = 0;
        int matches, mismatches, qLen;
        float pcAln, pcId;
        int showStats = 0;
        uint8_t *aux_mismatches;

        // fetch alignments
        while ((result = bam_read1(in, b)) >= 0) {
            line += 1;

            // only primary mappings
            if ((b->core.flag & supp_check) != 0) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, non-primary\n", line);
                continue;
            }

            // only high quality
            if (b->core.qual < minMapQual) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual);
                continue;
            }

            // not too many absolute mismatches
            // bam_aux_get returns 0 if the optional NM tag is missing; do not
            // pass that on to bam_aux2i
            if ((aux_mismatches = bam_aux_get(b, "NM")))
                mismatches = bam_aux2i(aux_mismatches);
            else
                mismatches = 0;
            if (mismatches > maxMisMatches) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches);
                continue;
            }

            // not too short
            qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b));
            if (qLen < minLen) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, length: %d\n", line, qLen);
                continue;
            }

            // only high percent identity
            matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b));
            pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1
            if (pcId < minPcId) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId);
                continue;
            }

            // only high percent alignment
            pcAln = matches / (float)qLen; // percentage as float between 0 to 1
            if (pcAln < minPcAln) {
                if (showStats)
                    fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln);
                continue;
            }

            // any negative return is a write failure (-1 included)
            if ((outResult = bam_write1(out, b)) < 0) {
                fprintf(stderr,
                        "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n",
                        line, outBamFile, outResult);
            }
        }
        // the loop ends on any negative result; anything below -1 is
        // reported as a read error
        if (result < -1) {
            fprintf(stderr,
                    "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n",
                    line, inBamFile, result);
        }
    }
    if (in) bgzf_close(in);
    if (out) bgzf_close(out);
    bam_destroy1(b);
}
Esempio n. 26
0
BM_mappedRead * extractReads(char * bamFile,
                             char ** contigs,
                             int numContigs,
                             uint16_t * groups,
                             char * prettyName,
                             int headersOnly,
                             int minMapQual,
                             int maxMisMatches,
                             int ignoreSuppAlignments,
                             int ignoreSecondaryAlignments) {
    //-----
    // code uses the pattern outlined in samtools view (sam_view.c)
    // thanks lh3!
    //
    int i = 0;
    int result = -1;
    int hh = 0;

    int supp_check = 0x0; // include supp mappings
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // we need to let the users know if their pairings
    // will be corrupted
    int p_corrupt = 0;

    // helper variables
    samFile *in = 0;
    bam_hdr_t *header = NULL;
    bam1_t *b = bam_init1();

    BM_mappedRead * root = 0;
    BM_mappedRead * prev = 0;

    // open file handlers
    if ((in = sam_open(bamFile, "r")) == 0) {
        fprintf(stderr,
                "ERROR: Failed to open \"%s\" for reading.\n",
                bamFile);
    }
    else {
        // retrieve the header
        if ((header = sam_hdr_read(in)) == 0) {
            fprintf(stderr,
                    "ERROR: Failed to read the header from \"%s\".\n",
                    bamFile);
        }
        else {
            // check the index is intact
            hts_idx_t *idx = sam_index_load(in, bamFile); // load index
            if (idx == 0) { // index is unavailable
                fprintf(stderr,
                        "ERROR: Random retrieval only works "\
                        "for indexed files.\n");
            }
            else {
                cfuhash_table_t *pair_buffer = \
                    cfuhash_new_with_initial_size(1000000);
                cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS);

                for (hh = 0; hh < numContigs; ++hh) {
                    // parse a region in the format like `chr2:100-200'
                    hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]);
                    if (iter == NULL) { // reference name is not found
                        fprintf(stderr,
                                "WARNING: Could not find contig: "\
                                "[%s] in BAM: [%s].\n",
                                contigs[hh],
                                bamFile);
                    }

                    // fetch alignments
                    int line = 0;
                    while ((result = sam_itr_next(in, iter, b)) >= 0) {
                        bam1_core_t core = b->core;
                        line += 1;
                        // only high quality?, primary? mappings
                        if ( core.qual < minMapQual)
                            continue;
                        if ((core.flag & supp_check) != 0)
                            continue;
                        if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) {
                            continue;
                        }

                        char * seqId = bam_get_qname(b);
                        char * seq = 0;
                        char * qual = 0;
                        int qual_len = 0;
                        int seq_len = 0;

                        // get sequence and quality
                        if(0 == headersOnly) {
                            // no point allocating unused space
                            seq = calloc(core.l_qseq+1, sizeof(char));
                            qual = calloc(core.l_qseq+1, sizeof(char));
                            uint8_t *s = bam_get_seq(b);
                            if (core.flag&BAM_FREVERSE) {
                                // reverse the read
                                int r = 0;
                                for (i = core.l_qseq-1; i >=0 ; --i) {
                                    seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s,
                                                                       i)];
                                    ++r;
                                }
                            }
                            else {
                                for (i = 0; i < core.l_qseq; ++i) {
                                    seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s,
                                                                       i)];
                                }
                            }
                            seq_len = core.l_qseq;

                            s = bam_get_qual(b);
                            if (s[0] != 0xff) {
                                qual_len = core.l_qseq;
                                for (i = 0; i < core.l_qseq; ++i) {
                                    qual[i] = (char)(s[i] + 33);
                                }
                            }
                            else if (qual != 0) {
                                free(qual);
                                qual = 0;
                            }
                        }

                        // work out pairing information
                        uint8_t rpi = RPI_ERROR;
                        if (core.flag&BAM_FPAIRED) {
                            if(core.flag&BAM_FMUNMAP) {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_SNGL_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SNGL_SEC;
                                }
                            }
                            else {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SEC;
                                }
                            }
                        }
                        else {
                            rpi = RPI_SNGL;
                        }

                        // make the funky Id
                        #define MAX_SEQ_ID_LEN 80
                        char * seq_id = calloc(MAX_SEQ_ID_LEN,
                                               sizeof(char));
                        // allocate the string to the buffer but check to
                        // ensure we're not cutting anything off
                        int id_len = snprintf(seq_id,
                                              MAX_SEQ_ID_LEN,
                                              "b_%s;c_%s;r_%s",
                                              prettyName,
                                              contigs[hh],
                                              seqId);
                        if(id_len >= MAX_SEQ_ID_LEN) {
                            seq_id = calloc(id_len+1, sizeof(char));
                            snprintf(seq_id,
                                     id_len+1, // don't forget the NULL!
                                     "b_%s;c_%s;r_%s",
                                     prettyName,
                                     contigs[hh],
                                     seqId);
                        }

                        // make the mapped read struct
                        prev = makeMappedRead(seq_id,
                                              seq,
                                              qual,
                                              id_len,
                                              seq_len,
                                              qual_len,
                                              rpi,
                                              groups[hh],
                                              prev);

                        if (0 == root) { root = prev; }

                        if(rpi == RPI_SNGL || \
                           rpi == RPI_SNGL_FIR || \
                           rpi == RPI_SNGL_SEC) {
                            // we can just add away
                            // indicate singleton reads by pointing the
                            // partner pointer to itself
                            prev->partnerRead = prev;
                        }
                        else {
                            // RPI_FIR or RPI_SEC
                            // work out pairing information using the hash
                            // we append a 1 or 2 to the end so that
                            // we don't accidentally pair 1's with 1's etc.
                            char * stripped_result;
                            if(rpi == RPI_FIR) {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '2');
                            }
                            else {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '1');
                            }

                            char * stripped = seqId;
                            if(stripped_result)
                                stripped = stripped_result;

                            //fprintf(stdout, "SEARCH %s\n", stripped);
                            // now stripped always holds a stripped value
                            // see if it is in the hash already
                            BM_mappedRead * stored_MR = \
                                cfuhash_get(pair_buffer,
                                            stripped);

                            if (0 != stored_MR) {
                                // exists in the hash -> Add the pair info
                                if(rpi == RPI_FIR) {
                                    prev->partnerRead = stored_MR;
                                }
                                else {
                                    stored_MR->partnerRead = prev;
                                }

                                // delete the entry from the hash
                                cfuhash_delete(pair_buffer,
                                               stripped);
                            }
                            else {
                                // we should put it in the hash
                                // make sure to change it into something
                                // we will find next time
                                if(rpi == RPI_FIR)
                                    stripped[strlen(stripped)-1] = '1';
                                else
                                    stripped[strlen(stripped)-1] = '2';

                                // check to make sure we're not overwriting
                                // anything important. cfuhash overwrites
                                // duplicate entries, so we need to grab
                                // it and put it to "SNGL_XXX" before we
                                // lose the pointer
                                BM_mappedRead * OWMMR = \
                                    cfuhash_put(pair_buffer,
                                                stripped, prev);
                                if(OWMMR) {
                                    if(OWMMR->rpi == RPI_FIR)
                                        OWMMR->rpi = RPI_SNGL_FIR;
                                    else
                                        OWMMR->rpi = RPI_SNGL_SEC;
                                    OWMMR->partnerRead = OWMMR;
                                    printPairCorruptionWarning(p_corrupt);
                                    p_corrupt = 1;
                                }


                            }

                            if(stripped_result != 0) { // free this!
                                free(stripped_result);
                                stripped_result = 0;
                            }
                        }
                    }
                    hts_itr_destroy(iter);
                    if (result < -1) {
                        fprintf(stderr, "ERROR: retrieval of reads from "\
                                        "contig:  \"%s\" failed due to "\
                                        "truncated file or corrupt BAM index "\
                                        "file\n", header->target_name[hh]);
                        break;
                    }
                }

                // any entries left in the hash are pairs whose mates did
                // not meet quality standards
                size_t key_size = 0;
                char * key;
                BM_mappedRead * LOMMR;
                size_t pr_size = 1;
                if(cfuhash_each_data(pair_buffer,
                                     (void**)&key,
                                     &key_size,
                                     (void**)&LOMMR,
                                     &pr_size)) {
                    do {
                        // get the mapped read
                        // update it's pairing so we know it's really single
                        if (LOMMR->rpi == RPI_FIR)
                            LOMMR->rpi = RPI_SNGL_FIR;
                        else if (LOMMR->rpi == RPI_SEC)
                            LOMMR->rpi = RPI_SNGL_SEC;

                        // indicate singleton reads by pointing the
                        // partner pointer to itself
                        LOMMR->partnerRead = LOMMR;

                    } while(cfuhash_next_data(pair_buffer,
                                              (void**)&key,
                                              &key_size,
                                              (void**)&LOMMR,
                                              &pr_size));
                }

                cfuhash_clear(pair_buffer);
                cfuhash_destroy(pair_buffer);
            }
            hts_idx_destroy(idx); // destroy the BAM index
        }
    }
    // always do this
    if (in) sam_close(in);
    bam_destroy1(b);
    if ( header ) bam_hdr_destroy(header);

    return root;
}
Esempio n. 27
0
/**
 * Split a list of BAM alignments into "passed" and "failed" lists.
 *
 * Each alignment in bam1s is tested against the filters below, in order,
 * and is inserted into failed_bam1s at the first filter it does not
 * satisfy; alignments that satisfy all of them go to passed_bam1s:
 *
 *   1. unmapped reads (BAM_FUNMAP set) always fail;
 *   2. if opts->unique is set, reads flagged BAM_FSECONDARY fail;
 *   3. if opts->proper_pairs is set, paired reads (BAM_FPAIRED) that are
 *      not flagged BAM_FPROPER_PAIR fail;
 *   4. core.l_qseq must lie in [opts->min_length, opts->max_length];
 *   5. core.qual must lie in [opts->min_quality, opts->max_quality];
 *   6. the integer value of the "NM" aux tag must lie in
 *      [opts->min_num_errors, opts->max_num_errors]; a record without an
 *      NM tag is treated as having 0 errors;
 *   7. if opts->region_table is set, core.tid must index into the table's
 *      chromosome ordering and the interval
 *      [core.pos, core.pos + core.l_qseq] must be accepted by
 *      find_exact_region().
 *
 * The alignments themselves are not copied or modified; only the pointers
 * are inserted into the output lists.
 *
 * @param bam1s         input list of bam1_t pointers
 * @param passed_bam1s  output list receiving alignments that pass
 * @param failed_bam1s  output list receiving alignments that fail
 * @param opts          filter thresholds and optional region table
 */
void bam_filter(array_list_t *bam1s, array_list_t *passed_bam1s,
		array_list_t *failed_bam1s, bam_filter_options_t *opts) {

  bam1_t *bam1;
  uint32_t bam_flag;
  int value;
  uint8_t *nm_tag;

  region_t region;
  // initialised so that no path can ever read an indeterminate value;
  // they are only meaningful when a region table is present
  char **chromosomes = NULL;
  int num_chromosomes = 0;
  if (opts->region_table) {
    chromosomes = opts->region_table->ordering;
    num_chromosomes = opts->region_table->max_chromosomes;
    region.strand = NULL;
    region.type = NULL;
  }

  size_t num_items = array_list_size(bam1s);

  for (size_t i = 0; i < num_items; i++) {

    bam1 = array_list_get(i, bam1s);
    bam_flag = (uint32_t) bam1->core.flag;

    // if not mapped, it fails unconditionally
    if (bam_flag & BAM_FUNMAP) {
      array_list_insert(bam1, failed_bam1s);
      continue;
    }

    // unique: reject secondary alignments
    if (opts->unique && (bam_flag & BAM_FSECONDARY)) {
      array_list_insert(bam1, failed_bam1s);
      continue;
    }

    // proper pairs: only paired reads are subject to this check
    if (opts->proper_pairs && (bam_flag & BAM_FPAIRED)) {
      if ( !(bam_flag & BAM_FPROPER_PAIR)) {
        array_list_insert(bam1, failed_bam1s);
        continue;
      }
    }

    // length
    value = bam1->core.l_qseq;
    if (value < opts->min_length || value > opts->max_length) {
      array_list_insert(bam1, failed_bam1s);
      continue;
    }

    // quality
    value = bam1->core.qual;
    if (value < opts->min_quality || value > opts->max_quality) {
      array_list_insert(bam1, failed_bam1s);
      continue;
    }

    // num. errors: bam_aux_get() returns NULL when the record carries no
    // NM tag, and that pointer must not be handed to bam_aux2i(); a
    // missing tag is counted as 0 errors
    nm_tag = bam_aux_get(bam1, "NM");
    value = nm_tag ? bam_aux2i(nm_tag) : 0;
    if (value < opts->min_num_errors || value > opts->max_num_errors) {
      array_list_insert(bam1, failed_bam1s);
      continue;
    }

    // region
    if (opts->region_table) {
      int seq_id = bam1->core.tid;
      if (seq_id >= 0 && seq_id < num_chromosomes) {

        region.chromosome = chromosomes[seq_id];
        region.start_position = bam1->core.pos;
        region.end_position = region.start_position + bam1->core.l_qseq;

        if (!find_exact_region(&region, opts->region_table)) {
          array_list_insert(bam1, failed_bam1s);
          continue;
        }

      } else {
        // target id outside the table's chromosome ordering
        array_list_insert(bam1, failed_bam1s);
        continue;
      }
    }

    // finally, this bam1 passed all the filters,
    // insert it in the output list
    array_list_insert(bam1, passed_bam1s);
  }
}