Пример #1
0
/*
  Read the BAM reader options from the parsed command line.

  Every option is fetched with angsd::getArg(name, current_value, arguments)
  and the result is stored back into the same file-level setting (or into the
  matching field of 'arguments'), so the value passed in is presumably what is
  kept when the option is absent -- confirm against angsd::getArg.

  Besides the plain lookups the function:
    - builds the read-group filter 'rghash' from the "+RG" value: the value is
      loaded with add_read_groups_file() when angsd::fexists() reports it as an
      existing file, otherwise it is handed to add_read_group_single();
    - translates the user-facing -baq value (1 or 2) into the internal flag
      (1 or 3) and ORs in 4 when -redo-baq is 1;
    - seeds srand48() when -seed is non-zero;
    - refuses -C and -baq unless -ref was supplied;
    - parses every region from -rf and -r with parse_region() and appends the
      result to arguments->regions;
    - calls printArg() for arguments->argumentFile and releases the -r/-rf
      strings.

  Fatal problems (unsupported -baq value, missing -ref, unparsable region)
  print a message to stderr and terminate the process with exit status 1, so
  that calling shells and pipelines can see the failure.
*/
void setArgsBam(argStruct *arguments){
  int seed=0;
  remove_bads = angsd::getArg("-remove_bads",remove_bads,arguments);
  uniqueOnly = angsd::getArg("-uniqueOnly",uniqueOnly,arguments);
  only_proper_pairs = angsd::getArg("-only_proper_pairs",only_proper_pairs,arguments);
  fai_fname = angsd::getArg("-f",fai_fname,arguments);
  minMapQ = angsd::getArg("-minMapQ",minMapQ,arguments);
  cigstat = angsd::getArg("-cigstat",cigstat,arguments);
  minQ = angsd::getArg("-minQ",minQ,arguments);
  downSample = angsd::getArg("-downSample",downSample,arguments);
  seed = angsd::getArg("-seed",seed,arguments);
  trim = angsd::getArg("-trim",trim,arguments);
  trim5 = angsd::getArg("-trim5",trim5,arguments);
  trim3 = angsd::getArg("-trim3",trim3,arguments);
  arguments->ref = angsd::getArg("-ref",arguments->ref,arguments);
  arguments->anc = angsd::getArg("-anc",arguments->anc,arguments);

  // The read-group value is either a file to load or a single read-group name.
  rghash_name = angsd::getArg("+RG",rghash_name,arguments);
  if(rghash_name){
    if(angsd::fexists(rghash_name))
      rghash = add_read_groups_file(rghash_name);
    else
      rghash = add_read_group_single(rghash_name);
  }
  if(rghash)
    fprintf(stderr,"\t-> [READGROUP info] Number of readgroups to include: %d\n",khash_str2int_size(rghash));

  adjustMapQ = angsd::getArg("-C",adjustMapQ,arguments);
  baq = angsd::getArg("-baq",baq,arguments);
  redo_baq = angsd::getArg("-redo-baq",redo_baq,arguments);
  if(baq){
    // User value 1 is kept as 1, user value 2 becomes 3; anything else is rejected.
    if(baq==2)
      baq=3;
    else if(baq!=1){
      fprintf(stderr,"\t-> only supported options for -baq is: 1 (normal baq) and 2 (extended baq (SAMtools default)). Value supplied:%d\n",baq);
      exit(1);
    }
    // -redo-baq adds bit 4 on top of the selected baq mode.
    if(redo_baq==1)
      baq |=4;
  }

  regfile = angsd::getArg("-r",regfile,arguments);
  regfiles = angsd::getArg("-rf",regfiles,arguments);
  MAX_SEQ_LEN = angsd::getArg("-setMinChunkSize",MAX_SEQ_LEN,arguments);
  checkBamHeaders = angsd::getArg("-checkBamHeaders",checkBamHeaders,arguments);
  doCheck = angsd::getArg("-doCheck",doCheck,arguments);
  MPLP_IGNORE_RG = angsd::getArg("--ignore-RG",MPLP_IGNORE_RG,arguments);
  arguments->nReads = angsd::getArg("-nReads",arguments->nReads,arguments);
  arguments->show = angsd::getArg("-show",arguments->show,arguments);
  if(regfile && regfiles)
    fprintf(stderr,"\t-> WARNING both -r and -rf has been set \n");

  if(seed)
    srand48(seed);

  // Look up -ref again starting from NULL, so that a NULL result can only mean
  // that no reference was obtained from the arguments.
  char *tmp = NULL;
  tmp = angsd::getArg("-ref",tmp,arguments);
  if(tmp==NULL && adjustMapQ!=0){
    fprintf(stderr,"\t-> Must also supply -ref for adjusting the mapping quality\n");
    exit(1);
  }
  if(tmp==NULL && baq!=0){
    fprintf(stderr,"\t-> Must also supply -ref for adjusting base qualities (baq)\n");
    exit(1);
  }
  free(tmp);

  // Collect the raw region strings: first those from -rf, then the one from -r.
  std::vector<char *> regionsRaw;
  if(regfiles)
    regionsRaw = angsd::getFilenames(regfiles,0);

  if(regfile)
    regionsRaw.push_back(strdup(regfile));

  for(size_t i=0;i<regionsRaw.size();i++){
    regs tmpRegs;
    // A region is rejected when parse_region() fails or when stop lies before start.
    if(parse_region(regionsRaw[i],arguments->hd,tmpRegs.refID,tmpRegs.start,tmpRegs.stop,arguments->revMap)<0||tmpRegs.stop<tmpRegs.start){
      fprintf(stderr,"[%s] Problems with indexing: %s\n",__FUNCTION__,regionsRaw[i]);
      exit(1);
    }
    arguments->regions.push_back(tmpRegs);
    free(regionsRaw[i]);
  }

  printArg(arguments->argumentFile,arguments);

  if(regfile)
    free(regfile);
  if(regfiles)
    free(regfiles);

}
Пример #2
0
/*
    Register one BAM file with the sample table and map its read groups to
    samples.

    Parameters:
        bsmpl   - sample table; a new file_t slot is appended to bsmpl->files
        bam_hdr - header text of the BAM, or NULL when there is none. The
                  buffer is modified temporarily while parsing (NUL terminators
                  are patched in and restored afterwards), hence non-const.
        fname   - file name; a copy is stored in the new slot

    Returns the index of the new slot in bsmpl->files, or -1 when the file was
    ignored because no suitable read group is available in it.

    The logic of this function is a bit complicated because we want to work
    also with broken bams containing read groups that are not listed in the
    header. The desired behavior is as follows:
        - when -G is given, read groups which are not listed in the header must
          be given explicitly using the "?" symbol in -G.
          Otherwise:
        - if the bam has no header, all reads in the file are assigned to a
          single sample named after the file
        - if there is at least one sample defined in the header, reads with no
          read group id or with a read group id not listed in the header are
          assigned to the first sample encountered in the header
*/
int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
    // Append a zero-initialised slot for this file.
    // NOTE(review): the realloc result is not checked; on failure the old
    // array pointer is lost and the memset below dereferences NULL.
    bsmpl->nfiles++;
    bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
    file_t *file = &bsmpl->files[bsmpl->nfiles-1];
    memset(file,0,sizeof(file_t));
    file->fname  = strdup(fname);
    file->default_idx = -1;

    if ( bsmpl->ignore_rg || !bam_hdr )
    {
        // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
        bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
        return bsmpl->nfiles-1;
    }

    // bam_smpls collects the distinct sample names seen in this header;
    // first_smpl is filled from bsmpl->name2idx for the first @RG line handled;
    // nskipped counts the read groups that were rejected.
    void *bam_smpls = khash_str2int_init();
    int first_smpl = -1, nskipped = 0;
    const char *p = bam_hdr, *q, *r;
    // Walk over every "@RG" occurrence; p becomes NULL once the last line has
    // no trailing newline, which ends the loop.
    while (p != NULL && (q = strstr(p, "@RG")) != 0)
    {
        char *eol = strchr(q + 3, '\n');
        if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
            p = eol;
            continue;
        }
        p = q + 3;
        // Locate the ID and SM values of this record.
        // NOTE(review): both searches run from p to the end of the header
        // string, not just to eol, so a tag missing on this line can be
        // matched on a later line -- confirm whether this is intended.
        if ((q = strstr(p, "\tID:")) != 0) q += 4;
        if ((r = strstr(p, "\tSM:")) != 0) r += 4;
        if (r && q)
        {
            char *u, *v;
            int ioq, ior;
            // Find the end of each value (tab, newline or end of string) and
            // terminate it in place, remembering the overwritten characters
            // so the header can be restored below.
            for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
            for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
            ioq = *u; ior = *v; *u = *v = '\0';

            // q now points to a null terminated read group id
            // r points to a null terminated sample name
            if ( !strcmp("*",q) || !strcmp("?",q) )
                error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);

            int accept_rg = 1;
            if ( bsmpl->sample_list )
            {
                // restrict samples based on the -s/-S options
                char *name = khash_str2str_get(bsmpl->sample_list,r);
                // sample_logic==0: accept only samples that are NOT in the list.
                // Otherwise: accept only listed samples and continue with the
                // value stored for them in the list instead of the header name.
                if ( bsmpl->sample_logic==0 )
                    accept_rg = name ? 0 : 1;
                else if ( !name )
                    accept_rg = 0;
                else
                    r = name;
            }
            if ( accept_rg && bsmpl->rg_list )
            {
                // restrict readgroups based on the -G option, possibly renaming the sample
                accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
            }
            if ( accept_rg )
                bsmpl_add_readgroup(bsmpl,file,q,r);
            else
            {
                bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
                nskipped++;
            }

            // Look up the index of the first sample in bsmpl->name2idx.
            // Presumably first_smpl is left at -1 when r is not a known
            // sample (e.g. the read group was rejected) -- TODO confirm.
            if ( first_smpl<0 )
                khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
            // Record the sample name once; the hash owns the strdup'ed key and
            // releases it in khash_str2int_destroy_free() below.
            if ( !khash_str2int_has_key(bam_smpls,r) )
                khash_str2int_inc(bam_smpls,strdup(r));

            // Undo the temporary NUL terminators in the header buffer.
            *u = ioq; *v = ior;
        }
        else
            break;  // ID or SM was not found: stop scanning the header
        p = eol;
    }
    // Only the number of distinct sample names is needed from here on.
    int nsmpls = khash_str2int_size(bam_smpls);
    khash_str2int_destroy_free(bam_smpls);

    // Decide whether reads whose read group is missing or unlisted ("?") are
    // accepted: not if -G is in effect and does not keep "?", and not if a
    // sample list is in effect and no sample index was found above.
    const char *smpl_name = NULL;
    int accept_null_rg = 1;
    if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
    if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;

    if ( !accept_null_rg && first_smpl==-1 )
    {
        // no suitable read group is available in this bam: ignore the whole file.
        // The slot's resources are released and the count is rolled back; the
        // files array itself is not shrunk.
        free(file->fname);
        if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
        bsmpl->nfiles--;
        return -1;
    }
    // Keep the file, but without a catch-all entry for unlisted read groups.
    if ( !accept_null_rg ) return bsmpl->nfiles-1;
    if ( nsmpls==1 && !nskipped )
    {
        // Exactly one sample and nothing rejected: store its index as the
        // file's default_idx instead of adding a "?" read group.
        file->default_idx = first_smpl;
        return bsmpl->nfiles-1;
    }
    // Sample for the "?" read group: the name set by bsmpl_keep_readgroup() if
    // any, else the first sample found, else the file name.
    if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];

    bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
    return bsmpl->nfiles-1;
}