void mafBlock_print(FILE *outfile, MafBlock *block, int pretty_print) { int i, j, k, numSpace; int fieldSize[6]; //maximum # of characters in the first 6 fields of block MafSubBlock *sub; char firstChar, formatstr[1000]; char *firstseq=NULL; //if processing has reduced the number of species with data to zero, or has //reduced the block to all gaps, don't print if (lst_size(block->data) == 0 || mafBlock_all_gaps(block)) return; mafBlock_remove_gap_cols(block); mafBlock_get_fieldSizes(block, fieldSize); fprintf(outfile, "%s\n", block->aLine->chars); for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); for (j=0; j<sub->numLine; j++) { firstChar = sub->lineType[j]; if (firstChar == 's' || firstChar == 'e') { sprintf(formatstr, "%%c %%-%is %%%ii %%%ii %%c %%%ii ", fieldSize[1], fieldSize[2], fieldSize[3], fieldSize[5]); fprintf(outfile, formatstr, firstChar, sub->src->chars, sub->start, sub->size, sub->strand, sub->srcSize); if (firstChar == 's') { if (firstseq == NULL) { fprintf(outfile, "%s\n", sub->seq->chars); if (pretty_print) firstseq = sub->seq->chars; } else { for (k=0; k<block->seqlen; k++) fputc(tolower(sub->seq->chars[k])==tolower(firstseq[k]) ? '.' : sub->seq->chars[k], outfile); } } else fprintf(outfile, "%c\n", sub->eStatus); } else if (firstChar=='i') { sprintf(formatstr, "i %%-%is %%c %%i %%c %%i", fieldSize[1]); fprintf(outfile, formatstr, sub->src->chars, sub->iStatus[0], sub->iCount[0], sub->iStatus[1], sub->iCount[1]); fputc('\n', outfile); } else { if (firstChar != 'q') die("ERROR mafBlock_print: firstChar should be q, got %c\n", firstChar); sprintf(formatstr, "q %%-%is", fieldSize[1]); fprintf(outfile, formatstr, sub->src->chars); numSpace = 6 + fieldSize[2] + fieldSize[3] + fieldSize[5]; for (k=0; k<numSpace; k++) fputc(' ', outfile); fprintf(outfile, "%s\n", sub->quality->chars); } } } fputc('\n', outfile); //blank line to mark end of block // fflush(outfile); }
int main(int argc, char* argv[]) { char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL; String *refseq = NULL, *currRefseq; int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1; char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL; List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL; MafBlock *block; FILE *mfile, *outfile=NULL, *masked_file=NULL; int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0; int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE; int lastStart = -1, gffSearchIdx=0; GFF_Set *gff = NULL, *gffSub; GFF_Feature *feat; CategoryMap *cm = NULL; int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0; List *outfileList=NULL; Hashtable *outfileHash=NULL;//, *specNameHash=NULL; msa_format_type output_format = MAF; MSA *msa = NULL;//, **catMsa; char *mask_features_spec_arg=NULL; List *mask_features_spec=NULL; struct option long_opts[] = { {"start", 1, 0, 's'}, {"end", 1, 0, 'e'}, {"seqs", 1, 0, 'l'}, {"exclude", 0, 0, 'x'}, {"order", 1, 0, 'O'}, {"split", 1, 0, 'S'}, {"out-root", 1, 0, 'r'}, {"out-root-digits", 1, 0, 'd'}, {"no-refseq", 0, 0, 'n'}, {"features", 1, 0, 'g'}, {"by-category", 0, 0, 'L'}, {"do-cats", 1, 0, 'C'}, {"catmap", 1, 0, 'c'}, {"by-group", 1, 0, 'P'}, {"mask-bases", 1, 0, 'b'}, {"masked-file", 1, 0, 'm'}, {"strip-i-lines", 0, 0, 'I'}, {"strip-e-lines", 0, 0, 'E'}, {"mask-features", 1, 0, 'M'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': startcol = get_arg_int(optarg); break; case 'e': endcol = get_arg_int(optarg); break; case 'l': seqlist_str = get_arg_list(optarg); break; case 'O': order_list = get_arg_list(optarg); break; case 'x': include = FALSE; break; case 'S': splitInterval = atoi(optarg); break; case 'r': out_root_fname = optarg; break; case 'd': sprintf(splitFormat, "%%s%%.%si.%%s", optarg); break; case 'n': useRefseq = FALSE; break; case 'g': gff = gff_read_set(phast_fopen(optarg, "r")); gff_sort(gff); stripILines=TRUE; stripELines=TRUE; break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'C': cats_to_do_str = get_arg_list(optarg); break; case 'L': by_category = TRUE; break; case 'P': group_tag = optarg; break; case 'b': base_mask_cutoff = atoi(optarg); break; case 'm': masked_fn = optarg; break; case 'M': mask_features_spec_arg = optarg; break; case 'E': stripELines=TRUE; break; case 'I': stripILines=TRUE; break; case 'o': output_format = msa_str_to_format(optarg); if (output_format == UNKNOWN_FORMAT) die("ERROR: bad output format. Try \"maf_parse -h\" for help.\n"); if (output_format != MAF) die("Sorry, only MAF format output has been implemented right now.\n"); break; case 'p': pretty_print = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("Bad argument. Try 'maf_parse -h' for help.\n"); } } if (optind >= argc) die("Missing alignment filename. Try 'maf_parse -h' for help.\n"); else if (optind == argc - 1) maf_fname = argv[optind]; else die("ERROR: Too many arguments. Try 'maf_parse -h' for help.\n"); set_seed(-1); if (startcol < 1 || (endcol != -1 && endcol < startcol)) die("ERROR: must have 1 <= start <= end <= [msa_length]\n"); if ((group_tag != NULL || by_category) && gff == NULL) die("ERROR: --by-category and --by-group require --features. Try \"maf_parse -h\"" " for help.\n"); if (group_tag != NULL && by_category) die("ERROR: --by-category and --by-group cannot be used together. Try \"maf_parse -h\"" " for help.\n"); if (splitInterval != -1 && gff != NULL) die("ERROR: can't use --split and --features together. Try \"maf_parse -h\"" "for help\n"); if (group_tag != NULL || by_category) { outfileList = lst_new_ptr(10); outfileHash = hsh_new(100); } if (gff != NULL && cm == NULL) cm = cm_new_from_features(gff); if (cats_to_do_str != NULL) { cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE); if (gff != NULL) gff_filter_by_type(gff, cats_to_do, 0, NULL); } if (masked_fn != NULL) { if (base_mask_cutoff == -1) die("ERROR: need to use --mask-bases with --masked-file"); masked_file = phast_fopen(masked_fn, "w"); } if (mask_features_spec_arg != NULL) { if (gff==NULL) die("ERROR: need --features with --mask-features"); mask_features_spec = lst_new_ptr(10); str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec); for (i=0; i < lst_size(mask_features_spec); i++) { fprintf(stderr, "masking species %s within features\n", ((String*)lst_get_ptr(mask_features_spec, i))->chars); } } /* Check to see if --do-cats names a feature which is length 1. If so, set output_format to SS ? or FASTA ? */ mfile = phast_fopen(maf_fname, "r"); block = mafBlock_read_next(mfile, NULL, NULL); if (splitInterval == -1 && gff==NULL) { //TODO: do we want to copy header from original MAF in this case? mafBlock_open_outfile(NULL, argc, argv); } while (block != NULL) { if (order_list != NULL) mafBlock_reorder(block, order_list); if (seqlist_str != NULL) mafBlock_subSpec(block, seqlist_str, include); if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) goto get_next_block; if (stripILines) mafBlock_strip_iLines(block); if (stripELines) mafBlock_strip_eLines(block); if (base_mask_cutoff != -1) mafBlock_mask_bases(block, base_mask_cutoff, masked_file); //TODO: still need to implement (either here or elsewhere) // if (indel_mask_cutoff != -1) // mafBlock_mask_indels(block, indel_mask_cutoff, mfile); if (useRefseq) { //get refseq and check that it is consistent in MAF file currRefseq = mafBlock_get_refSpec(block); if (refseq == NULL) refseq = str_new_charstr(currRefseq->chars); else if (str_compare(refseq, currRefseq)!=0) die("Error: refseq not consistent in MAF (got %s, %s)\n", refseq->chars, currRefseq->chars); } if (startcol != 1 || endcol != -1) if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx)) goto get_next_block; currSize = mafBlock_get_size(block, refseq); if (useRefseq) { currStart = mafBlock_get_start(block, refseq); if (currStart < lastIdx && sortWarned == 0) { fprintf(stderr, "Warning: input MAF not sorted with respect to refseq. Output files may not represent contiguous alignments. (%i, %i)\n", lastIdx, currStart); sortWarned = 1; } } else currStart = lastIdx; if (currStart < lastStart) gffSearchIdx = 0; lastStart = currStart; lastIdx = currStart + currSize; //split by length if (splitInterval != -1) { if (currLen == -1 || currLen+currSize > splitInterval) { sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx, msa_suffix_for_format(output_format)); if (output_format == MAF) { if (outfile != NULL) mafBlock_close_outfile(outfile); outfile = mafBlock_open_outfile(outfilename, argc, argv); } else if (output_format != MAF && msa != NULL) { // msa_print_to_filename(msa, outfilename, output_format, pretty_print); msa_free(msa); msa = NULL; } currLen = 0; } currLen += currSize; } else outfile = stdout; if (gff != NULL && mask_features_spec != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { mafBlock_mask_region(block, gffSub, mask_features_spec); gff_free_set(gffSub); } mafBlock_print(outfile, block, pretty_print); } else if (gff != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { if (by_category) gff_group_by_feature(gffSub); else if (group_tag != NULL) gff_group(gffSub, group_tag); gff_sort(gffSub); gff_flatten_within_groups(gffSub); for (i=0; i<lst_size(gffSub->features); i++) { feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i); MafBlock *subBlock = mafBlock_copy(block); mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0); if (by_category) outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname, argc, argv); else if (group_tag != NULL) outfile = get_outfile(outfileList, outfileHash, gff_group_name(gffSub, feat), out_root_fname, argc, argv); else outfile = stdout; if (output_format == MAF) mafBlock_print(outfile, subBlock, pretty_print); // else msa_add_mafBlock(msa); mafBlock_free(subBlock); } gff_free_set(gffSub); } } else { if (output_format == MAF) mafBlock_print(outfile, block, pretty_print); // else msa = msa_add_mafBlock(mafBlock, msa, ); } get_next_block: mafBlock_free(block); block = mafBlock_read_next(mfile, NULL, NULL); } if (masked_file != NULL) fclose(masked_file); if (output_format == MAF) { if (by_category || group_tag != NULL) close_outfiles(outfileList, outfileHash); else if (outfile!=NULL) mafBlock_close_outfile(outfile); } else { msa_print(stdout, msa, output_format, pretty_print); msa_free(msa); } if (gff != NULL) gff_free_set(gff); phast_fclose(mfile); return 0; }