/*
 * Allocate a new MAF file API handle for `filename`.
 *
 * The file is opened through de_fopen() using `mode`, and the handle keeps
 * its own de_strdup() copy of `filename`. The line counter starts at 0 and
 * no "last line" is recorded yet.
 *
 * Returns the newly allocated handle.
 */
mafFileApi_t* maf_newMfa(const char *filename, char const *mode) {
    mafFileApi_t *handle = (mafFileApi_t *) de_malloc(sizeof(mafFileApi_t));

    // Bookkeeping starts out empty.
    handle->lineNumber = 0;
    handle->lastLine = NULL;

    // Open the underlying stream, then remember which file it belongs to.
    handle->mfp = de_fopen(filename, mode);
    handle->filename = de_strdup(filename);

    return handle;
}
/*
 * Create and return a copy of a single mafLine_t structure.
 *
 * String members (line, species, sequence) are duplicated with de_strdup()
 * when present in `orig`; when absent, the copy keeps whatever value
 * maf_newMafLine() gave the field. All other members copied here are plain
 * value assignments.
 *
 * Returns NULL when `orig` is NULL.
 */
mafLine_t* maf_copyMafLine(mafLine_t *orig) {
    if (orig == NULL) {
        return NULL;
    }

    mafLine_t *dup = maf_newMafLine();

    // Plain value members.
    dup->lineNumber = orig->lineNumber;
    dup->type = orig->type;
    dup->start = orig->start;
    dup->length = orig->length;
    dup->strand = orig->strand;
    dup->sourceLength = orig->sourceLength;
    dup->sequenceFieldLength = orig->sequenceFieldLength;

    // Owned strings: the copy gets its own allocation for each one.
    if (orig->line != NULL) {
        dup->line = de_strdup(orig->line);
    }
    if (orig->species != NULL) {
        dup->species = de_strdup(orig->species);
    }
    if (orig->sequence != NULL) {
        dup->sequence = de_strdup(orig->sequence);
    }

    return dup;
}
// Create or open a file for writing, that is *not* one of the usual // "output.000.ext" files we extract from the input file. // // overwrite_mode, flags: Same as for de_fopen_for_write(). // // On failure, prints an error message, and sets f->btype to DBUF_TYPE_NULL. dbuf *dbuf_create_unmanaged_file(deark *c, const char *fname, int overwrite_mode, unsigned int flags) { dbuf *f; char msgbuf[200]; f = de_malloc(c, sizeof(dbuf)); f->c = c; f->is_managed = 0; f->name = de_strdup(c, fname); f->btype = DBUF_TYPE_OFILE; f->max_len_hard = c->max_output_file_size; f->fp = de_fopen_for_write(c, f->name, msgbuf, sizeof(msgbuf), c->overwrite_mode, flags); if(!f->fp) { de_err(c, "Failed to write %s: %s", f->name, msgbuf); f->btype = DBUF_TYPE_NULL; } return f; }
/*
 * Build a new block holding only columns [l, r] (inclusive) of the sequence
 * field of block `b`.
 *
 * Parameters:
 *   b           - the input maf block.
 *   l           - left index in the sequence field, the start of inclusion.
 *   r           - right index in the sequence field, the stop of inclusion.
 *   offsetArray - one two-element row per 's' line visited, updated in place:
 *                 [si][0] is a position in that line's sequence field and
 *                 [si][1] is the running non-gap offset, where -1 is treated
 *                 as "not yet set".
 *
 * Returns:
 *   - `b` itself (not a copy) when [l, r] covers the whole sequence field, so
 *     the result may alias the input.
 *   - Otherwise a new block whose head line is
 *     "a score=0 mafExtractor_splicedBlock=true". Non-'s' lines are copied
 *     verbatim, 's' lines are cut down to [l, r], and 's' lines that are all
 *     gaps in that range are left out.
 *   - NULL if no 's' line survived. That path first trips an always-false
 *     assert, so NULL is only reachable when asserts are compiled out.
 *
 * Preconditions (checked with assert): offsetArray[i][0] <= l for every
 * sequence in `b`, and r is inside the sequence field.
 */
mafBlock_t *spliceBlock(mafBlock_t *b, uint64_t l, uint64_t r, int64_t **offsetArray) {
    // b is the input maf block
    // l is the left index in the sequence field, the start of inclusion
    // r is the right index in the sequence field, the stop of inclusion
    if ((l == 0) && (r == maf_mafBlock_getSequenceFieldLength(b) - 1)) {
        // the requested range is the entire block, nothing to splice
        return b;
    }
    // printf("spliceBlock(l=%"PRIu64", r=%"PRIu64")\n", l, r);
    mafBlock_t *mb = maf_newMafBlock();
    // ml1 walks the input block's lines, ml2 is the output line being filled
    mafLine_t *ml1 = NULL, *ml2 = NULL;
    ml1 = maf_mafBlock_getHeadLine(b);
    uint64_t lineNumber = maf_mafLine_getLineNumber(ml1);
    uint64_t numberOfSequences = maf_mafBlock_getNumberOfSequences(b);
    for (uint64_t i = 0; i < numberOfSequences; ++i) {
        assert(offsetArray[i][0] <= (int64_t)l);
    }
    assert(r < maf_mafBlock_getSequenceFieldLength(b));
    // skip the input head line; the output gets a synthetic head line instead
    ml1 = maf_mafLine_getNext(ml1);
    maf_mafBlock_setHeadLine(mb, maf_newMafLineFromString("a score=0 mafExtractor_splicedBlock=true", lineNumber));
    ml2 = maf_mafBlock_getHeadLine(mb);
    maf_mafBlock_setLineNumber(mb, lineNumber);
    maf_mafBlock_incrementNumberOfLines(mb);
    uint64_t len;
    char *seq = NULL;
    bool prevLineUsed = true; // used when a mafline is dropped because its length becomes 0
    bool emptyBlock = true; // stays true until at least one 's' line is kept
    uint64_t si = 0; // sequence index, for addressing into offsetArray
    while (ml1 != NULL) {
        // loop through all maf lines in a block
        if (prevLineUsed) {
            // advance ml2
            maf_mafBlock_setTailLine(mb, ml2); // assign this first, in case last line is not used.
            maf_mafLine_setNext(ml2, maf_newMafLine());
            ml2 = maf_mafLine_getNext(ml2);
        }
        if (maf_mafLine_getLine(ml1) == NULL) {
            // input line has no text: skip it and reuse ml2 on the next pass
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            continue;
        }
        maf_mafLine_setType(ml2, maf_mafLine_getType(ml1));
        maf_mafLine_setLineNumber(ml2, ++lineNumber);
        maf_mafBlock_incrementNumberOfLines(mb);
        maf_mafBlock_incrementLineNumber(mb);
        if (maf_mafLine_getType(ml2) != 's') {
            // copy the line over, move on to next ml
            maf_mafLine_setLine(ml2, de_strdup(maf_mafLine_getLine(ml1)));
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = true;
            continue;
        }
        maf_mafBlock_incrementNumberOfSequences(mb);
        seq = maf_mafLine_getSequence(ml1);
        len = maf_mafBlock_getSequenceFieldLength(b);
        if (maf_mafBlock_getSequenceFieldLength(mb) == 0) {
            maf_mafBlock_setSequenceFieldLength(mb, len);
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] pre: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // find first non-gap position OR the left edge, update offset
        // NOTE(review): the bound is `<= l', so when seq[l] is a gap the
        // position steps on to l + 1 -- confirm that is intended given the
        // "left edge" wording above.
        while (seq[offsetArray[si][0]] == '-' && offsetArray[si][0] <= (int64_t)l) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] >= 0) {
                // if we've advanced to a non-gap char and the local offset
                // is not 0 (i.e. these aren't simply leading gaps), then
                // advance the offset
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-initial gap / left edge discovery: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // we normally ignore the initial value because it's already been set.
        // however if -1 it must be set
        if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] == -1) {
            offsetArray[si][1] = 0;
        }
        // offsets
        for (int64_t i = offsetArray[si][0] + 1; i <= (int64_t)l; ++i) {
            // figure out the non-gap offset for the splice-in point, `l'
            offsetArray[si][0] = i; // local pos
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-gaps: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        bool allGaps = true;
        for (uint64_t i = l; i <= r; ++i) {
            if (seq[i] != '-') {
                allGaps = false;
                break;
            }
        }
        if (allGaps) {
            // this sequence is all gaps in this region, exclude it
            // (undo the line / sequence counters bumped above for this line)
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            maf_mafBlock_decrementLineNumber(mb);
            maf_mafBlock_decrementNumberOfLines(mb);
            maf_mafBlock_decrementNumberOfSequences(mb);
            --lineNumber;
            ++si;
            continue;
        }
        // Walk up beyond the `l' point if the left edge falls on a gap character
        while (seq[offsetArray[si][0]] == '-' && offsetArray[si][0] < (int64_t) r) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-') { // && offsetArray[si][1] >= 0) {
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] walk past left: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // ensure local offset is set properly
        // seqCoords is the amount added to the input line's start below
        int64_t seqCoords = offsetArray[si][1];
        if (offsetArray[si][1] == -1) {
            seqCoords = 0;
            if (seq[offsetArray[si][0]] != '-') {
                offsetArray[si][1] = 0;
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] seqCoords:%"PRIi64" set properly: %s\n", offsetArray[si][0], offsetArray[si][1], seqCoords, maf_mafLine_getSpecies(ml1));
        maf_mafLine_setStart(ml2, maf_mafLine_getStart(ml1) + seqCoords);
        // update offsetArray:
        // advance the position to r, counting the non-gap characters passed
        for (uint64_t i = offsetArray[si][0] + 1; i <= r; ++i) {
            offsetArray[si][0] = i;
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post update: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // the output sequence is the inclusive slice [l, r] of the input
        maf_mafLine_setSequence(ml2, de_strndup(seq + l, 1 + r - l));
        maf_mafLine_setLength(ml2, countNonGaps(maf_mafLine_getSequence(ml2)));
        maf_mafBlock_setSequenceFieldLength(mb, maf_mafLine_getSequenceFieldLength(ml2));
        maf_mafLine_setStrand(ml2, maf_mafLine_getStrand(ml1));
        maf_mafLine_setSourceLength(ml2, maf_mafLine_getSourceLength(ml1));
        maf_mafLine_setSpecies(ml2, de_strdup(maf_mafLine_getSpecies(ml1)));
        // regenerate the line text from the fields set above
        maf_mafLine_setLine(ml2, maf_mafLine_imputeLine(ml2));
        prevLineUsed = true;
        ml1 = maf_mafLine_getNext(ml1);
        ++si;
        emptyBlock = false;
    }
    maf_mafBlock_incrementLineNumber(mb); // extra \n at the end of a block
    if (prevLineUsed) {
        maf_mafBlock_setTailLine(mb, ml2);
    }
    if (!emptyBlock) {
        return mb;
    } else {
        // this condition should never be tripped, we have a condition set up
        // in the "process" wrapper above this function.
        printf("block was empty, this should never happen in spliceBlock()\n");
        assert(2 + 2 == 5); // deliberately false: abort here when asserts are enabled
        maf_destroyMafBlockList(mb);
        return NULL;
    }
}
/*
 * Read through each line of a mafBlock and filter duplicates, reporting the
 * top scoring duplication only.
 *
 * Every 's' line is recorded under its species in a list of duplicate_t
 * entries. If no species occurs more than once the block is handed to
 * reportBlock() as-is. Otherwise a consensus is built from all sequences,
 * findBestDupes() is run against it, and the block is reported via
 * reportBlockWithDuplicates().
 */
void checkBlock(mafBlock_t *block) {
    mafLine_t *line = maf_mafBlock_getHeadLine(block);
    unsigned n = maf_mafLine_getNumberOfSequences(line);
    char **species = (char **) de_malloc(sizeof(char *) * n);
    char **sequences = (char **) de_malloc(sizeof(char *) * n);
    int slot = 0;
    bool sawDuplicate = false;
    duplicate_t *listHead = NULL;
    duplicate_t *listTail = NULL;
    char *consensus = NULL;

    for (; line != NULL; line = maf_mafLine_getNext(line)) {
        if (maf_mafLine_getType(line) != 's') {
            // only sequence lines take part in duplicate detection
            continue;
        }
        species[slot] = de_strdup(maf_mafLine_getSpecies(line));
        sequences[slot] = de_strdup(maf_mafLine_getSequence(line));

        duplicate_t *known = findDuplicate(listHead, maf_mafLine_getSpecies(line));
        if (known != NULL) {
            // species already seen: append this line to its scored list
            sawDuplicate = true;
            ++(known->numSequences);
            scoredMafLine_t *extra = newScoredMafLine();
            extra->mafLine = line;
            known->tailScoredMaf->next = extra;
            known->tailScoredMaf = extra;
        } else {
            // first instance of this species: append a new entry to the list
            duplicate_t *fresh = newDuplicate();
            if (listHead == NULL) {
                listHead = fresh;
            } else {
                listTail->next = fresh;
            }
            listTail = fresh;
            fresh->species = de_strdup(maf_mafLine_getSpecies(line));
            // start the mafline linked list for this species
            fresh->headScoredMaf = newScoredMafLine();
            fresh->headScoredMaf->mafLine = line;
            fresh->tailScoredMaf = fresh->headScoredMaf;
        }
        ++slot;
    }

    if (sawDuplicate) {
        consensus = (char *) de_malloc(longestLine(block) + 1);
        consensus[0] = '\0';
        // the head line's number is passed along for error reporting
        buildConsensus(consensus, sequences, n,
                       maf_mafLine_getLineNumber(maf_mafBlock_getHeadLine(block)));
        findBestDupes(listHead, consensus);
        reportBlockWithDuplicates(block, listHead);
    } else {
        reportBlock(block);
    }

    // clean up (consensus is still NULL when there were no duplicates)
    destroyStringArray(species, n);
    destroyStringArray(sequences, n);
    destroyDuplicates(listHead);
    free(consensus);
}
// Create a "managed" output dbuf for a file being extracted.
//
// c: The deark context; its settings decide naming, filtering, and where the
//   data goes.
// ext: Optional filename extension/suffix. May be NULL.
// fi: Optional file info (name, directory flag, timestamps). May be NULL.
//   The needed parts are copied into f->fi_copy.
// createflags: DE_CREATEFLAG_* bits; only DE_CREATEFLAG_IS_AUX is tested here.
//
// Always returns a newly allocated dbuf. When the file is skipped, is only
// being listed, or cannot be opened, f->btype is DBUF_TYPE_NULL.
//
// Order matters: directory and main/aux-policy skips happen before
// c->file_count is incremented, while the first_output_file/max_output_files
// range check happens after it (and after the name has been built).
dbuf *dbuf_create_output_file(deark *c, const char *ext, de_finfo *fi,
	unsigned int createflags)
{
	char nbuf[500];
	char msgbuf[200];
	dbuf *f;
	const char *basefn;
	int file_index;
	u8 is_directory = 0;
	char *name_from_finfo = NULL;
	i64 name_from_finfo_len = 0;

	if(ext && fi && fi->original_filename_flag) {
		de_dbg(c, "[internal warning: Incorrect use of create_output_file]");
	}

	f = de_malloc(c, sizeof(dbuf));
	f->c = c;
	f->max_len_hard = c->max_output_file_size;
	f->is_managed = 1;

	if(fi && fi->is_directory) {
		is_directory = 1;
	}

	// Early skips. These do not consume a file index.
	if(is_directory && !c->keep_dir_entries) {
		de_dbg(c, "skipping 'directory' file");
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	if(c->extract_policy==DE_EXTRACTPOLICY_MAINONLY) {
		if(createflags&DE_CREATEFLAG_IS_AUX) {
			de_dbg(c, "skipping 'auxiliary' file");
			f->btype = DBUF_TYPE_NULL;
			goto done;
		}
	}
	else if(c->extract_policy==DE_EXTRACTPOLICY_AUXONLY) {
		if(!(createflags&DE_CREATEFLAG_IS_AUX)) {
			de_dbg(c, "skipping 'main' file");
			f->btype = DBUF_TYPE_NULL;
			goto done;
		}
	}

	// Claim the next file index.
	file_index = c->file_count;
	c->file_count++;

	basefn = c->base_output_filename ? c->base_output_filename : "output";

	// Convert the name from the finfo (if any) to a UTF-8 string.
	// The "1 +" leaves room for the terminating NUL.
	if(fi && ucstring_isnonempty(fi->file_name_internal)) {
		name_from_finfo_len = 1 + ucstring_count_utf8_bytes(fi->file_name_internal);
		name_from_finfo = de_malloc(c, name_from_finfo_len);
		ucstring_to_sz(fi->file_name_internal, name_from_finfo, (size_t)name_from_finfo_len, 0, DE_ENCODING_UTF8);
	}

	// Build the output filename in nbuf.
	if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && !c->base_output_filename &&
		fi && fi->is_directory &&
		(fi->is_root_dir || (fi->detect_root_dot_dir && fi->orig_name_was_dot)))
	{
		// Root directory entry in archive output: name it ".".
		de_strlcpy(nbuf, ".", sizeof(nbuf));
	}
	else if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && !c->base_output_filename &&
		fi && fi->original_filename_flag && name_from_finfo)
	{
		// TODO: This is a "temporary" hack to allow us to, when both reading from
		// and writing to an archive format, use some semblance of the correct
		// filename (instead of "output.xxx.yyy").
		// There are some things that we don't handle optimally, such as
		// subdirectories.
		// A major redesign of the file naming logic would be good.
		de_strlcpy(nbuf, name_from_finfo, sizeof(nbuf));
	}
	else {
		// Usual case: "<basefn>.<index>.<suffix>", where the suffix comes from
		// the finfo name and/or ext, falling back to "dir" or "bin".
		char fn_suffix[256];

		if(ext && name_from_finfo) {
			de_snprintf(fn_suffix, sizeof(fn_suffix), "%s.%s", name_from_finfo, ext);
		}
		else if(ext) {
			de_strlcpy(fn_suffix, ext, sizeof(fn_suffix));
		}
		else if(is_directory && name_from_finfo) {
			de_snprintf(fn_suffix, sizeof(fn_suffix), "%s.dir", name_from_finfo);
		}
		else if(name_from_finfo) {
			de_strlcpy(fn_suffix, name_from_finfo, sizeof(fn_suffix));
		}
		else if(is_directory) {
			de_strlcpy(fn_suffix, "dir", sizeof(fn_suffix));
		}
		else {
			de_strlcpy(fn_suffix, "bin", sizeof(fn_suffix));
		}

		de_snprintf(nbuf, sizeof(nbuf), "%s.%03d.%s", basefn, file_index, fn_suffix);
	}

	f->name = de_strdup(c, nbuf);

	if(fi) {
		// The finfo object passed to us at file creation is not required to
		// remain valid, so make a copy of anything in it that we might need
		// later.
		f->fi_copy = de_finfo_create(c);
		finfo_shallow_copy(c, fi, f->fi_copy);

		// Here's where we respect the -intz option, by using it to convert to
		// UTC in some cases.
		if(f->fi_copy->mod_time.is_valid && f->fi_copy->mod_time.tzcode==DE_TZCODE_LOCAL &&
			c->input_tz_offs_seconds!=0)
		{
			de_timestamp_cvt_to_utc(&f->fi_copy->mod_time, -c->input_tz_offs_seconds);
		}

		if(f->fi_copy->image_mod_time.is_valid && f->fi_copy->image_mod_time.tzcode==DE_TZCODE_LOCAL &&
			c->input_tz_offs_seconds!=0)
		{
			de_timestamp_cvt_to_utc(&f->fi_copy->image_mod_time, -c->input_tz_offs_seconds);
		}
	}

	// Files outside the requested index range are not written.
	if(file_index < c->first_output_file) {
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	// A negative max_output_files disables the upper limit.
	if(c->max_output_files>=0 &&
		file_index >= c->first_output_file + c->max_output_files)
	{
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	c->num_files_extracted++;

	// Record the name in the extraction list, if one is open.
	if(c->extrlist_dbuf) {
		dbuf_printf(c->extrlist_dbuf, "%s\n", f->name);
		dbuf_flush(c->extrlist_dbuf);
	}

	// List mode: print the name, but don't write anything.
	if(c->list_mode) {
		f->btype = DBUF_TYPE_NULL;
		if(c->list_mode_include_file_id) {
			de_msg(c, "%d:%s", file_index, f->name);
		}
		else {
			de_msg(c, "%s", f->name);
		}
		goto done;
	}

	// Choose the output destination according to the output style.
	if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && c->archive_fmt==DE_ARCHIVEFMT_TAR) {
		de_info(c, "Adding %s to TAR file", f->name);
		f->btype = DBUF_TYPE_ODBUF;
		// A dummy max_len_hard value. The parent will do the checking.
		f->max_len_hard = DE_DUMMY_MAX_FILE_SIZE;
		f->writing_to_tar_archive = 1;
		de_tar_start_member_file(c, f);
	}
	else if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE) { // ZIP
		i64 initial_alloc;

		de_info(c, "Adding %s to ZIP file", f->name);
		f->btype = DBUF_TYPE_MEMBUF;
		f->max_len_hard = DE_MAX_MEMBUF_SIZE;
		if(is_directory) {
			// A directory entry is not expected to have any data associated
			// with it (besides the files it contains).
			initial_alloc = 16;
		}
		else {
			initial_alloc = 65536;
		}
		f->membuf_buf = de_malloc(c, initial_alloc);
		f->membuf_alloc = initial_alloc;
		f->write_memfile_to_zip_archive = 1;
	}
	else if(c->output_style==DE_OUTPUTSTYLE_STDOUT) {
		de_info(c, "Writing %s to [stdout]", f->name);
		f->btype = DBUF_TYPE_STDOUT;
		// TODO: Should we increase f->max_len_hard?
		f->fp = stdout;
	}
	else {
		// Ordinary file output.
		de_info(c, "Writing %s", f->name);
		f->btype = DBUF_TYPE_OFILE;
		f->fp = de_fopen_for_write(c, f->name, msgbuf, sizeof(msgbuf),
			c->overwrite_mode, 0);

		if(!f->fp) {
			de_err(c, "Failed to write %s: %s", f->name, msgbuf);
			f->btype = DBUF_TYPE_NULL;
		}
	}

done:
	// The temporary UTF-8 name is freed on every path, including the early
	// exits where it is still NULL.
	de_free(c, name_from_finfo);
	return f;
}