示例#1
0
/*
 * Allocate a mafFileApi_t for the file `filename`.
 *
 * The file is opened through de_fopen() using `mode`, and the returned
 * object holds its own de_strdup()'d copy of `filename`. The line counter
 * starts at 0 and no last line is cached.
 */
mafFileApi_t* maf_newMfa(const char *filename, char const *mode) {
  mafFileApi_t *api = (mafFileApi_t *) de_malloc(sizeof *api);
  /* resources acquired on behalf of the caller */
  api->mfp = de_fopen(filename, mode);
  api->filename = de_strdup(filename);
  /* reading state: nothing has been read yet */
  api->lastLine = NULL;
  api->lineNumber = 0;
  return api;
}
示例#2
0
mafLine_t* maf_copyMafLine(mafLine_t *orig) {
  // Duplicate a single mafLine_t. Returns NULL when orig is NULL.
  // Scalar fields are copied by value; the string fields (line, species,
  // sequence) are duplicated with de_strdup() when present, otherwise they
  // keep whatever maf_newMafLine() set them to.
  if (!orig) {
    return NULL;
  }
  mafLine_t *copy = maf_newMafLine();
  // plain value fields
  copy->lineNumber = orig->lineNumber;
  copy->type = orig->type;
  copy->start = orig->start;
  copy->length = orig->length;
  copy->strand = orig->strand;
  copy->sourceLength = orig->sourceLength;
  copy->sequenceFieldLength = orig->sequenceFieldLength;
  // string fields: each copy gets its own storage
  if (orig->line) {
    copy->line = de_strdup(orig->line);
  }
  if (orig->species) {
    copy->species = de_strdup(orig->species);
  }
  if (orig->sequence) {
    copy->sequence = de_strdup(orig->sequence);
  }
  return copy;
}
示例#3
0
// Create or open a file for writing, that is *not* one of the usual
// "output.000.ext" files we extract from the input file.
//
// fname: Name of the file to open. A private copy is stored in f->name.
// overwrite_mode, flags: Same as for de_fopen_for_write(); both are passed
//   through to it unchanged.
//
// Returns the newly allocated dbuf, with is_managed set to 0.
// On failure to open the file, prints an error message, and sets f->btype
// to DBUF_TYPE_NULL (the dbuf itself is still returned).
dbuf *dbuf_create_unmanaged_file(deark *c, const char *fname, int overwrite_mode,
	unsigned int flags)
{
	dbuf *f;
	char msgbuf[200];

	f = de_malloc(c, sizeof(dbuf));
	f->c = c;
	f->is_managed = 0;
	f->name = de_strdup(c, fname);

	f->btype = DBUF_TYPE_OFILE;
	f->max_len_hard = c->max_output_file_size;
	// Honor the caller's overwrite_mode. (Previously the global
	// c->overwrite_mode was used here, silently ignoring the parameter.)
	f->fp = de_fopen_for_write(c, f->name, msgbuf, sizeof(msgbuf),
		overwrite_mode, flags);

	if(!f->fp) {
		// msgbuf was handed to de_fopen_for_write() to receive the failure reason.
		de_err(c, "Failed to write %s: %s", f->name, msgbuf);
		f->btype = DBUF_TYPE_NULL;
	}

	return f;
}
示例#4
0
/*
 * Build a block holding only columns [l, r] (inclusive indices into the
 * sequence field) of block b.
 *
 * b           - input maf block.
 * l, r        - first and last sequence-field column to keep. r is asserted
 *               to be less than b's sequence field length.
 * offsetArray - per-sequence-line state, indexed in the order 's' lines are
 *               met in b. As used here, [si][0] is the local position in the
 *               sequence field reached so far and [si][1] is the running
 *               non-gap offset, with -1 meaning "not set yet". Both entries
 *               are updated in place. [i][0] is asserted to be <= l for every
 *               i below b's number of sequences.
 *               NOTE(review): presumably this persists across successive
 *               calls on the same block -- confirm against the caller.
 *
 * Returns:
 *  - b itself (not a copy) when [l, r] spans the whole sequence field;
 *  - otherwise a new block whose head line is
 *    "a score=0 mafExtractor_splicedBlock=true", followed by copies of the
 *    non-'s' lines and the spliced 's' lines. 's' lines that are all gaps in
 *    [l, r] are dropped, as are lines whose line text is NULL;
 *  - NULL if no 's' line survived (only reachable when assert() is disabled).
 */
mafBlock_t *spliceBlock(mafBlock_t *b, uint64_t l, uint64_t r, int64_t **offsetArray) {
    // b is the input maf block
    // l is the left index in the sequence field, the start of inclusion
    // r is the right index in the sequence field, the stop of inclusion
    if ((l == 0) && (r == maf_mafBlock_getSequenceFieldLength(b) - 1)) {
        // the requested window is the whole block: hand back the input unchanged
        return b;
    }
    // printf("spliceBlock(l=%"PRIu64", r=%"PRIu64")\n", l, r);
    mafBlock_t *mb = maf_newMafBlock();
    // ml1 walks the input block's lines, ml2 is the output line being filled in
    mafLine_t *ml1 = NULL, *ml2 = NULL;
    ml1 = maf_mafBlock_getHeadLine(b);
    uint64_t lineNumber = maf_mafLine_getLineNumber(ml1);
    uint64_t numberOfSequences = maf_mafBlock_getNumberOfSequences(b);
    for (uint64_t i = 0; i < numberOfSequences; ++i) {
        // no sequence may already have been processed past the left edge
        assert(offsetArray[i][0] <= (int64_t)l);
    }
    assert(r < maf_mafBlock_getSequenceFieldLength(b));
    // skip the input's head line; the output gets a synthetic head line instead
    ml1 = maf_mafLine_getNext(ml1);
    maf_mafBlock_setHeadLine(mb, maf_newMafLineFromString("a score=0 mafExtractor_splicedBlock=true", lineNumber));
    ml2 = maf_mafBlock_getHeadLine(mb);
    maf_mafBlock_setLineNumber(mb, lineNumber);
    maf_mafBlock_incrementNumberOfLines(mb);
    uint64_t len;
    char *seq = NULL;
    bool prevLineUsed = true; // used when a mafline is dropped because its length becomes 0
    bool emptyBlock = true; // stays true until at least one 's' line is kept
    uint64_t si = 0; // sequence index, for addressing into offsetArray
    while (ml1 != NULL) {
        // loop through all maf lines in a block
        if (prevLineUsed) {
            // advance ml2
            // (when the previous input line was dropped, the still-empty ml2 is reused instead)
            maf_mafBlock_setTailLine(mb, ml2); // assign this first, in case last line is not used.
            maf_mafLine_setNext(ml2, maf_newMafLine());
            ml2 = maf_mafLine_getNext(ml2);
        }
        if (maf_mafLine_getLine(ml1) == NULL) {
            // input line carries no text: drop it (si is not advanced here)
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            continue;
        }
        maf_mafLine_setType(ml2, maf_mafLine_getType(ml1));
        maf_mafLine_setLineNumber(ml2, ++lineNumber);
        maf_mafBlock_incrementNumberOfLines(mb);
        maf_mafBlock_incrementLineNumber(mb);
        if (maf_mafLine_getType(ml2) != 's') {
            // copy the line over, move on to next ml
            maf_mafLine_setLine(ml2, de_strdup(maf_mafLine_getLine(ml1)));
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = true;
            continue;
        }
        // from here on ml1 is a sequence ('s') line
        maf_mafBlock_incrementNumberOfSequences(mb);
        seq = maf_mafLine_getSequence(ml1);
        len = maf_mafBlock_getSequenceFieldLength(b);
        if (maf_mafBlock_getSequenceFieldLength(mb) == 0) {
            // provisional value taken from the input block; replaced below
            // once a spliced line has been produced
            maf_mafBlock_setSequenceFieldLength(mb, len);
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] pre: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // find first non-gap position OR the left edge, update offset
        while (seq[offsetArray[si][0]] == '-' &&
               offsetArray[si][0] <= (int64_t)l) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] >= 0) {
                // if we've advanced to a non-gap char and the local offset
                // is not 0 (i.e. these aren't simply leading gaps), then
                // advance the offset
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-initial gap / left edge discovery: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // we normally ignore the initial value because it's already been set. however if -1 it must be set
        if (seq[offsetArray[si][0]] != '-' && offsetArray[si][1] == -1) {
            offsetArray[si][1] = 0;
        }
        // offsets
        for (int64_t i = offsetArray[si][0] + 1; i <= (int64_t)l; ++i) {
            // figure out the non-gap offset for the splice-in point, `l'
            offsetArray[si][0] = i; // local pos
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1]; // advance offset
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post-gaps: %s \n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // does this sequence have any non-gap character inside [l, r]?
        bool allGaps = true;
        for (uint64_t i = l; i <= r; ++i) {
            if (seq[i] != '-') {
                allGaps = false;
                break;
            }
        }
        if (allGaps) {
            // this sequence is all gaps in this region, exclude it
            // (undo the counters bumped above; si still advances because an
            // 's' line of the input was consumed)
            ml1 = maf_mafLine_getNext(ml1);
            prevLineUsed = false;
            maf_mafBlock_decrementLineNumber(mb);
            maf_mafBlock_decrementNumberOfLines(mb);
            maf_mafBlock_decrementNumberOfSequences(mb);
            --lineNumber;
            ++si;
            continue;
        }
        // Walk up beyond the `l' point if the left edge falls on a gap character
        while (seq[offsetArray[si][0]] == '-' && offsetArray[si][0] < (int64_t) r) {
            ++offsetArray[si][0];
            if (seq[offsetArray[si][0]] != '-') { //  && offsetArray[si][1] >= 0) {
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] walk past left: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // ensure local offset is set properly
        // seqCoords is the amount added to the input line's start to get the
        // spliced line's start; an unset (-1) offset counts as 0
        int64_t seqCoords = offsetArray[si][1];
        if (offsetArray[si][1] == -1) {
            seqCoords = 0;
            if (seq[offsetArray[si][0]] != '-') {
                offsetArray[si][1] = 0;
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] seqCoords:%"PRIi64" set properly: %s\n", offsetArray[si][0], offsetArray[si][1], seqCoords, maf_mafLine_getSpecies(ml1));
        maf_mafLine_setStart(ml2, maf_mafLine_getStart(ml1) + seqCoords);
        // update offsetArray:
        // advance the local position to r, counting non-gap characters on the way
        for (uint64_t i = offsetArray[si][0] + 1; i <= r; ++i) {
            offsetArray[si][0] = i;
            if (seq[i] != '-') {
                if (offsetArray[si][1] == -1) {
                    offsetArray[si][1] = 0;
                }
                ++offsetArray[si][1];
            }
        }
        // printf(" [%2"PRIi64", %2"PRIi64"] post update: %s\n", offsetArray[si][0], offsetArray[si][1], maf_mafLine_getSpecies(ml1));
        // the spliced sequence is columns l..r of the input sequence field
        maf_mafLine_setSequence(ml2, de_strndup(seq + l, 1 + r - l));
        maf_mafLine_setLength(ml2, countNonGaps(maf_mafLine_getSequence(ml2)));
        maf_mafBlock_setSequenceFieldLength(mb, maf_mafLine_getSequenceFieldLength(ml2));
        maf_mafLine_setStrand(ml2, maf_mafLine_getStrand(ml1));
        maf_mafLine_setSourceLength(ml2, maf_mafLine_getSourceLength(ml1));
        maf_mafLine_setSpecies(ml2, de_strdup(maf_mafLine_getSpecies(ml1)));
        // regenerate the line text from the fields set above
        maf_mafLine_setLine(ml2, maf_mafLine_imputeLine(ml2));
        prevLineUsed = true;
        ml1 = maf_mafLine_getNext(ml1);
        ++si;
        emptyBlock = false;
    }
    maf_mafBlock_incrementLineNumber(mb); // extra \n at the end of a block
    if (prevLineUsed) {
        // the last output line holds real content, so it becomes the tail
        maf_mafBlock_setTailLine(mb, ml2);
    }
    if (!emptyBlock) {
        return mb;
    } else {
        // this condition should never be tripped, we have a condition set up
        // in the "process" wrapper above this function.
        printf("block was empty, this should never happen in spliceBlock()\n");
        // always-false assertion: stops here whenever assert() is enabled
        assert(2 + 2 == 5);
        maf_destroyMafBlockList(mb);
        return NULL;
    }
}
示例#5
0
void checkBlock(mafBlock_t *block) {
    // Group the sequence ('s') lines of a block by species. A block in which
    // no species repeats is passed straight to reportBlock(). Otherwise a
    // consensus is built from all sequences, findBestDupes() is run over the
    // per-species groups, and the block goes to reportBlockWithDuplicates().
    mafLine_t *line = maf_mafBlock_getHeadLine(block);
    unsigned n = maf_mafLine_getNumberOfSequences(line);
    char **species = (char **) de_malloc(sizeof(char *) * n);
    char **sequences = (char **) de_malloc(sizeof(char *) * n);
    duplicate_t *speciesHead = NULL, *speciesTail = NULL;
    bool sawRepeat = false;
    int slot = 0;
    for (; line != NULL; line = maf_mafLine_getNext(line)) {
        if (maf_mafLine_getType(line) != 's') {
            // only sequence lines take part
            continue;
        }
        species[slot] = de_strdup(maf_mafLine_getSpecies(line));
        sequences[slot] = de_strdup(maf_mafLine_getSequence(line));
        ++slot;
        duplicate_t *known = findDuplicate(speciesHead, maf_mafLine_getSpecies(line));
        if (known != NULL) {
            // species seen before: append this line to its scored-line list
            sawRepeat = true;
            ++(known->numSequences);
            scoredMafLine_t *entry = newScoredMafLine();
            entry->mafLine = line;
            known->tailScoredMaf->next = entry;
            known->tailScoredMaf = entry;
            continue;
        }
        // first time this species shows up: append a new group to the list
        duplicate_t *group = newDuplicate();
        if (speciesHead == NULL) {
            speciesHead = group;
        } else {
            speciesTail->next = group;
        }
        speciesTail = group;
        group->species = de_strdup(maf_mafLine_getSpecies(line));
        // the group's scored-line list starts out holding just this line
        group->headScoredMaf = newScoredMafLine();
        group->headScoredMaf->mafLine = line;
        group->tailScoredMaf = group->headScoredMaf;
    }
    char *consensus = NULL; // only allocated when the block has repeats
    if (sawRepeat) {
        consensus = (char *) de_malloc(longestLine(block) + 1);
        consensus[0] = '\0';
        // the head line's line number is passed along for error reporting
        buildConsensus(consensus, sequences, n,
                       maf_mafLine_getLineNumber(maf_mafBlock_getHeadLine(block)));
        findBestDupes(speciesHead, consensus);
        reportBlockWithDuplicates(block, speciesHead);
    } else {
        reportBlock(block);
    }
    // shared clean up for both outcomes
    destroyStringArray(species, n);
    destroyStringArray(sequences, n);
    destroyDuplicates(speciesHead);
    free(consensus);
}
示例#6
0
// Create a "managed" output dbuf (is_managed==1) for an extracted file.
//
// ext: Filename extension/suffix to use, or NULL. Should not be combined with
//   an fi that has original_filename_flag set (a debug message is logged if
//   it is).
// fi: Optional file metadata (may be NULL). When present, a copy is stored in
//   f->fi_copy.
// createflags: DE_CREATEFLAG_* bits; only DE_CREATEFLAG_IS_AUX is examined here.
//
// Always returns the allocated dbuf. When the file is not to be written
// (skipped directory entry, excluded by the extract policy, outside the
// requested range of output files, list mode, or failure to open), f->btype
// is set to DBUF_TYPE_NULL.
//
// Side effects visible below: c->file_count is incremented for every file
// that passes the directory/extract-policy filters, and
// c->num_files_extracted for every file that is also within the requested
// file range.
dbuf *dbuf_create_output_file(deark *c, const char *ext, de_finfo *fi,
	unsigned int createflags)
{
	char nbuf[500];
	char msgbuf[200];
	dbuf *f;
	const char *basefn;
	int file_index;
	u8 is_directory = 0;
	char *name_from_finfo = NULL;
	i64 name_from_finfo_len = 0;

	if(ext && fi && fi->original_filename_flag) {
		de_dbg(c, "[internal warning: Incorrect use of create_output_file]");
	}

	f = de_malloc(c, sizeof(dbuf));
	f->c = c;
	f->max_len_hard = c->max_output_file_size;
	f->is_managed = 1;

	if(fi && fi->is_directory) {
		is_directory = 1;
	}

	// Filters applied before a file index is assigned: files rejected here
	// do not consume an index.
	if(is_directory && !c->keep_dir_entries) {
		de_dbg(c, "skipping 'directory' file");
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	if(c->extract_policy==DE_EXTRACTPOLICY_MAINONLY) {
		if(createflags&DE_CREATEFLAG_IS_AUX) {
			de_dbg(c, "skipping 'auxiliary' file");
			f->btype = DBUF_TYPE_NULL;
			goto done;
		}
	}
	else if(c->extract_policy==DE_EXTRACTPOLICY_AUXONLY) {
		if(!(createflags&DE_CREATEFLAG_IS_AUX)) {
			de_dbg(c, "skipping 'main' file");
			f->btype = DBUF_TYPE_NULL;
			goto done;
		}
	}

	// Assign this file's index.
	file_index = c->file_count;
	c->file_count++;

	basefn = c->base_output_filename ? c->base_output_filename : "output";

	// Convert the name supplied via fi (if any) to a UTF-8 string.
	// The buffer is freed at "done".
	if(fi && ucstring_isnonempty(fi->file_name_internal)) {
		name_from_finfo_len = 1 + ucstring_count_utf8_bytes(fi->file_name_internal);
		name_from_finfo = de_malloc(c, name_from_finfo_len);
		ucstring_to_sz(fi->file_name_internal, name_from_finfo, (size_t)name_from_finfo_len, 0,
			DE_ENCODING_UTF8);
	}

	// Build the output filename in nbuf. Three cases:
	//  1. archive output, no base name given, root (or root-like ".") directory: "."
	//  2. archive output, no base name given, original filename available: use it as-is
	//  3. everything else: "<basefn>.<index>.<suffix>"
	if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && !c->base_output_filename &&
		fi && fi->is_directory &&
		(fi->is_root_dir || (fi->detect_root_dot_dir && fi->orig_name_was_dot)))
	{
		de_strlcpy(nbuf, ".", sizeof(nbuf));
	}
	else if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && !c->base_output_filename &&
		fi && fi->original_filename_flag && name_from_finfo)
	{
		// TODO: This is a "temporary" hack to allow us to, when both reading from
		// and writing to an archive format, use some semblance of the correct
		// filename (instead of "output.xxx.yyy").
		// There are some things that we don't handle optimally, such as
		// subdirectories.
		// A major redesign of the file naming logic would be good.
		de_strlcpy(nbuf, name_from_finfo, sizeof(nbuf));
	}
	else {
		char fn_suffix[256];

		// Pick the suffix, in priority order: name+ext, ext, name+".dir"
		// (directories), name, "dir" (directories), "bin".
		if(ext && name_from_finfo) {
			de_snprintf(fn_suffix, sizeof(fn_suffix), "%s.%s", name_from_finfo, ext);
		}
		else if(ext) {
			de_strlcpy(fn_suffix, ext, sizeof(fn_suffix));
		}
		else if(is_directory && name_from_finfo) {
			de_snprintf(fn_suffix, sizeof(fn_suffix), "%s.dir", name_from_finfo);
		}
		else if(name_from_finfo) {
			de_strlcpy(fn_suffix, name_from_finfo, sizeof(fn_suffix));
		}
		else if(is_directory) {
			de_strlcpy(fn_suffix, "dir", sizeof(fn_suffix));
		}
		else {
			de_strlcpy(fn_suffix, "bin", sizeof(fn_suffix));
		}

		de_snprintf(nbuf, sizeof(nbuf), "%s.%03d.%s", basefn, file_index, fn_suffix);
	}

	f->name = de_strdup(c, nbuf);

	if(fi) {
		// The finfo object passed to us at file creation is not required to
		// remain valid, so make a copy of anything in it that we might need
		// later.
		f->fi_copy = de_finfo_create(c);
		finfo_shallow_copy(c, fi, f->fi_copy);

		// Here's where we respect the -intz option, by using it to convert to
		// UTC in some cases.
		if(f->fi_copy->mod_time.is_valid && f->fi_copy->mod_time.tzcode==DE_TZCODE_LOCAL &&
			c->input_tz_offs_seconds!=0)
		{
			de_timestamp_cvt_to_utc(&f->fi_copy->mod_time, -c->input_tz_offs_seconds);
		}

		if(f->fi_copy->image_mod_time.is_valid && f->fi_copy->image_mod_time.tzcode==DE_TZCODE_LOCAL &&
			c->input_tz_offs_seconds!=0)
		{
			de_timestamp_cvt_to_utc(&f->fi_copy->image_mod_time, -c->input_tz_offs_seconds);
		}
	}

	// Restrict output to the requested range of file indices. Files outside
	// the range have already consumed an index above.
	if(file_index < c->first_output_file) {
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	// A negative max_output_files disables the upper bound.
	if(c->max_output_files>=0 &&
		file_index >= c->first_output_file + c->max_output_files)
	{
		f->btype = DBUF_TYPE_NULL;
		goto done;
	}

	c->num_files_extracted++;

	// Record the name in the extraction list, if one is being kept.
	if(c->extrlist_dbuf) {
		dbuf_printf(c->extrlist_dbuf, "%s\n", f->name);
		dbuf_flush(c->extrlist_dbuf);
	}

	// List mode: print the name (optionally prefixed by the file index)
	// instead of writing anything.
	if(c->list_mode) {
		f->btype = DBUF_TYPE_NULL;
		if(c->list_mode_include_file_id) {
			de_msg(c, "%d:%s", file_index, f->name);
		}
		else {
			de_msg(c, "%s", f->name);
		}
		goto done;
	}

	// Select the backing store according to the output style.
	if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE && c->archive_fmt==DE_ARCHIVEFMT_TAR) {
		de_info(c, "Adding %s to TAR file", f->name);
		f->btype = DBUF_TYPE_ODBUF;
		// A dummy max_len_hard value. The parent will do the checking.
		f->max_len_hard = DE_DUMMY_MAX_FILE_SIZE;
		f->writing_to_tar_archive = 1;
		de_tar_start_member_file(c, f);
	}
	else if(c->output_style==DE_OUTPUTSTYLE_ARCHIVE) { // ZIP
		i64 initial_alloc;
		de_info(c, "Adding %s to ZIP file", f->name);
		// The member is accumulated in a memory buffer.
		f->btype = DBUF_TYPE_MEMBUF;
		f->max_len_hard = DE_MAX_MEMBUF_SIZE;
		if(is_directory) {
			// A directory entry is not expected to have any data associated
			// with it (besides the files it contains).
			initial_alloc = 16;
		}
		else {
			initial_alloc = 65536;
		}
		f->membuf_buf = de_malloc(c, initial_alloc);
		f->membuf_alloc = initial_alloc;
		f->write_memfile_to_zip_archive = 1;
	}
	else if(c->output_style==DE_OUTPUTSTYLE_STDOUT) {
		de_info(c, "Writing %s to [stdout]", f->name);
		f->btype = DBUF_TYPE_STDOUT;
		// TODO: Should we increase f->max_len_hard?
		f->fp = stdout;
	}
	else {
		// Default: write to a regular file named f->name.
		de_info(c, "Writing %s", f->name);
		f->btype = DBUF_TYPE_OFILE;
		f->fp = de_fopen_for_write(c, f->name, msgbuf, sizeof(msgbuf),
			c->overwrite_mode, 0);

		if(!f->fp) {
			de_err(c, "Failed to write %s: %s", f->name, msgbuf);
			f->btype = DBUF_TYPE_NULL;
		}
	}

done:
	// name_from_finfo is still NULL here if it was never allocated.
	de_free(c, name_from_finfo);
	return f;
}