Esempio n. 1
0
/**
 * Loads up to len lines from the current input stream into strs.
 * Every stored entry takes ownership of the line buffer and gets a
 * generated source name of the form "line<N>".
 * @param strs Array receiving the lines
 * @param len Maximum number of lines to load
 * @return number of entries of strs that were filled
 */
int input_lines_read(hstring_t *strs, int len)
{
    assert(strs && len > 0);

    int count = 0;
    size_t cap;

    while (count < len) {
        char *text = NULL;
        int nread = gzgetline(&text, &cap, in);

        /* No further line available: drop the buffer and stop */
        if (nread == -1) {
            free(text);
            break;
        }

        /* Remove trailing newline characters */
        strip_newline(text, nread);

        hstring_t *entry = &strs[count];

        /* The label is extracted first, since this may alter the line */
        entry->label = get_label(text);

        entry->str.c = text;
        entry->type = TYPE_CHAR;
        entry->len = strlen(text);

        /* Name the entry after the running line counter */
        char name[32];
        snprintf(name, sizeof(name), "line%d", line_num++);
        entry->src = strdup(name);

        count++;
    }

    return count;
}
Esempio n. 2
0
/**
 * Opens a FASTA text file for reading and counts its entries.
 *
 * The label regex is taken from the configuration key
 * "input.fasta_regex" and compiled into the shared regex object. The
 * file is then scanned once: a line longer than one character that
 * starts with '>' or ';' and does not directly follow another counted
 * header line is counted as one entry. Afterwards the stream is rewound
 * so that reading can start from the beginning.
 *
 * @param name File name
 * @return number of fasta entries or -1 on error
 */
int input_fasta_open(char *name)
{
    assert(name);
    size_t read, size;
    char *line = NULL;
    /* Stays NULL if the configuration lookup does not provide a value */
    const char *pattern = NULL;

    /* Compile regular expression for label */
    config_lookup_string(&cfg, "input.fasta_regex", &pattern);
    if (!pattern || regcomp(&re, pattern, REG_EXTENDED) != 0) {
        error("Could not compile regex for label");
        return -1;
    }

    in = gzopen(name, "r");
    if (!in) {
        error("Could not open '%s' for reading", name);
        return -1;
    }

    /* The counter must start at zero; it is incremented and returned */
    int num = 0, cont = FALSE;
    while (!gzeof(in)) {
        line = NULL;
        read = gzgetline(&line, &size, in);

        /*
         * A failed read is reported as -1. As read is unsigned, this
         * has to be tested explicitly, otherwise the checks below would
         * treat it as a very long line and dereference the buffer.
         */
        if (read == (size_t) -1) {
            free(line);
            break;
        }

        if (read > 0)
            strtrim(line);
        if (read > 1 && !cont && (line[0] == '>' || line[0] == ';')) {
            num++;
            cont = TRUE;
        } else {
            cont = FALSE;
        }
        free(line);
    }

    /* Prepare reading */
    gzrewind(in);
    return num;
}
Esempio n. 3
0
/**
 * Loads up to len lines from the current input stream into strs.
 * Every stored entry takes ownership of the line buffer and gets a
 * generated source name of the form "line<N>".
 * @param strs Array receiving the lines
 * @param len Maximum number of lines to load
 * @return number of entries of strs that were filled
 */
int input_lines_read(string_t *strs, int len)
{
    assert(strs && len > 0);

    int count = 0;
    size_t cap;

    while (count < len) {
#ifdef ENABLE_EVALTIME
        double t1 = time_stamp();
#endif

        char *text = NULL;
        int nread = gzgetline(&text, &cap, in);

        /* No further line available: drop the buffer and stop */
        if (nread == -1) {
            free(text);
            break;
        }

        /* Remove trailing newline characters */
        strip_newline(text, nread);

        string_t *entry = &strs[count];
        entry->label = get_label(text);
        entry->str = text;
        entry->len = strlen(text);

        /* Name the entry after the running line counter */
        char name[32];
        snprintf(name, sizeof(name), "line%d", line_num++);
        entry->src = strdup(name);

        count++;

#ifdef ENABLE_EVALTIME
        printf("strlen %d read %f\n", entry->len, time_stamp() - t1);
#endif
    }

    return count;
}
Esempio n. 4
0
/**
 * Reads a block of FASTA sequences into memory.
 *
 * A line starting with ';' or '>' is treated as a header line. The lines
 * that follow a header are concatenated into one sequence string, which
 * is stored in strs once the next header line or the end of the input is
 * reached. Text before the first header line is skipped.
 *
 * When the block is full, the header line that terminated the last
 * sequence is kept in old_line (declared outside this function) and is
 * processed first on the next call.
 *
 * @param strs Array for data
 * @param len Length of block
 * @return number of sequences stored in strs
 */
int input_fasta_read(string_t *strs, int len)
{
    assert(strs && len > 0);
    /*
     * alloc is the allocated size of seq including the terminator:
     * -1 until the first header line has been seen, 1 directly after a
     * header line (empty sequence), and larger once data was appended.
     */
    int read, i = 0, alloc = -1;
    size_t size;
    char *line = NULL, *seq = NULL;

    while (i < len) {

        /* Read line: reuse the line saved by the previous call, if any */
        if (old_line) {
            line = old_line;
            read = strlen(line) + 1;
        } else {
            line = NULL;
            read = gzgetline(&line, &size, in);
        }
        old_line = NULL;

        /* Trim line */
        if (read >= 0)
            strtrim(line);

        /*
         * End of sequence: a non-empty sequence is complete when the input
         * ends or the next header line shows up. The stored length excludes
         * the terminator.
         */
        if (alloc > 1 && (read == -1 || line[0] == ';' || line[0] == '>')) {
            strs[i].str = seq;
            strs[i].len = alloc - 1;
            i++;
        }

        /*
         * Stop on read error.
         * NOTE(review): if only a header was seen (alloc == 1), seq and the
         * src/label set for it are neither stored nor released on this
         * path -- confirm whether that is intended.
         */
        if (read == -1) {
            free(line);
            break;
        }

        /*
         * Reset pointer for next chunk: the block is full, and the current
         * line is the header of the next sequence, so it is handed over to
         * the next call instead of being freed.
         */
        if (i == len) {
            /* Save old line */
            old_line = line;
#if 0
            /* Alternative code with slow gzseek */
            gzseek(in, -read, SEEK_CUR);
            free(line);
#endif
            break;
        }

        /* Check for comment char */
        if (line[0] == ';' || line[0] == '>') {
            /*
             * Start of sequence: only for the first header or after a
             * completed sequence. A header line that directly follows
             * another header (alloc == 1) does not replace the src and
             * label taken from the first one.
             */
            if (alloc == -1 || alloc > 1) {
                strs[i].src = strdup(line);
                strs[i].label = get_label(line);
                /* Start with an empty, terminated string */
                seq = calloc(sizeof(char), 1);
                alloc = 1;
            }
            goto skip;
        }

        /* Skip text before first comment */
        if (alloc == -1)
            goto skip;

        /*
         * Append line to sequence: grow the buffer by the line length and
         * concatenate.
         * NOTE(review): the result of realloc is not checked before use.
         */
        alloc += strlen(line);
        seq = realloc(seq, alloc * sizeof(char));
        strncat(seq, line, strlen(line));

skip:
        free(line);
    }

    return i;
}
Esempio n. 5
0
// Reads the next preamble line (at most size - 1 bytes) into buf and
// terminates the program if the file ends before a line could be read.
static void read_preamble_line(gzFile fh, char *buf, int size) {
	if(gzgets(fh, buf, size) == NULL) {
		fprintf(stderr, "Error parsing sensing matrix, unexpected end of file\n");
		exit(EXIT_FAILURE);
	}
}

/*
 * Loads a quikr sensing matrix from a file opened with gzopen.
 *
 * Layout as parsed here:
 *   line 1: the literal "quikr"
 *   line 2: format revision, must equal MATRIX_REVISION
 *   line 3: number of sequences (non-zero)
 *   line 4: kmer size (non-zero, must equal target_kmer)
 * followed, for every sequence, by one header line starting with '>' and
 * pow_four(kmer) lines holding one unsigned integer each.
 *
 * Every parse or I/O problem is reported on stderr and terminates the
 * program with EXIT_FAILURE; the function does not return on error.
 *
 * filename:    path of the sensing matrix file
 * target_kmer: kmer size the caller expects the matrix to be trained with
 *
 * Returns a newly allocated struct matrix. Its matrix member is a row-major
 * array of sequences x pow_four(kmer) doubles. Each headers[i] points one
 * character past the start of its allocation (the leading '>' is skipped),
 * so headers[i] itself must not be passed to free().
 */
struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) {

	char *line = NULL;
	char *buf = NULL;
	char **headers = NULL;

	double *matrix = NULL;

	unsigned int kmer = 0;

	unsigned long long i = 0;
	unsigned long long sequences = 0;
	unsigned long long width = 0;

	struct matrix *ret = NULL;
	size_t lineno = 0;
	size_t len = 0;
	size_t read = 0;

	gzFile fh = NULL;

	fh = gzopen(filename, "r");
	if(fh == NULL) {
		fprintf(stderr, "could not open %s", filename);
		exit(EXIT_FAILURE);
	}

	line = malloc(1024 * sizeof(char));
	check_malloc(line, NULL);

	// Check for quikr
	read_preamble_line(fh, line, 1024);
	lineno++;
	if(strcmp(line, "quikr\n") != 0) {
		fprintf(stderr, "This does not look like a quikr sensing matrix. Please check your path: %s\n", filename);
		exit(EXIT_FAILURE);
	}

	// check version
	read_preamble_line(fh, line, 1024);
	lineno++;
	if(atoi(line) != MATRIX_REVISION) {
		fprintf(stderr, "Sensing Matrix uses an unsupported version, please retrain your matrix\n");
		exit(EXIT_FAILURE);
	}

	// get number of sequences
	read_preamble_line(fh, line, 1024);
	lineno++;
	sequences = strtoull(line, NULL, 10);
	if(sequences == 0) {
		fprintf(stderr, "Error parsing sensing matrix, sequence count is zero\n");
		exit(EXIT_FAILURE);
	}

	// get kmer
	read_preamble_line(fh, line, 1024);
	lineno++;
	kmer = atoi(line);
	if(kmer == 0) {
		fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n");
		exit(EXIT_FAILURE);
	}

	if(kmer != target_kmer) {
		fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n");
		exit(EXIT_FAILURE);
	}

	width = pow_four(kmer);

	// the sequence count comes from the file, so make sure the size of the
	// matrix allocation below cannot wrap around
	if(width == 0 || sequences > ((size_t)-1) / sizeof(double) / width) {
		fprintf(stderr, "Error parsing sensing matrix, dimensions are too large\n");
		exit(EXIT_FAILURE);
	}

	// one row of width counts per sequence
	matrix = malloc(sequences * width * sizeof(double));
	check_malloc(matrix, NULL);

	headers = malloc(sequences * sizeof(char *));
	check_malloc(headers, NULL);

	for(i = 0; i < sequences; i++) {
		unsigned long long j = 0;

		// get header and add it to headers array
		read = gzgetline(&buf, &len, fh);
		lineno++;
		// a failed read may be reported as -1, which shows up as the
		// largest value once it is stored in an unsigned variable
		if(read == 0 || read == (size_t)-1) {
			fprintf(stderr, "Error parsing sensing matrix, could not read header\n");
			exit(EXIT_FAILURE);
		}

		if(buf[0] != '>') {
			fprintf(stderr, "Error parsing sensing matrix, could not read header in line %zu\n", lineno);
			exit(EXIT_FAILURE);
		}

		// copy the line without its last character and store a pointer
		// that skips the leading '>'
		char *header = malloc(sizeof(char) * read + 1);
		check_malloc(header, NULL);
		memcpy(header, buf, read - 1);
		header[read - 1] = '\0';
		// keeps headers[i] a valid (empty) string even when read == 1
		header[read] = '\0';
		headers[i] = header + 1;

		// load the row of counts for this sequence
		for(j = 0; j < width; j++) {
			unsigned long long value = 0;

			if(gzgets(fh, line, 32) == NULL || line[0] == '>') {
				lineno++;
				fprintf(stderr, "Error parsing sensing matrix, line %zu does not look like a value\n", lineno);
				exit(EXIT_FAILURE);
			}
			lineno++;

			// strtoull only sets errno on failure, so clear it first to
			// avoid reacting to a stale value from an earlier call
			errno = 0;
			value = strtoull(line, NULL, 10);
			if(errno) {
				fprintf(stderr, "could not parse '%s'\n into a number", line);
				exit(EXIT_FAILURE);
			}

			matrix[i * width + j] = (double)value;
		}
	}

	gzclose(fh);

	free(buf);
	free(line);

	ret = malloc(sizeof(struct matrix));
	check_malloc(ret, NULL);
	ret->kmer = kmer;
	ret->sequences = sequences;
	ret->matrix = matrix;
	ret->headers = headers;

	return ret;
}