/**
 * Reads a block of lines from the input stream into memory.
 * Every stored entry takes ownership of the line buffer returned by
 * gzgetline and receives a freshly allocated source name "line<N>",
 * where N is the running value of line_num.
 * @param strs Array for data
 * @param len Length of block
 * @return number of lines read into memory
 */
int input_lines_read(hstring_t *strs, int len)
{
    assert(strs && len > 0);

    char name[32];
    size_t cap;
    int count = 0;

    while (count < len) {
        char *text = NULL;
        int got = gzgetline(&text, &cap, in);

        /* A return of -1 ends the block early */
        if (got == -1) {
            free(text);
            break;
        }

        /* Strip newline characters */
        strip_newline(text, got);

        hstring_t *slot = &strs[count];

        /* Caution: May modify the line */
        slot->label = get_label(text);
        slot->str.c = text;
        slot->type = TYPE_CHAR;
        slot->len = strlen(text);

        /* Name the entry after its running line number */
        snprintf(name, sizeof(name), "line%d", line_num++);
        slot->src = strdup(name);

        count++;
    }

    return count;
}
/** * Opens a file for reading text fasta. * @param name File name * @return number of fasta or -1 on error */ int input_fasta_open(char *name) { assert(name); size_t read, size; char *line = NULL; const char *pattern; /* Compile regular expression for label */ config_lookup_string(&cfg, "input.fasta_regex", &pattern); if (regcomp(&re, pattern, REG_EXTENDED) != 0) { error("Could not compile regex for label"); return -1; } in = gzopen(name, "r"); if (!in) { error("Could not open '%s' for reading", name); return -1; } int num, cont = FALSE; while(!gzeof(in)) { line = NULL; read = gzgetline(&line, &size, in); if (read > 0) strtrim(line); if (read > 1 && !cont && (line[0] == '>' || line[0] == ';')) { num++; cont = TRUE; } else { cont = FALSE; } free(line); } /* Prepare reading */ gzrewind(in); return num; }
/**
 * Reads a block of lines from the input stream into memory.
 * Each stored entry keeps the line buffer handed out by gzgetline and
 * gets a newly allocated source name of the form "line<N>", N being the
 * running value of line_num.
 * @param strs Array for data
 * @param len Length of block
 * @return number of lines read into memory
 */
int input_lines_read(string_t *strs, int len)
{
    assert(strs && len > 0);

    int stored = 0;
    size_t bufsize;
    char src_name[32];

    while (stored < len) {
#ifdef ENABLE_EVALTIME
        double started = time_stamp();
#endif
        char *current = NULL;
        int nread = gzgetline(&current, &bufsize, in);

        /* A return of -1 ends the block early */
        if (nread == -1) {
            free(current);
            break;
        }

        /* Strip newline characters */
        strip_newline(current, nread);

        string_t *entry = strs + stored;
        entry->label = get_label(current);
        entry->str = current;
        entry->len = strlen(current);

        /* Name the entry after its running line number */
        snprintf(src_name, sizeof(src_name), "line%d", line_num++);
        entry->src = strdup(src_name);

        stored++;

#ifdef ENABLE_EVALTIME
        printf("strlen %d read %f\n", strs[stored - 1].len,
               time_stamp() - started);
#endif
    }

    return stored;
}
/**
 * Reads a block of fasta records into memory.
 * A line starting with '>' or ';' is a header: it opens a record and is
 * stored as the record's source, with its label taken from get_label.
 * All following lines up to the next header are concatenated into the
 * record's sequence. A record is stored only once at least one sequence
 * character has been collected. When the block is full, the header that
 * ended the last record is kept in old_line (defined outside this
 * function) and is consumed first by the next call.
 * Reading also stops early if memory for a sequence cannot be obtained;
 * the record under construction is dropped in that case.
 * @param strs Array for data
 * @param len Length of block
 * @return number of read files
 */
int input_fasta_read(string_t *strs, int len)
{
    assert(strs && len > 0);
    int read, i = 0, alloc = -1;
    size_t size;
    char *line = NULL, *seq = NULL;

    /*
     * alloc tracks the state of the record under construction:
     *   -1  no header seen yet in this call
     *    1  header seen, sequence still empty
     *   >1  size of seq in bytes including the terminating NUL
     */
    while (i < len) {
        int failed = 0;

        /* Read line; a header saved by the previous call comes first */
        if (old_line) {
            line = old_line;
            read = strlen(line) + 1;
        } else {
            line = NULL;
            read = gzgetline(&line, &size, in);
        }
        old_line = NULL;

        /* Trim line */
        if (read >= 0)
            strtrim(line);

        /* End of sequence: hand the collected buffer over to the array */
        if (alloc > 1 && (read == -1 || line[0] == ';' || line[0] == '>')) {
            strs[i].str = seq;
            strs[i].len = alloc - 1;
            i++;
        }

        /* Stop on read error */
        if (read == -1) {
            /* Header without sequence data: seq was never handed over */
            if (alloc == 1)
                free(seq);
            free(line);
            break;
        }

        /* Block is full: keep the current header line for the next call */
        if (i == len) {
            old_line = line;
            break;
        }

        if (line[0] == ';' || line[0] == '>') {
            /* Start of sequence, unless the open record is still empty */
            if (alloc == -1 || alloc > 1) {
                strs[i].src = strdup(line);
                strs[i].label = get_label(line);
                seq = calloc(1, sizeof(char));
                alloc = 1;
                failed = (seq == NULL);
            }
        } else if (alloc != -1) {
            /*
             * Append line to sequence. The current length is alloc - 1,
             * so the copy goes straight to the end instead of rescanning
             * the whole sequence for every line.
             */
            size_t add = strlen(line);
            char *grown = realloc(seq, alloc + add);
            if (grown) {
                seq = grown;
                memcpy(seq + alloc - 1, line, add + 1);
                alloc += add;
            } else {
                /* realloc left the old buffer intact; release it */
                free(seq);
                seq = NULL;
                failed = 1;
            }
        }
        /* Text before the first header falls through and is skipped */

        free(line);
        if (failed)
            break;
    }

    return i;
}
struct matrix *load_sensing_matrix(const char *filename, unsigned int target_kmer) { char *line = NULL; char **headers = NULL; double *matrix = NULL; unsigned int kmer = 0; unsigned long long i = 0; unsigned long long *row = NULL; unsigned long long sequences = 0; unsigned long long width = 0; struct matrix *ret = NULL; size_t lineno = 0; gzFile fh = NULL; fh = gzopen(filename, "r"); if(fh == NULL) { fprintf(stderr, "could not open %s", filename); exit(EXIT_FAILURE); } line = malloc(1024 * sizeof(char)); check_malloc(line, NULL); // Check for quikr line = gzgets(fh, line, 1024); lineno++; if(strcmp(line, "quikr\n") != 0) { fprintf(stderr, "This does not look like a quikr sensing matrix. Please check your path: %s\n", filename); exit(EXIT_FAILURE); } // check version line = gzgets(fh, line, 1024); if(atoi(line) != MATRIX_REVISION) { fprintf(stderr, "Sensing Matrix uses an unsupported version, please retrain your matrix\n"); exit(EXIT_FAILURE); } lineno++; // get number of sequences line = gzgets(fh, line, 1024); sequences = strtoull(line, NULL, 10); if(sequences == 0) { fprintf(stderr, "Error parsing sensing matrix, sequence count is zero\n"); exit(EXIT_FAILURE); } lineno++; // get kmer gzgets(fh, line, 1024); kmer = atoi(line); if(kmer == 0) { fprintf(stderr, "Error parsing sensing matrix, kmer is zero\n"); exit(EXIT_FAILURE); } lineno++; if(kmer != target_kmer) { fprintf(stderr, "The sensing_matrix was trained with a different kmer than your requested kmer\n"); exit(EXIT_FAILURE); } width = pow_four(kmer); // allocate a +1 size for the extra row matrix = malloc(sequences * (width) * sizeof(double)); check_malloc(matrix, NULL); row = malloc((width) * sizeof(unsigned long long)); check_malloc(row, NULL); headers = malloc(sequences * sizeof(char *)); check_malloc(headers, NULL); char *buf = NULL; size_t len = 0; size_t read = 0; for(i = 0; i < sequences; i++) { unsigned long long j = 0; // get header and add it to headers array // read = gzgetline(&buf, &len, fh); 
if(read == 0) { fprintf(stderr, "Error parsing sensing matrix, could not read header\n"); exit(EXIT_FAILURE); } char *header = malloc(sizeof(char) * read + 1); check_malloc(header, NULL); header = strncpy(header, buf, read - 1); if(header[0] != '>') { fprintf(stderr, "Error parsing sensing matrix, could not read header in line %llu\n", lineno); exit(EXIT_FAILURE); } lineno++; header[read - 1] = '\0'; headers[i] = header+1; row = memset(row, 0, (width) * sizeof(unsigned long long)); for(j = 0; j < width; j++) { line = gzgets(fh, line, 32); lineno++; if(line == NULL || line[0] == '>') { fprintf(stderr, "Error parsing sensing matrix, line %zu does not look like a value\n", lineno); exit(EXIT_FAILURE); } lineno++; row[j] = strtoull(line, NULL, 10); if(errno) { printf("could not parse '%s'\n into a number", line); exit(EXIT_FAILURE); } } for(j = 0; j < width; j++) { matrix[i*(width) + j] = ((double)row[j]); } } // load the matrix of counts gzclose(fh); free(line); free(row); ret = malloc(sizeof(struct matrix)); (*ret).kmer = kmer; (*ret).sequences = sequences; (*ret).matrix = matrix; (*ret).headers = headers; return ret; }