struct vector *img_to_lines(struct matrix *img, struct vector *blocks) { struct vector *lines = vector_make((img->height) * (img->width)); int status = 0; // not on a line size_t h = 0; for(; blocks->size != 0;) { struct coords current_block; vector_pop_front(blocks, ¤t_block); int w1 = current_block.w1, w2 = current_block.w2; for (; h < img->height; h++) { if (status) { struct coords line; line.w1 = 1; line.w2 = 1; line.h1 = h - 1; for (; h < img->height && !line_is_empty(img, h, w1, w2); h++) { } line.h2 = h - 1; vector_push_back(lines, line); status = 0; } else { for (; h < img->height && line_is_empty(img, h, w1, w2); h++) { } status = 1; } } } free(blocks); return lines; }
/*
 * Looks for a horizontal cut inside `block`.
 *
 * Starting at row block.h1, the leading rows reported empty by
 * line_is_empty() (within columns block.w1..block.w2) are skipped, then the
 * following non-empty rows are skipped as well. Both scans stop at block.h2.
 *
 * Returns the index of the row where the scan stopped, or 0 when the scan
 * ended exactly on block.h2 (no empty row was found after the text).
 */
int horizontal_search(struct matrix *img, struct coords block)
{
    int row = block.h1;

    // Leading blank rows.
    while (row < block.h2 && line_is_empty(img, row, block.w1, block.w2))
        row++;

    // Rows that contain something.
    while (row < block.h2 && !line_is_empty(img, row, block.w1, block.w2))
        row++;

    return (row == block.h2) ? 0 : row;
}
//the main functions : struct vector *img_to_blocks(struct matrix *img) { struct coords init; struct vector *output; //all the text is within the square ((w1,h1),(w2,h2)) size_t i = 0; for(; i < img->height - 1 && line_is_empty(img, i, 0, img->width - 1); i++){} init.h1 = i; for(i = img->height - 1; i > 0 && line_is_empty(img, i, 0, img->width - 1); i--){} init.h2 = i; for(i = 0; i < img->width - 1 && column_is_empty(img, i, init.h1, init.h2); i++){} init.w1 = i; for(i = img->width - 1; i > 0 && column_is_empty(img, i, init.h1, init.h2); i--){} init.w2 = i; //we create a matrix with appearant blocks struct matrix *M = malloc(sizeof(struct matrix)); M->data = malloc(sizeof(double) * img->width * img->height); M->width = img->width, M->height = img->height; for(size_t i = 0; i < img->width * img->height; i++) { M->data[i] = img->data[i]; } for(size_t i = 0; i < 5; i++) { filter_noise(M); if(i % 2) filter_contrast(M); } //we launch the block detection in that original block if(init.w1 < init.w2 && init.h1 < init.h2) output = vertical_rec(M, init, 1); else output = NULL; //invalid image. free(M->data); free(M); return output; }
int main(int argc, char *argv[]) { size_t buffer_size = BUFFER_SIZE_DEFAULT; // Read arguments FILE *infile = stdin; int quiet = 0; char output = 'r'; char read_opt = '\0'; int i; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-h") == 0) { die(USAGE); } else if (strcmp(argv[i], "-q") == 0) { quiet = 1; } else if (strcmp(argv[i], "-o") == 0) { read_opt = 'o'; } else if (strcmp(argv[i], "-B") == 0) { read_opt = 'B'; } else if (read_opt == 'o') { if (strcmp(argv[i], "reads") == 0) { output = 'r'; } else if (strcmp(argv[i], "format") == 0) { output = 'f'; } else { die("Invalid -o output format \"%s\"", argv[i]); } read_opt = '\0'; } else if (read_opt == 'B') { if (! is_int(argv[i])) { die("Invalid buffer size: \"%s\"", argv[i]); } buffer_size = atoi(argv[i]); read_opt = '\0'; } else if (infile == stdin) { infile = fopen(argv[i], "r"); if (errno) { die("\"%s\"", argv[i]); } } else { //TODO: allow any number of input files die("Can only process one file argument"); } } int get_extremes = 1; if (quiet && output != 'f') { get_extremes = 0; } /*TODO: This assumes that there will be at least as many quality scores as there are sequence * bases. According to Dan, we can't make that assumption. * Then what do we do to tell when the quality lines have ended? * Ideas for disambiguating: * 1. If len(qual) >= len(seq), it's a HEADER (If we've already seen enough * quality values to cover the read, the QUAL lines must be over.) * 2. If the line plus the observed quality values so far is longer than the * read, it must be a HEADER line. * 3. No FASTQ format uses space characters for quality scores, according to * Wikipedia. If there's a space character in the line, say it's a HEADER line? * But there could still conceivably be a read with truncated quality scores, * followed by a HEADER line that contains no spaces and is short enough to not * exceed the read length. 
* Conclusion: Just check how BioPython does it: * http://biopython.org/DIST/docs/api/Bio.SeqIO.QualityIO-pysrc.html * Update: BioPython just throws an error if the number of bases != num of quality scores. */ /* Notes on format requirements: * Empty lines are allowed, and ignored. If there's an empty line where a sequence or quality line * is expected, it's interpreted as 0 base/quality scores. * This means you can have zero-length reads. * Multi-line sequences (more than 4 lines per read) are allowed. * The number of quality scores must be >= the number of sequence bases. If there are missing * quality scores, this will likely fail with an error, but it could possibly succeed, giving * incorrect results. */ char *line = malloc(buffer_size); Extremes extremes; extremes.max = 0; extremes.min = 256; long num_reads = 0; long seq_len = 0; long qual_len = 0; State state = HEADER; // fgets() reads a line at a time. char *result = fgets(line, buffer_size, infile); long line_num = 0; while (result != NULL) { line_num++; if (state == HEADER) { // Allow empty lines before the header. if (! line_is_empty(line)) { if (line[0] != '@') { die("Line %ld looked like a header line but does not start with \"@\".", line_num); } num_reads++; seq_len = 0; // Assume only 1 header line. state = SEQ; } } else if (state == SEQ) { if (line[0] == '+') { qual_len = 0; // End of sequence line comes when we see a line starting with "+". state = PLUS; } else { seq_len += count_chars(line, buffer_size); } } else if (state == PLUS || state == QUAL) { // If the state is PLUS, we already saw the "+" line on the last loop. // Assume there's only 1 "+" line, and assume we're now on a quality scores line. if (state == QUAL && line[0] == '@') { // If we're past the "first" quality scores line and we see one that starts with a "@", // that's very suspicious. Allow it, but raise a warning. 
fprintf(stderr, "Warning: Looking for more quality scores on line %ld but it starts with " "\"@\".\nThis might be a header line and there were fewer quality scores " "than bases.\n", line_num); } state = QUAL; if (get_extremes) { qual_len += count_chars_and_extremes(line, buffer_size, &extremes); } else { qual_len += count_chars(line, buffer_size); } if (qual_len >= seq_len) { // End of quality line comes once we've seen enough quality scores to match the sequence line. state = HEADER; if (qual_len > seq_len) { fprintf(stderr, "Warning on line %ld: Counted more quality scores than bases.\n", line_num); } } } result = fgets(line, buffer_size, infile); } char format_guess = '?'; if (get_extremes) { format_guess = guess_quality_format(extremes, num_reads); } if (!quiet) { fprintf(stderr, "Quality score ascii range: %d (%c) to %d (%c)\n", extremes.min, (char)extremes.min, extremes.max, (char)extremes.max); switch (format_guess) { case 'S': fprintf(stderr, "Format: Very likely Sanger (offset 33).\n"); break; case 'X': fprintf(stderr, "Format: Very likely Solexa (offset 64).\n"); break; case 's': fprintf(stderr, "Format: Maybe Sanger? (offset 33)\n"); break; case 'x': fprintf(stderr, "Format: Maybe Solexa? (offset 64)\n"); break; case '?': fprintf(stderr, "Format: Unknown\n"); } } if (output == 'r') { printf("%ld\n", num_reads); } else if (output == 'f') { switch (format_guess) { case 'S': case 's': printf("sanger\n"); break; case 'X': case 'x': printf("solexa\n"); break; default: printf("?\n"); } } fclose(infile); return 0; }