Beispiel #1
0
struct vector *img_to_lines(struct matrix *img, struct vector *blocks) {
    struct vector *lines = vector_make((img->height) * (img->width));
    int status = 0; // not on a line
    size_t h = 0;
    for(; blocks->size != 0;)
    {
	struct coords current_block;
	vector_pop_front(blocks, &current_block);
	int w1 = current_block.w1, w2 = current_block.w2;

	for (; h < img->height; h++) {
	    if (status) {
		struct coords line;
		line.w1 = 1;
		line.w2 = 1;
		line.h1 = h - 1;
		for (; h < img->height && !line_is_empty(img, h, w1, w2); h++) {
		}
		line.h2 = h - 1;
		vector_push_back(lines, line);
		status = 0;
	    } else {
		for (; h < img->height && line_is_empty(img, h, w1, w2); h++) {
		}
		status = 1;
	    }
	}
    }
    free(blocks);
    return lines;
}
Beispiel #2
0
/*
 * Look for a horizontal gap inside `block`.
 *
 * Starting at block.h1, first skips the rows for which line_is_empty() is
 * true, then skips the following rows for which it is false. Both scans are
 * bounded by block.h2 (exclusive).
 *
 * Returns 0 when the scan ends exactly at block.h2, otherwise the row index
 * at which the scan stopped.
 */
int horizontal_search(struct matrix *img, struct coords block)
{
    int row = block.h1;

    /* Leading empty rows. */
    while (row < block.h2 && line_is_empty(img, row, block.w1, block.w2)) {
        row++;
    }
    /* First run of non-empty rows. */
    while (row < block.h2 && !line_is_empty(img, row, block.w1, block.w2)) {
        row++;
    }

    return (row == block.h2) ? 0 : row;
}
Beispiel #3
0
//the main functions :
/*
 * Detect the text blocks of an image.
 *
 * The function first computes the bounding box of the non-empty content by
 * trimming empty rows/columns from each side, then runs the recursive block
 * detection (vertical_rec) on a filtered private copy of the image, so `img`
 * itself is never modified here.
 *
 * img - source image; width, height and data are read.
 *
 * Returns the vector produced by vertical_rec(), or NULL when the image has
 * no pixels, when the bounding box is degenerate (invalid image), or when the
 * working copy cannot be allocated.
 */
struct vector *img_to_blocks(struct matrix *img)
{
    /* The scans below use "dimension - 1" on unsigned values; a zero
     * dimension would wrap around and walk far outside the image. */
    if (img->width == 0 || img->height == 0)
        return NULL;

    const size_t last_row = img->height - 1;
    const size_t last_col = img->width - 1;
    struct coords init;
    struct vector *output;
    size_t i;

    //all the text is within the square ((w1,h1),(w2,h2))
    for (i = 0; i < last_row && line_is_empty(img, i, 0, last_col); i++) {}
    init.h1 = i;
    for (i = last_row; i > 0 && line_is_empty(img, i, 0, last_col); i--) {}
    init.h2 = i;
    for (i = 0; i < last_col && column_is_empty(img, i, init.h1, init.h2); i++) {}
    init.w1 = i;
    for (i = last_col; i > 0 && column_is_empty(img, i, init.h1, init.h2); i--) {}
    init.w2 = i;

    //we create a working copy of the image in which the blocks stand out
    const size_t pixel_count = img->width * img->height;
    struct matrix *M = malloc(sizeof *M);
    if (M == NULL)
        return NULL;
    /* Size by the element type of data instead of assuming double. */
    M->data = malloc(pixel_count * sizeof *M->data);
    if (M->data == NULL) {
        free(M);
        return NULL;
    }
    M->width = img->width;
    M->height = img->height;
    for (size_t px = 0; px < pixel_count; px++)
    {
        M->data[px] = img->data[px];
    }

    /* Five noise-filter passes; the contrast filter follows the 2nd and 4th. */
    for (size_t pass = 0; pass < 5; pass++)
    {
        filter_noise(M);
        if (pass % 2)
            filter_contrast(M);
    }

    //we launch the block detection in that original block
    if (init.w1 < init.w2 && init.h1 < init.h2)
        output = vertical_rec(M, init, 1);
    else
        output = NULL; //invalid image.

    free(M->data);
    free(M);
    return output;
}
Beispiel #4
0
/*
 * Entry point: count the reads in a FASTQ file and guess its quality-score
 * encoding.
 *
 * Arguments handled here:
 *   -h            print USAGE through die()
 *   -q            quiet: do not print the quality range / format to stderr
 *   -o reads      print the number of reads to stdout (default)
 *   -o format     print the guessed format ("sanger", "solexa" or "?")
 *   -B <size>     size in bytes of the line buffer handed to fgets()
 *   <file>        input file; stdin is used when no file is given
 *
 * Returns 0 on normal completion. Error paths go through die(), which is
 * presumably fatal (its definition is not visible here).
 */
int main(int argc, char *argv[]) {

  size_t buffer_size = BUFFER_SIZE_DEFAULT;

  // Read arguments
  FILE *infile = stdin;
  int quiet = 0;
  char output = 'r';
  // Remembers which option is still waiting for its value ('\0' = none).
  char read_opt = '\0';
  int i;
  for (i = 1; i < argc; i++) {
    if (strcmp(argv[i], "-h") == 0) {
      die(USAGE);
    } else if (strcmp(argv[i], "-q") == 0) {
      quiet = 1;
    } else if (strcmp(argv[i], "-o") == 0) {
      read_opt = 'o';
    } else if (strcmp(argv[i], "-B") == 0) {
      read_opt = 'B';
    } else if (read_opt == 'o') {
      if (strcmp(argv[i], "reads") == 0) {
        output = 'r';
      } else if (strcmp(argv[i], "format") == 0) {
        output = 'f';
      } else {
        die("Invalid -o output format \"%s\"", argv[i]);
      }
      read_opt = '\0';
    } else if (read_opt == 'B') {
      if (! is_int(argv[i])) {
        die("Invalid buffer size: \"%s\"", argv[i]);
      }
      // strtol() has defined behavior on overflow, unlike atoi().
      long requested_size = strtol(argv[i], NULL, 10);
      // fgets() needs room for at least one character plus the terminator;
      // with a smaller buffer it would never consume any input. This also
      // rejects negative values, which would wrap to a huge size_t.
      if (requested_size < 2) {
        die("Invalid buffer size: \"%s\"", argv[i]);
      }
      buffer_size = (size_t)requested_size;
      read_opt = '\0';
    } else if (infile == stdin) {
      infile = fopen(argv[i], "r");
      // Test the return value: errno may hold a stale non-zero value from an
      // earlier call and is only meaningful after a reported failure.
      if (infile == NULL) {
        die("\"%s\"", argv[i]);
      }
    } else {
      //TODO: allow any number of input files
      die("Can only process one file argument");
    }
  }

  // The min/max quality values are needed for the stderr report (unless
  // quiet) and for the "format" output.
  int get_extremes = 1;
  if (quiet && output != 'f') {
    get_extremes = 0;
  }

  /*TODO: This assumes that there will be at least as many quality scores as there are sequence
   *      bases. According to Dan, we can't make that assumption.
   *      Then what do we do to tell when the quality lines have ended?
   *      Ideas for disambiguating:
   *      1. If len(qual) >= len(seq), it's a HEADER (If we've already seen enough
   *         quality values to cover the read, the QUAL lines must be over.)
   *      2. If the line plus the observed quality values so far is longer than the
   *         read, it must be a HEADER line.
   *      3. No FASTQ format uses space characters for quality scores, according to
   *         Wikipedia. If there's a space character in the line, say it's a HEADER line?
   *      But there could still conceivably be a read with truncated quality scores,
   *      followed by a HEADER line that contains no spaces and is short enough to not
   *      exceed the read length.
   *      Conclusion: Just check how BioPython does it:
   *      http://biopython.org/DIST/docs/api/Bio.SeqIO.QualityIO-pysrc.html
   *      Update: BioPython just throws an error if the number of bases != num of quality scores.
   */
  /* Notes on format requirements:
   * Empty lines are allowed, and ignored. If there's an empty line where a sequence or quality line
   * is expected, it's interpreted as 0 base/quality scores.
   * This means you can have zero-length reads.
   * Multi-line sequences (more than 4 lines per read) are allowed.
   * The number of quality scores must be >= the number of sequence bases. If there are missing
   * quality scores, this will likely fail with an error, but it could possibly succeed, giving
   * incorrect results.
   */

  char *line = malloc(buffer_size);
  if (line == NULL) {
    die("Could not allocate a line buffer of %lu bytes", (unsigned long)buffer_size);
  }

  Extremes extremes;
  extremes.max = 0;
  extremes.min = 256;
  long num_reads = 0;
  long seq_len = 0;
  long qual_len = 0;
  State state = HEADER;
  // fgets() reads a line at a time.
  char *result = fgets(line, buffer_size, infile);
  long line_num = 0;
  while (result != NULL) {
    line_num++;
    if (state == HEADER) {
      // Allow empty lines before the header.
      if (! line_is_empty(line)) {
        if (line[0] != '@') {
          die("Line %ld looked like a header line but does not start with \"@\".", line_num);
        }
        num_reads++;
        seq_len = 0;
        // Assume only 1 header line.
        state = SEQ;
      }
    } else if (state == SEQ) {
      if (line[0] == '+') {
        qual_len = 0;
        // End of sequence line comes when we see a line starting with "+".
        state = PLUS;
      } else {
        seq_len += count_chars(line, buffer_size);
      }
    } else if (state == PLUS || state == QUAL) {
      // If the state is PLUS, we already saw the "+" line on the last loop.
      // Assume there's only 1 "+" line, and assume we're now on a quality scores line.
      if (state == QUAL && line[0] == '@') {
        // If we're past the "first" quality scores line and we see one that starts with a "@",
        // that's very suspicious. Allow it, but raise a warning.
        fprintf(stderr, "Warning: Looking for more quality scores on line %ld but it starts with "
                        "\"@\".\nThis might be a header line and there were fewer quality scores "
                        "than bases.\n", line_num);
      }
      state = QUAL;
      if (get_extremes) {
        qual_len += count_chars_and_extremes(line, buffer_size, &extremes);
      } else {
        qual_len += count_chars(line, buffer_size);
      }
      if (qual_len >= seq_len) {
        // End of quality line comes once we've seen enough quality scores to match the sequence line.
        state = HEADER;
        if (qual_len > seq_len) {
          fprintf(stderr, "Warning on line %ld: Counted more quality scores than bases.\n", line_num);
        }
      }
    }
    result = fgets(line, buffer_size, infile);
  }

  char format_guess = '?';
  if (get_extremes) {
    format_guess = guess_quality_format(extremes, num_reads);
  }

  if (!quiet) {
    fprintf(stderr, "Quality score ascii range: %d (%c) to %d (%c)\n",
            extremes.min, (char)extremes.min, extremes.max, (char)extremes.max);
    switch (format_guess) {
      case 'S':
        fprintf(stderr, "Format: Very likely Sanger (offset 33).\n");
        break;
      case 'X':
        fprintf(stderr, "Format: Very likely Solexa (offset 64).\n");
        break;
      case 's':
        fprintf(stderr, "Format: Maybe Sanger? (offset 33)\n");
        break;
      case 'x':
        fprintf(stderr, "Format: Maybe Solexa? (offset 64)\n");
        break;
      case '?':
        fprintf(stderr, "Format: Unknown\n");
        break;
      default:
        // Any other code prints nothing, as before.
        break;
    }
  }

  if (output == 'r') {
    printf("%ld\n", num_reads);
  } else if (output == 'f') {
    switch (format_guess) {
      case 'S':
      case 's':
        printf("sanger\n");
        break;
      case 'X':
      case 'x':
        printf("solexa\n");
        break;
      default:
        printf("?\n");
    }
  }

  free(line);
  fclose(infile);
  return 0;
}