/* * parseForProtos() * * Input: filein (output of cpp) * prestring (<optional> string that prefaces each decl; * use NULL to omit) * Return: parsestr (string of function prototypes), or NULL on error * * Notes: * (1) We parse the output of cpp: * cpp -ansi <filein> * Three plans were attempted, with success on the third. * (2) Plan 1. A cursory examination of the cpp output indicated that * every function was preceeded by a cpp comment statement. * So we just need to look at statements beginning after comments. * Unfortunately, this is NOT the case. Some functions start * without cpp comment lines, typically when there are no * comments in the source that immediately precede the function. * (3) Plan 2. Consider the keywords in the language that start * parts of the cpp file. Some, like 'typedef', 'enum', * 'union' and 'struct', are followed after a while by '{', * and eventually end with '}, plus an optional token and a * final ';' Others, like 'extern' and 'static', are never * the beginnings of global function definitions. Function * prototypes have one or more sets of '(' followed eventually * by a ')', and end with ';'. But function definitions have * tokens, followed by '(', more tokens, ')' and then * immediately a '{'. We would generate a prototype from this * by adding a ';' to all tokens up to the ')'. So we use * these special tokens to decide what we are parsing. And * whenever a function definition is found and the prototype * extracted, we skip through the rest of the function * past the corresponding '}'. This token ends a line, and * is often on a line of its own. But as it turns out, * the only keyword we need to consider is 'static'. * (4) Plan 3. Consider the parentheses and braces for various * declarations. A struct, enum, or union has a pair of * braces followed by a semicolon. They cannot have parentheses * before the left brace, but a struct can have lots of parentheses * within the brace set. A function prototype has no braces. * A function declaration can have sets of left and right * parentheses, but these are followed by a left brace. * So plan 3 looks at the way parentheses and braces are * organized. Once the beginning of a function definition * is found, the prototype is extracted and we search for * the ending right brace. * (5) To find the ending right brace, it is necessary to do some * careful parsing. For example, in this file, we have * left and right braces as characters, and these must not * be counted. Somewhat more tricky, the file fhmtauto.c * generates code, and includes a right brace in a string. * So we must not include braces that are in strings. But how * do we know if something is inside a string? Keep state, * starting with not-inside, and every time you hit a double quote * that is not escaped, toggle the condition. Any brace * found in the state of being within a string is ignored. * (6) When a prototype is extracted, it is put in a canonical * form (i.e., cleaned up). Finally, we check that it is * not static and save it. (If static, it is ignored). * (7) The @prestring for unix is NULL; it is included here so that * you can use Microsoft's declaration for importing or * exporting to a dll. See environ.h for examples of use. * Here, we set: @prestring = "LEPT_DLL ". Note in particular * the space character that will separate 'LEPT_DLL' from * the standard unix prototype that follows. */ char * parseForProtos(const char *filein, const char *prestring) { char *strdata, *str, *newstr, *parsestr, *secondword; l_int32 nbytes, start, next, stop, charindex, found; SARRAY *sa, *saout, *satest; PROCNAME("parseForProtos"); if (!filein) return (char *)ERROR_PTR("filein not defined", procName, NULL); /* Read in the cpp output into memory, one string for each * line in the file, omitting blank lines. */ strdata = (char *)arrayRead(filein, &nbytes); sa = sarrayCreateLinesFromString(strdata, 0); saout = sarrayCreate(0); next = 0; while (1) { /* repeat after each non-static prototype is extracted */ searchForProtoSignature(sa, next, &start, &stop, &charindex, &found); if (!found) break; /* fprintf(stderr, " start = %d, stop = %d, charindex = %d\n", start, stop, charindex); */ str = captureProtoSignature(sa, start, stop, charindex); /* Make sure it is not static. Note that 'extern' has * been prepended to the prototype, so the 'static' * keyword, if it exists, would be the second word. */ satest = sarrayCreateWordsFromString(str); secondword = sarrayGetString(satest, 1, 0); if (strcmp(secondword, "static")) { /* not static */ if (prestring) { /* prepend it to the prototype */ newstr = stringJoin(prestring, str); sarrayAddString(saout, newstr, L_INSERT); FREE(str); } else sarrayAddString(saout, str, L_INSERT); } else FREE(str); sarrayDestroy(&satest); skipToEndOfFunction(sa, stop, charindex, &next); if (next == -1) break; } /* Flatten into a string with newlines between prototypes */ parsestr = sarrayToString(saout, 1); FREE(strdata); sarrayDestroy(&sa); sarrayDestroy(&saout); return parsestr; }
/* * cleanProtoSignature() * * Input: instr (input prototype string) * Return: cleanstr (clean prototype string), or NULL on error * * Notes: * (1) Adds 'extern' at beginning and regularizes spaces * between tokens. */ static char * cleanProtoSignature(char *instr) { char *str, *cleanstr; char buf[L_BUF_SIZE]; char externstring[] = "extern"; l_int32 i, j, nwords, nchars, index, len; SARRAY *sa, *saout; PROCNAME("cleanProtoSignature"); if (!instr) return (char *)ERROR_PTR("instr not defined", procName, NULL); sa = sarrayCreateWordsFromString(instr); nwords = sarrayGetCount(sa); saout = sarrayCreate(0); sarrayAddString(saout, externstring, 1); for (i = 0; i < nwords; i++) { str = sarrayGetString(sa, i, 0); nchars = strlen(str); index = 0; for (j = 0; j < nchars; j++) { if (index > L_BUF_SIZE - 6) return (char *)ERROR_PTR("token too large", procName, NULL); if (str[j] == '(') { buf[index++] = ' '; buf[index++] = '('; buf[index++] = ' '; } else if (str[j] == ')') { buf[index++] = ' '; buf[index++] = ')'; } else buf[index++] = str[j]; } buf[index] = '\0'; sarrayAddString(saout, buf, 1); } /* Flatten to a prototype string with spaces added after * each word, and remove the last space */ cleanstr = sarrayToString(saout, 2); len = strlen(cleanstr); cleanstr[len - 1] = '\0'; sarrayDestroy(&sa); sarrayDestroy(&saout); return cleanstr; }
/*! * \brief l_getIndexFromFile() * * \param[in] filename * \param[out] pindex found index * \return 0 if found, 1 on error. */ static l_int32 l_getIndexFromFile(const char *filename, l_int32 *pindex) { char buf[256]; char *word; FILE *fp; l_int32 notfound, format; SARRAY *sa; PROCNAME("l_getIndexFromFile"); if (!pindex) return ERROR_INT("&index not defined", procName, 1); *pindex = 0; if (!filename) return ERROR_INT("filename not defined", procName, 1); /* Open the stream, read lines until you find one with more * than a newline, and grab the first word. */ if ((fp = fopenReadStream(filename)) == NULL) return ERROR_INT("stream not opened", procName, 1); do { if ((fgets(buf, sizeof(buf), fp)) == NULL) { fclose(fp); return ERROR_INT("fgets read fail", procName, 1); } } while (buf[0] == '\n'); fclose(fp); sa = sarrayCreateWordsFromString(buf); word = sarrayGetString(sa, 0, L_NOCOPY); /* Find the index associated with the word. If it is not * found, test to see if the file is a compressed pix. */ notfound = l_getIndexFromStructname(word, pindex); sarrayDestroy(&sa); if (notfound) { /* maybe a Pix */ if (findFileFormat(filename, &format) == 0) { l_getIndexFromStructname("Pix", pindex); } else { return ERROR_INT("no file type identified", procName, 1); } } return 0; }
int main(int argc, char **argv) { l_int32 ignore; size_t nbytesin, nbytesout; char *infile, *instring, *outstring; SARRAY *sa1, *sa2, *sa3, *sa4, *sa5; char buf[256]; static char mainName[] = "string_reg"; if (argc != 2) return ERROR_INT(" Syntax: string_reg infile", mainName, 1); infile = argv[1]; instring = (char *)l_binaryRead(infile, &nbytesin); if (!instring) return ERROR_INT("file not read", mainName, 1); sa1 = sarrayCreateWordsFromString(instring); sa2 = sarrayCreateLinesFromString(instring, 0); sa3 = sarrayCreateLinesFromString(instring, 1); outstring = sarrayToString(sa1, 0); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk1.txt", "w", outstring, nbytesout); lept_free(outstring); outstring = sarrayToString(sa1, 1); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk2.txt", "w", outstring, nbytesout); lept_free(outstring); outstring = sarrayToString(sa2, 0); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk3.txt", "w", outstring, nbytesout); lept_free(outstring); outstring = sarrayToString(sa2, 1); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk4.txt", "w", outstring, nbytesout); lept_free(outstring); outstring = sarrayToString(sa3, 0); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk5.txt", "w", outstring, nbytesout); lept_free(outstring); outstring = sarrayToString(sa3, 1); nbytesout = strlen(outstring); l_binaryWrite("/tmp/junk6.txt", "w", outstring, nbytesout); lept_free(outstring); sprintf(buf, "diff -s /tmp/junk6.txt %s", infile); ignore = system(buf); /* write/read/write; compare /tmp/junkout5 with /tmp/junkout6 */ sarrayWrite("/tmp/junk7.txt", sa2); sarrayWrite("/tmp/junk8.txt", sa3); sa4 = sarrayRead("/tmp/junk8.txt"); sarrayWrite("/tmp/junk9.txt", sa4); sa5 = sarrayRead("/tmp/junk9.txt"); ignore = system("diff -s /tmp/junk8.txt /tmp/junk9.txt"); sarrayDestroy(&sa1); sarrayDestroy(&sa2); sarrayDestroy(&sa3); sarrayDestroy(&sa4); sarrayDestroy(&sa5); lept_free(instring); return 0; }
/* * parseForProtos() * * Input: filein (output of cpp) * prestring (<optional> string that prefaces each decl; * use NULL to omit) * Return: parsestr (string of function prototypes), or NULL on error * * Notes: * (1) We parse the output of cpp: * cpp -ansi <filein> * Three plans were attempted, with success on the third. * (2) Plan 1. A cursory examination of the cpp output indicated that * every function was preceded by a cpp comment statement. * So we just need to look at statements beginning after comments. * Unfortunately, this is NOT the case. Some functions start * without cpp comment lines, typically when there are no * comments in the source that immediately precede the function. * (3) Plan 2. Consider the keywords in the language that start * parts of the cpp file. Some, like 'typedef', 'enum', * 'union' and 'struct', are followed after a while by '{', * and eventually end with '}, plus an optional token and a * final ';' Others, like 'extern' and 'static', are never * the beginnings of global function definitions. Function * prototypes have one or more sets of '(' followed eventually * by a ')', and end with ';'. But function definitions have * tokens, followed by '(', more tokens, ')' and then * immediately a '{'. We would generate a prototype from this * by adding a ';' to all tokens up to the ')'. So we use * these special tokens to decide what we are parsing. And * whenever a function definition is found and the prototype * extracted, we skip through the rest of the function * past the corresponding '}'. This token ends a line, and * is often on a line of its own. But as it turns out, * the only keyword we need to consider is 'static'. * (4) Plan 3. Consider the parentheses and braces for various * declarations. A struct, enum, or union has a pair of * braces followed by a semicolon. They cannot have parentheses * before the left brace, but a struct can have lots of parentheses * within the brace set. A function prototype has no braces. * A function declaration can have sets of left and right * parentheses, but these are followed by a left brace. * So plan 3 looks at the way parentheses and braces are * organized. Once the beginning of a function definition * is found, the prototype is extracted and we search for * the ending right brace. * (5) To find the ending right brace, it is necessary to do some * careful parsing. For example, in this file, we have * left and right braces as characters, and these must not * be counted. Somewhat more tricky, the file fhmtauto.c * generates code, and includes a right brace in a string. * So we must not include braces that are in strings. But how * do we know if something is inside a string? Keep state, * starting with not-inside, and every time you hit a double quote * that is not escaped, toggle the condition. Any brace * found in the state of being within a string is ignored. * (6) When a prototype is extracted, it is put in a canonical * form (i.e., cleaned up). Finally, we check that it is * not static and save it. (If static, it is ignored). * (7) The @prestring for unix is NULL; it is included here so that * you can use Microsoft's declaration for importing or * exporting to a dll. See environ.h for examples of use. * Here, we set: @prestring = "LEPT_DLL ". Note in particular * the space character that will separate 'LEPT_DLL' from * the standard unix prototype that follows. */ char * parseForProtos(const char *filein, const char *prestring) { char *strdata, *str, *newstr, *parsestr, *secondword; l_int32 start, next, stop, charindex, found; size_t nbytes; SARRAY *sa, *saout, *satest; PROCNAME("parseForProtos"); if (!filein) return (char *)ERROR_PTR("filein not defined", procName, NULL); /* Read in the cpp output into memory, one string for each * line in the file, omitting blank lines. */ strdata = (char *)l_binaryRead(filein, &nbytes); sa = sarrayCreateLinesFromString(strdata, 0); saout = sarrayCreate(0); next = 0; while (1) { /* repeat after each non-static prototype is extracted */ searchForProtoSignature(sa, next, &start, &stop, &charindex, &found); if (!found) break; /* fprintf(stderr, " start = %d, stop = %d, charindex = %d\n", start, stop, charindex); */ str = captureProtoSignature(sa, start, stop, charindex); /* Make sure that the signature found by cpp is neither * static nor extern. We get 'extern' declarations from * header files, and with some versions of cpp running on * #include <sys/stat.h> we get something of the form: * extern ... (( ... )) ... ( ... ) { ... * For this, the 1st '(' is the lp, the 2nd ')' is the rp, * and there is a lot of garbage between the rp and the lb. * It is easiest to simply reject any signature that starts * with 'extern'. Note also that an 'extern' token has been * prepended to each prototype, so the 'static' or * 'extern' keywords we are looking for, if they exist, * would be the second word. */ satest = sarrayCreateWordsFromString(str); secondword = sarrayGetString(satest, 1, L_NOCOPY); if (strcmp(secondword, "static") && /* not static */ strcmp(secondword, "extern")) { /* not extern */ if (prestring) { /* prepend it to the prototype */ newstr = stringJoin(prestring, str); sarrayAddString(saout, newstr, L_INSERT); LEPT_FREE(str); } else { sarrayAddString(saout, str, L_INSERT); } } else { LEPT_FREE(str); } sarrayDestroy(&satest); skipToEndOfFunction(sa, stop, charindex, &next); if (next == -1) break; } /* Flatten into a string with newlines between prototypes */ parsestr = sarrayToString(saout, 1); LEPT_FREE(strdata); sarrayDestroy(&sa); sarrayDestroy(&saout); return parsestr; }
/*! * \brief sudokuReadFile() * * \param[in] filename of formatted sudoku file * \return array of 81 numbers, or NULL on error * * <pre> * Notes: * (1) The file format has: * * any number of comment lines beginning with '#' * * a set of 9 lines, each having 9 digits (0-9) separated * by a space * </pre> */ l_int32 * sudokuReadFile(const char *filename) { char *str, *strj; l_uint8 *data; l_int32 i, j, nlines, val, index, error; l_int32 *array; size_t size; SARRAY *saline, *sa1, *sa2; PROCNAME("sudokuReadFile"); if (!filename) return (l_int32 *)ERROR_PTR("filename not defined", procName, NULL); data = l_binaryRead(filename, &size); sa1 = sarrayCreateLinesFromString((char *)data, 0); sa2 = sarrayCreate(9); /* Filter out the comment lines; verify that there are 9 data lines */ nlines = sarrayGetCount(sa1); for (i = 0; i < nlines; i++) { str = sarrayGetString(sa1, i, L_NOCOPY); if (str[0] != '#') sarrayAddString(sa2, str, L_COPY); } LEPT_FREE(data); sarrayDestroy(&sa1); nlines = sarrayGetCount(sa2); if (nlines != 9) { sarrayDestroy(&sa2); L_ERROR("file has %d lines\n", procName, nlines); return (l_int32 *)ERROR_PTR("invalid file", procName, NULL); } /* Read the data into the array, verifying that each data * line has 9 numbers. */ error = FALSE; array = (l_int32 *)LEPT_CALLOC(81, sizeof(l_int32)); for (i = 0, index = 0; i < 9; i++) { str = sarrayGetString(sa2, i, L_NOCOPY); saline = sarrayCreateWordsFromString(str); if (sarrayGetCount(saline) != 9) { error = TRUE; sarrayDestroy(&saline); break; } for (j = 0; j < 9; j++) { strj = sarrayGetString(saline, j, L_NOCOPY); if (sscanf(strj, "%d", &val) != 1) error = TRUE; else array[index++] = val; } sarrayDestroy(&saline); if (error) break; } sarrayDestroy(&sa2); if (error) { LEPT_FREE(array); return (l_int32 *)ERROR_PTR("invalid data", procName, NULL); } return array; }