/* * parseForProtos() * * Input: filein (output of cpp) * prestring (<optional> string that prefaces each decl; * use NULL to omit) * Return: parsestr (string of function prototypes), or NULL on error * * Notes: * (1) We parse the output of cpp: * cpp -ansi <filein> * Three plans were attempted, with success on the third. * (2) Plan 1. A cursory examination of the cpp output indicated that * every function was preceeded by a cpp comment statement. * So we just need to look at statements beginning after comments. * Unfortunately, this is NOT the case. Some functions start * without cpp comment lines, typically when there are no * comments in the source that immediately precede the function. * (3) Plan 2. Consider the keywords in the language that start * parts of the cpp file. Some, like 'typedef', 'enum', * 'union' and 'struct', are followed after a while by '{', * and eventually end with '}, plus an optional token and a * final ';' Others, like 'extern' and 'static', are never * the beginnings of global function definitions. Function * prototypes have one or more sets of '(' followed eventually * by a ')', and end with ';'. But function definitions have * tokens, followed by '(', more tokens, ')' and then * immediately a '{'. We would generate a prototype from this * by adding a ';' to all tokens up to the ')'. So we use * these special tokens to decide what we are parsing. And * whenever a function definition is found and the prototype * extracted, we skip through the rest of the function * past the corresponding '}'. This token ends a line, and * is often on a line of its own. But as it turns out, * the only keyword we need to consider is 'static'. * (4) Plan 3. Consider the parentheses and braces for various * declarations. A struct, enum, or union has a pair of * braces followed by a semicolon. They cannot have parentheses * before the left brace, but a struct can have lots of parentheses * within the brace set. A function prototype has no braces. * A function declaration can have sets of left and right * parentheses, but these are followed by a left brace. * So plan 3 looks at the way parentheses and braces are * organized. Once the beginning of a function definition * is found, the prototype is extracted and we search for * the ending right brace. * (5) To find the ending right brace, it is necessary to do some * careful parsing. For example, in this file, we have * left and right braces as characters, and these must not * be counted. Somewhat more tricky, the file fhmtauto.c * generates code, and includes a right brace in a string. * So we must not include braces that are in strings. But how * do we know if something is inside a string? Keep state, * starting with not-inside, and every time you hit a double quote * that is not escaped, toggle the condition. Any brace * found in the state of being within a string is ignored. * (6) When a prototype is extracted, it is put in a canonical * form (i.e., cleaned up). Finally, we check that it is * not static and save it. (If static, it is ignored). * (7) The @prestring for unix is NULL; it is included here so that * you can use Microsoft's declaration for importing or * exporting to a dll. See environ.h for examples of use. * Here, we set: @prestring = "LEPT_DLL ". Note in particular * the space character that will separate 'LEPT_DLL' from * the standard unix prototype that follows. */ char * parseForProtos(const char *filein, const char *prestring) { char *strdata, *str, *newstr, *parsestr, *secondword; l_int32 nbytes, start, next, stop, charindex, found; SARRAY *sa, *saout, *satest; PROCNAME("parseForProtos"); if (!filein) return (char *)ERROR_PTR("filein not defined", procName, NULL); /* Read in the cpp output into memory, one string for each * line in the file, omitting blank lines. */ strdata = (char *)arrayRead(filein, &nbytes); sa = sarrayCreateLinesFromString(strdata, 0); saout = sarrayCreate(0); next = 0; while (1) { /* repeat after each non-static prototype is extracted */ searchForProtoSignature(sa, next, &start, &stop, &charindex, &found); if (!found) break; /* fprintf(stderr, " start = %d, stop = %d, charindex = %d\n", start, stop, charindex); */ str = captureProtoSignature(sa, start, stop, charindex); /* Make sure it is not static. Note that 'extern' has * been prepended to the prototype, so the 'static' * keyword, if it exists, would be the second word. */ satest = sarrayCreateWordsFromString(str); secondword = sarrayGetString(satest, 1, 0); if (strcmp(secondword, "static")) { /* not static */ if (prestring) { /* prepend it to the prototype */ newstr = stringJoin(prestring, str); sarrayAddString(saout, newstr, L_INSERT); FREE(str); } else sarrayAddString(saout, str, L_INSERT); } else FREE(str); sarrayDestroy(&satest); skipToEndOfFunction(sa, stop, charindex, &next); if (next == -1) break; } /* Flatten into a string with newlines between prototypes */ parsestr = sarrayToString(saout, 1); FREE(strdata); sarrayDestroy(&sa); sarrayDestroy(&saout); return parsestr; }
/* * parseForProtos() * * Input: filein (output of cpp) * prestring (<optional> string that prefaces each decl; * use NULL to omit) * Return: parsestr (string of function prototypes), or NULL on error * * Notes: * (1) We parse the output of cpp: * cpp -ansi <filein> * Three plans were attempted, with success on the third. * (2) Plan 1. A cursory examination of the cpp output indicated that * every function was preceded by a cpp comment statement. * So we just need to look at statements beginning after comments. * Unfortunately, this is NOT the case. Some functions start * without cpp comment lines, typically when there are no * comments in the source that immediately precede the function. * (3) Plan 2. Consider the keywords in the language that start * parts of the cpp file. Some, like 'typedef', 'enum', * 'union' and 'struct', are followed after a while by '{', * and eventually end with '}, plus an optional token and a * final ';' Others, like 'extern' and 'static', are never * the beginnings of global function definitions. Function * prototypes have one or more sets of '(' followed eventually * by a ')', and end with ';'. But function definitions have * tokens, followed by '(', more tokens, ')' and then * immediately a '{'. We would generate a prototype from this * by adding a ';' to all tokens up to the ')'. So we use * these special tokens to decide what we are parsing. And * whenever a function definition is found and the prototype * extracted, we skip through the rest of the function * past the corresponding '}'. This token ends a line, and * is often on a line of its own. But as it turns out, * the only keyword we need to consider is 'static'. * (4) Plan 3. Consider the parentheses and braces for various * declarations. A struct, enum, or union has a pair of * braces followed by a semicolon. They cannot have parentheses * before the left brace, but a struct can have lots of parentheses * within the brace set. A function prototype has no braces. * A function declaration can have sets of left and right * parentheses, but these are followed by a left brace. * So plan 3 looks at the way parentheses and braces are * organized. Once the beginning of a function definition * is found, the prototype is extracted and we search for * the ending right brace. * (5) To find the ending right brace, it is necessary to do some * careful parsing. For example, in this file, we have * left and right braces as characters, and these must not * be counted. Somewhat more tricky, the file fhmtauto.c * generates code, and includes a right brace in a string. * So we must not include braces that are in strings. But how * do we know if something is inside a string? Keep state, * starting with not-inside, and every time you hit a double quote * that is not escaped, toggle the condition. Any brace * found in the state of being within a string is ignored. * (6) When a prototype is extracted, it is put in a canonical * form (i.e., cleaned up). Finally, we check that it is * not static and save it. (If static, it is ignored). * (7) The @prestring for unix is NULL; it is included here so that * you can use Microsoft's declaration for importing or * exporting to a dll. See environ.h for examples of use. * Here, we set: @prestring = "LEPT_DLL ". Note in particular * the space character that will separate 'LEPT_DLL' from * the standard unix prototype that follows. */ char * parseForProtos(const char *filein, const char *prestring) { char *strdata, *str, *newstr, *parsestr, *secondword; l_int32 start, next, stop, charindex, found; size_t nbytes; SARRAY *sa, *saout, *satest; PROCNAME("parseForProtos"); if (!filein) return (char *)ERROR_PTR("filein not defined", procName, NULL); /* Read in the cpp output into memory, one string for each * line in the file, omitting blank lines. */ strdata = (char *)l_binaryRead(filein, &nbytes); sa = sarrayCreateLinesFromString(strdata, 0); saout = sarrayCreate(0); next = 0; while (1) { /* repeat after each non-static prototype is extracted */ searchForProtoSignature(sa, next, &start, &stop, &charindex, &found); if (!found) break; /* fprintf(stderr, " start = %d, stop = %d, charindex = %d\n", start, stop, charindex); */ str = captureProtoSignature(sa, start, stop, charindex); /* Make sure that the signature found by cpp is neither * static nor extern. We get 'extern' declarations from * header files, and with some versions of cpp running on * #include <sys/stat.h> we get something of the form: * extern ... (( ... )) ... ( ... ) { ... * For this, the 1st '(' is the lp, the 2nd ')' is the rp, * and there is a lot of garbage between the rp and the lb. * It is easiest to simply reject any signature that starts * with 'extern'. Note also that an 'extern' token has been * prepended to each prototype, so the 'static' or * 'extern' keywords we are looking for, if they exist, * would be the second word. */ satest = sarrayCreateWordsFromString(str); secondword = sarrayGetString(satest, 1, L_NOCOPY); if (strcmp(secondword, "static") && /* not static */ strcmp(secondword, "extern")) { /* not extern */ if (prestring) { /* prepend it to the prototype */ newstr = stringJoin(prestring, str); sarrayAddString(saout, newstr, L_INSERT); LEPT_FREE(str); } else { sarrayAddString(saout, str, L_INSERT); } } else { LEPT_FREE(str); } sarrayDestroy(&satest); skipToEndOfFunction(sa, stop, charindex, &next); if (next == -1) break; } /* Flatten into a string with newlines between prototypes */ parsestr = sarrayToString(saout, 1); LEPT_FREE(strdata); sarrayDestroy(&sa); sarrayDestroy(&saout); return parsestr; }