/*=export_func ao_string_tokenize * * what: tokenize an input string * * arg: + char const* + string + string to be tokenized + * * ret_type: token_list_t* * ret_desc: pointer to a structure that lists each token * * doc: * * This function will convert one input string into a list of strings. * The list of strings is derived by separating the input based on * white space separation. However, if the input contains either single * or double quote characters, then the text after that character up to * a matching quote will become the string in the list. * * The returned pointer should be deallocated with @code{free(3C)} when * are done using the data. The data are placed in a single block of * allocated memory. Do not deallocate individual token/strings. * * The structure pointed to will contain at least these two fields: * @table @samp * @item tkn_ct * The number of tokens found in the input string. * @item tok_list * An array of @code{tkn_ct + 1} pointers to substring tokens, with * the last pointer set to NULL. * @end table * * There are two types of quoted strings: single quoted (@code{'}) and * double quoted (@code{"}). Singly quoted strings are fairly raw in that * escape characters (@code{\\}) are simply another character, except when * preceding the following characters: * @example * @code{\\} double backslashes reduce to one * @code{'} incorporates the single quote into the string * @code{\n} suppresses both the backslash and newline character * @end example * * Double quote strings are formed according to the rules of string * constants in ANSI-C programs. * * example: * @example * #include <stdlib.h> * int ix; * token_list_t* ptl = ao_string_tokenize(some_string) * for (ix = 0; ix < ptl->tkn_ct; ix++) * do_something_with_tkn(ptl->tkn_list[ix]); * free(ptl); * @end example * Note that everything is freed with the one call to @code{free(3C)}. * * err: * NULL is returned and @code{errno} will be set to indicate the problem: * @itemize @bullet * @item * @code{EINVAL} - There was an unterminated quoted string. * @item * @code{ENOENT} - The input string was empty. * @item * @code{ENOMEM} - There is not enough memory. * @end itemize =*/ token_list_t* ao_string_tokenize(char const* str) { token_list_t* res = alloc_token_list(str); ch_t* pzDest; /* * Now copy each token into the output buffer. */ if (res == NULL) return res; pzDest = (ch_t*)(res->tkn_list[0]); res->tkn_ct = 0; do { res->tkn_list[ res->tkn_ct++ ] = pzDest; for (;;) { int ch = (ch_t)*str; if (IS_WHITESPACE_CHAR(ch)) { found_white_space: str = SPN_WHITESPACE_CHARS(str+1); break; } switch (ch) { case '"': copy_cooked(&pzDest, &str); if (str == NULL) { free(res); errno = EINVAL; return NULL; } if (IS_WHITESPACE_CHAR(*str)) goto found_white_space; break; case '\'': copy_raw(&pzDest, &str); if (str == NULL) { free(res); errno = EINVAL; return NULL; } if (IS_WHITESPACE_CHAR(*str)) goto found_white_space; break; case NUL: goto copy_done; default: str++; *(pzDest++) = (unsigned char)ch; } } copy_done:; /* * NUL terminate the last token and see if we have any more tokens. */ *(pzDest++) = NUL; } while (*str != NUL); res->tkn_list[ res->tkn_ct ] = NULL; return res; }
/*=export_func ao_string_tokenize * * what: tokenize an input string * * arg: + char const* + string + string to be tokenized + * * ret_type: token_list_t* * ret_desc: pointer to a structure that lists each token * * doc: * * This function will convert one input string into a list of strings. * The list of strings is derived by separating the input based on * white space separation. However, if the input contains either single * or double quote characters, then the text after that character up to * a matching quote will become the string in the list. * * The returned pointer should be deallocated with @code{free(3C)} when * are done using the data. The data are placed in a single block of * allocated memory. Do not deallocate individual token/strings. * * The structure pointed to will contain at least these two fields: * @table @samp * @item tkn_ct * The number of tokens found in the input string. * @item tok_list * An array of @code{tkn_ct + 1} pointers to substring tokens, with * the last pointer set to NULL. * @end table * * There are two types of quoted strings: single quoted (@code{'}) and * double quoted (@code{"}). Singly quoted strings are fairly raw in that * escape characters (@code{\\}) are simply another character, except when * preceding the following characters: * @example * @code{\\} double backslashes reduce to one * @code{'} incorporates the single quote into the string * @code{\n} suppresses both the backslash and newline character * @end example * * Double quote strings are formed according to the rules of string * constants in ANSI-C programs. * * example: * @example * #include <stdlib.h> * int ix; * token_list_t* ptl = ao_string_tokenize( some_string ) * for (ix = 0; ix < ptl->tkn_ct; ix++) * do_something_with_tkn( ptl->tkn_list[ix] ); * free( ptl ); * @end example * Note that everything is freed with the one call to @code{free(3C)}. * * err: * NULL is returned and @code{errno} will be set to indicate the problem: * @itemize @bullet * @item * @code{EINVAL} - There was an unterminated quoted string. * @item * @code{ENOENT} - The input string was empty. * @item * @code{ENOMEM} - There is not enough memory. * @end itemize =*/ token_list_t* ao_string_tokenize( char const* str ) { int max_token_ct = 1; /* allow for trailing NUL on string */ token_list_t* res; if (str == NULL) goto bogus_str; /* * Trim leading white space. Use "ENOENT" and a NULL return to indicate * an empty string was passed. */ while (isspace( (ch_t)*str )) str++; if (*str == NUL) { bogus_str: errno = ENOENT; return NULL; } /* * Take an approximate count of tokens. If no quoted strings are used, * it will be accurate. If quoted strings are used, it will be a little * high and we'll squander the space for a few extra pointers. */ { cc_t* pz = (cc_t*)str; do { max_token_ct++; while (! isspace( *++pz )) if (*pz == NUL) goto found_nul; while (isspace( *pz )) pz++; } while (*pz != NUL); found_nul: ; } res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) ); if (res == NULL) { errno = ENOMEM; return res; } /* * Now copy each token into the output buffer. */ { ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1)); res->tkn_ct = 0; do { res->tkn_list[ res->tkn_ct++ ] = pzDest; for (;;) { int ch = (ch_t)*str; if (isspace( ch )) { found_white_space: while (isspace( (ch_t)*++str )) ; break; } switch (ch) { case '"': copy_cooked( &pzDest, &str ); if (str == NULL) { free(res); errno = EINVAL; return NULL; } if (isspace( (ch_t)*str )) goto found_white_space; break; case '\'': copy_raw( &pzDest, &str ); if (str == NULL) { free(res); errno = EINVAL; return NULL; } if (isspace( (ch_t)*str )) goto found_white_space; break; case NUL: goto copy_done; default: str++; *(pzDest++) = ch; } } copy_done:; /* * NUL terminate the last token and see if we have any more tokens. */ *(pzDest++) = NUL; } while (*str != NUL); res->tkn_list[ res->tkn_ct ] = NULL; } return res; }