Example #1
0
/*=export_func ao_string_tokenize
 *
 * what: tokenize an input string
 *
 * arg:  + char const* + string + string to be tokenized +
 *
 * ret_type:  token_list_t*
 * ret_desc:  pointer to a structure that lists each token
 *
 * doc:
 *
 * This function will convert one input string into a list of strings.
 * The list of strings is derived by separating the input based on
 * white space separation.  However, if the input contains either single
 * or double quote characters, then the text after that character up to
 * a matching quote will become the string in the list.
 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
 *  allocated memory.  Do not deallocate individual token/strings.
 *
 *  The structure pointed to will contain at least these two fields:
 *  @table @samp
 *  @item tkn_ct
 *  The number of tokens found in the input string.
 *  @item tkn_list
 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 *  the last pointer set to NULL.
 *  @end table
 *
 * There are two types of quoted strings: single quoted (@code{'}) and
 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 * escape characters (@code{\\}) are simply another character, except when
 * preceding the following characters:
 * @example
 * @code{\\}  double backslashes reduce to one
 * @code{'}   incorporates the single quote into the string
 * @code{\n}  suppresses both the backslash and newline character
 * @end example
 *
 * Double quote strings are formed according to the rules of string
 * constants in ANSI-C programs.
 *
 * example:
 * @example
 *    #include <stdlib.h>
 *    int ix;
 *    token_list_t* ptl = ao_string_tokenize(some_string);
 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 *       do_something_with_tkn(ptl->tkn_list[ix]);
 *    free(ptl);
 * @end example
 * Note that everything is freed with the one call to @code{free(3C)}.
 *
 * err:
 *  NULL is returned and @code{errno} will be set to indicate the problem:
 *  @itemize @bullet
 *  @item
 *  @code{EINVAL} - There was an unterminated quoted string.
 *  @item
 *  @code{ENOENT} - The input string was empty.
 *  @item
 *  @code{ENOMEM} - There is not enough memory.
 *  @end itemize
=*/
token_list_t*
ao_string_tokenize(char const* str)
{
    /*
     *  Split "str" into white space separated tokens, treating single and
     *  double quoted spans specially, and return them in the block obtained
     *  from alloc_token_list().
     *
     *  str:     the text to tokenize.
     *  returns: the filled-in token list, or NULL.  A NULL from
     *           alloc_token_list() is passed straight back (errno is not
     *           touched here in that case -- presumably the allocator has
     *           set it; confirm against alloc_token_list).  NULL with errno
     *           set to EINVAL is returned when copy_cooked()/copy_raw()
     *           report failure by setting the source pointer to NULL.
     */
    token_list_t* res = alloc_token_list(str);
    ch_t* pzDest;

    if (res == NULL)
        return res;

    /*
     *  "str" was handed to alloc_token_list() by value, so it still points
     *  at any leading white space.  Step over it here; otherwise the first
     *  pass of the loop below would record an empty token before reaching
     *  the first real one.
     */
    str = SPN_WHITESPACE_CHARS(str);

    /*
     *  Now copy each token into the output buffer.  The first slot of the
     *  token list is read as the initial write position for token text.
     */
    pzDest = (ch_t*)(res->tkn_list[0]);
    res->tkn_ct  = 0;

    do  {
        /* Each pass of the outer loop starts a new token at the cursor. */
        res->tkn_list[ res->tkn_ct++ ] = pzDest;

        for (;;) {
            int ch = (ch_t)*str;

            if (IS_WHITESPACE_CHAR(ch)) {
            found_white_space:
                /* End of token: step over the whole run of white space. */
                str = SPN_WHITESPACE_CHARS(str+1);
                break;
            }

            switch (ch) {
            case '"':
            case '\'':
                /*
                 *  Quoted span: the helper appends to the output and
                 *  advances both cursors.  Double quotes are "cooked",
                 *  single quotes are "raw".
                 */
                if (ch == '"')
                    copy_cooked(&pzDest, &str);
                else
                    copy_raw(&pzDest, &str);

                if (str == NULL) {
                    /* free() first so it cannot disturb the errno we set. */
                    free(res);
                    errno = EINVAL;
                    return NULL;
                }
                if (IS_WHITESPACE_CHAR(*str))
                    goto found_white_space;
                break;

            case NUL:
                goto copy_done;

            default:
                str++;
                *(pzDest++) = (unsigned char)ch;
            }
        } copy_done:;

        /*
         * NUL terminate the last token and see if we have any more tokens.
         */
        *(pzDest++) = NUL;
    } while (*str != NUL);

    /* The pointer list itself is NULL terminated. */
    res->tkn_list[ res->tkn_ct ] = NULL;

    return res;
}
Example #2
0
/*=export_func ao_string_tokenize
 *
 * what: tokenize an input string
 *
 * arg:  + char const* + string + string to be tokenized +
 *
 * ret_type:  token_list_t*
 * ret_desc:  pointer to a structure that lists each token
 *
 * doc:
 *
 * This function will convert one input string into a list of strings.
 * The list of strings is derived by separating the input based on
 * white space separation.  However, if the input contains either single
 * or double quote characters, then the text after that character up to
 * a matching quote will become the string in the list.
 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
 *  allocated memory.  Do not deallocate individual token/strings.
 *
 *  The structure pointed to will contain at least these two fields:
 *  @table @samp
 *  @item tkn_ct
 *  The number of tokens found in the input string.
 *  @item tkn_list
 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 *  the last pointer set to NULL.
 *  @end table
 *
 * There are two types of quoted strings: single quoted (@code{'}) and
 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 * escape characters (@code{\\}) are simply another character, except when
 * preceding the following characters:
 * @example
 * @code{\\}  double backslashes reduce to one
 * @code{'}   incorporates the single quote into the string
 * @code{\n}  suppresses both the backslash and newline character
 * @end example
 *
 * Double quote strings are formed according to the rules of string
 * constants in ANSI-C programs.
 *
 * example:
 * @example
 *    #include <stdlib.h>
 *    int ix;
 *    token_list_t* ptl = ao_string_tokenize( some_string );
 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 *       do_something_with_tkn( ptl->tkn_list[ix] );
 *    free( ptl );
 * @end example
 * Note that everything is freed with the one call to @code{free(3C)}.
 *
 * err:
 *  NULL is returned and @code{errno} will be set to indicate the problem:
 *  @itemize @bullet
 *  @item
 *  @code{EINVAL} - There was an unterminated quoted string.
 *  @item
 *  @code{ENOENT} - The input string was empty.
 *  @item
 *  @code{ENOMEM} - There is not enough memory.
 *  @end itemize
=*/
token_list_t*
ao_string_tokenize( char const* str )
{
    int max_token_ct = 1; /* allow for trailing NUL on string */
    token_list_t* res;

    if (str == NULL)  goto bogus_str;

    /*
     *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
     *  an empty string was passed.
     */
    while (isspace( (ch_t)*str ))  str++;
    if (*str == NUL) {
    bogus_str:
        errno = ENOENT;
        return NULL;
    }

    /*
     *  Take an approximate count of tokens.  If no quoted strings are used,
     *  it will be accurate.  If quoted strings are used, it will be a little
     *  high and we'll squander the space for a few extra pointers.
     */
    {
        cc_t* pz = (cc_t*)str;

        do {
            max_token_ct++;
            while (! isspace( *++pz ))
                if (*pz == NUL) goto found_nul;
            while (isspace( *pz ))  pz++;
        } while (*pz != NUL);

    found_nul:
        ;
    }

    res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
    if (res == NULL) {
        errno = ENOMEM;
        return res;
    }

    /*
     *  Now copy each token into the output buffer.
     */
    {
        ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
        res->tkn_ct  = 0;

        do  {
            res->tkn_list[ res->tkn_ct++ ] = pzDest;
            for (;;) {
                int ch = (ch_t)*str;
                if (isspace( ch )) {
                found_white_space:
                    while (isspace( (ch_t)*++str ))  ;
                    break;
                }

                switch (ch) {
                case '"':
                    copy_cooked( &pzDest, &str );
                    if (str == NULL) {
                        free(res);
                        errno = EINVAL;
                        return NULL;
                    }
                    if (isspace( (ch_t)*str ))
                        goto found_white_space;
                    break;

                case '\'':
                    copy_raw( &pzDest, &str );
                    if (str == NULL) {
                        free(res);
                        errno = EINVAL;
                        return NULL;
                    }
                    if (isspace( (ch_t)*str ))
                        goto found_white_space;
                    break;

                case NUL:
                    goto copy_done;

                default:
                    str++;
                    *(pzDest++) = ch;
                }
            } copy_done:;

            /*
             * NUL terminate the last token and see if we have any more tokens.
             */
            *(pzDest++) = NUL;
        } while (*str != NUL);

        res->tkn_list[ res->tkn_ct ] = NULL;
    }

    return res;
}