/**
 * Return the document s2 re-encoded as UTF-8.
 *
 * The parser cursor s is (re)initialised from s2 and, when the input starts
 * with an <?xml ...?> processing instruction, its pseudo-attributes are
 * scanned for encoding="...".  If an encoding was found, s2 is converted from
 * it; otherwise s2 is kept when it already checks out as UTF-8 and is
 * converted from ISO-8859-1 when it does not.  Whenever a conversion yields
 * an empty string, s2 is returned unchanged.
 */
string
xml_html_parser::transcode (string s2) {
  s= parse_string (s2);
  string encoding;

  if (test (s, "<?")) {
    s += 2;
    string pi_target= parse_name ();
    skip_space ();
    // Note: html==true means horribly broken HTML is accepted, so seeing an
    // XML prolog is deliberately NOT treated as a reason to clear that flag.
    if (pi_target == "xml") {
      for (;;) {
        if (!s || test (s, "?>")) break;
        string key= parse_name ();
        skip_space ();
        // every pseudo-attribute must look like name = value
        if (!test (s, "=")) break;
        s += 1;
        skip_space ();
        string value;
        if (test (s, "\"")) {
          s += 1;
          value= parse_until ("\"");
          skip_space ();
        }
        else if (test (s, "'")) {
          s += 1;
          value= parse_until ("'");
          skip_space ();
        }
        if (key == "encoding") {
          encoding= upcase_all (value);
          break;
        }
      }
    }
  }

  if (N(encoding) != 0) {
    // The document declared its encoding: trust it.
    string converted= convert (s2, encoding, "UTF-8");
    if (N(converted) != 0) return converted;
    // conversion from the declared charset failed: keep the input (and pray)
    return s2;
  }

  // No declaration: guess.  Input that already looks like UTF-8 is kept.
  if (check_encoding (s2, "UTF-8")) return s2;

  string from_latin1= convert (s2, "ISO-8859-1", "UTF-8");
  if (N(from_latin1) != 0) return from_latin1;
  return s2;
}
/**
 * Parse a value delimited by double or single quotes at the cursor.
 *
 * When the cursor is on a quote character, that character is skipped and the
 * result of parse_until on the same quote character is returned.  When the
 * cursor is not on a quote, the cursor is left alone and "" is returned.
 */
string
xml_html_parser::parse_quoted () {
  const bool dquote= test (s, "\"");
  if (!dquote && !test (s, "'")) return "";
  s += 1;
  if (dquote) return parse_until ("\"");
  return parse_until ("'");
}
/**
 * Read instructions from `stream` until listench() reports 0 and link them
 * into a chain through indexes[0].
 *
 * For each instruction the parsable text is copied into a freshly allocated
 * buffer that is handed to init_inst().  Bracketed constructs ("?...(", "[",
 * "{") have their body parsed with parse_until() and stored as branch 1.
 *
 * @param stream  input stream positioned at the first instruction
 * @return the first instruction of the chain; NULL if the placeholder head
 *         node could not be allocated.  If the stream yields no instruction
 *         the result is NULL as well.
 */
ss_inst *init_insts(stream_t *stream)
{
    long IP, new_IP;
    new_IP = IP = getpos(stream);

    /*
     * Placeholder node whose indexes[0] receives the real first instruction.
     * It is zero-initialised so that an empty stream returns NULL instead of
     * an indeterminate pointer and free_inst() never sees garbage fields.
     */
    ss_inst *head = calloc(1, sizeof *head);
    if (head == NULL)
        return NULL;
    ss_inst *ptr = head;

    while (listench(stream)) {
        /* Branch slots start out empty on every iteration so an unset slot
         * is never copied as an uninitialised or stale pointer. */
        ss_inst *indexes[4] = { NULL, NULL, NULL, NULL };
        char type;

        int len = get_parsable_length(stream);
        if (len < 0)
            break;

        char *buf = calloc((size_t)len + 1, sizeof(char));
        if (buf == NULL)
            break; /* out of memory: return what has been built so far */
        getstr(stream, len, buf);

        /* Last character of the token; '\0' for an empty token so that
         * buf[-1] is never read. */
        char last = (len > 0) ? buf[len - 1] : '\0';

        if (last == '(' && *buf == '?') {
            type = '(';
            indexes[1] = parse_until(stream, ')');
        } else if (*buf == '[') {
            type = '[';
            indexes[1] = parse_until(stream, ']');
        } else if (*buf == '{') {
            type = '{';
            indexes[1] = parse_until(stream, '}');
        } else if (*buf == '?') {
            type = '?';
            if (last == '(') {
                indexes[1] = parse_until(stream, ')'); /* {expr} */
            } else if (strchr(RETN, last)) {
                move_stream(stream, 1);
                new_IP += get_parsable_length(stream);
                move_stream(stream, -1);
            }
            if (listench(stream) == '[')
                type = 'w';
            move_stream(stream, new_IP - IP - 1);
            // TODO conseq & alt and loop body
        } else {
            type = 0;
        }

        indexes[0] = init_inst(type, buf);
        if (type) {
            for (int i = 1; i < indexes[0]->branch_no; i++)
                indexes[0]->indexes[i] = indexes[i];
            move_stream(stream, new_IP - IP - 1);
        }

        /* Append to the chain. */
        ptr->indexes[0] = indexes[0];
        ptr = ptr->indexes[0];
    }

    ss_inst *retn = head->indexes[0];
    /* Detach the chain before releasing the placeholder so the returned
     * instructions cannot be released together with it.
     * NOTE(review): free_inst() is not visible here; confirm whether it
     * follows indexes[]. */
    head->indexes[0] = NULL;
    free_inst(head);
    return retn;
}
/**
 * Parse a processing instruction and return tuple ("pi", target, content).
 *
 * The first two characters at the cursor are skipped (presumably the "<?"
 * opener -- the caller is expected to have checked for it), then the target
 * name is read, whitespace is skipped, and everything up to "?>" is taken as
 * the content.
 */
tree
xml_html_parser::parse_pi () {
  s += 2;
  string target= parse_name ();
  skip_space ();
  string content= parse_until ("?>");
  return tuple ("pi", target, content);
}
/**
 * Parse a closing tag and return tuple ("end", name).
 *
 * The first two characters at the cursor are skipped (presumably "</"), the
 * tag name is read, and whatever precedes the next ">" is parsed and thrown
 * away.
 */
tree
xml_html_parser::parse_closing () {
  s += 2;
  string tag= parse_name ();
  // the text between the name and ">" is of no interest
  (void) parse_until (">");
  tree result= tuple ("end", tag);
  return result;
}
/*
 * Fill argv[0 .. maxarg-1] with arguments parsed from *s.
 *
 * Before each argument skip_spaces() is applied.  An argument that starts
 * with a double or single quote is read with parse_delimited_text(); any
 * other argument is read with parse_until() using " \t\n" as the stop set.
 * Exactly maxarg entries are written; the loop does not stop early.
 */
static void parse_argv(const char **s, const char *argv[], size_t maxarg)
{
    size_t slot = 0;

    while (slot < maxarg) {
        skip_spaces(s);

        const char first = **s;
        const int quoted = (first == '"' || first == '\'');

        if (quoted) {
            argv[slot] = parse_delimited_text(s);
        } else {
            argv[slot] = parse_until(s, " \t\n");
        }
        slot++;
    }
}
static inline #endif PsycParseIndexRC psyc_parse_index (PsycParseIndexState *state, PsycString *idx) { ParseRC ret; if (state->cursor >= state->buffer.length) return PSYC_PARSE_INDEX_END; state->startc = state->cursor; switch (state->part) { case PSYC_INDEX_PART_START: case PSYC_INDEX_PART_TYPE: idx->length = 0; idx->data = NULL; switch (state->buffer.data[state->cursor]) { case '#': state->part = PSYC_INDEX_PART_LIST; ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_INSUFFICIENT); goto PSYC_INDEX_PART_LIST; case '.': state->part = PSYC_INDEX_PART_DICT; ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_INSUFFICIENT); goto PSYC_INDEX_PART_STRUCT; case '{': state->part = PSYC_INDEX_PART_DICT; ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_INSUFFICIENT); goto PSYC_INDEX_PART_DICT_LENGTH; default: return PSYC_PARSE_INDEX_ERROR_TYPE; } case PSYC_INDEX_PART_LIST: PSYC_INDEX_PART_LIST: switch (parse_length((ParseState*)state, &idx->length)) { case PARSE_SUCCESS: // list index is complete state->part = PSYC_INDEX_PART_TYPE; return PSYC_PARSE_INDEX_LIST; case PARSE_INSUFFICIENT: // list index at the end of buffer return PSYC_PARSE_INDEX_LIST_LAST; case PARSE_ERROR: // no index return PSYC_PARSE_INDEX_ERROR_LIST; default: // should not be reached return PSYC_PARSE_INDEX_ERROR; } case PSYC_INDEX_PART_STRUCT: PSYC_INDEX_PART_STRUCT: switch (parse_keyword((ParseState*)state, idx)) { case PARSE_SUCCESS: // end of keyword state->part = PSYC_INDEX_PART_TYPE; return PSYC_PARSE_INDEX_STRUCT; case PARSE_INSUFFICIENT: // end of buffer return PSYC_PARSE_INDEX_STRUCT_LAST; case PARSE_ERROR: // no keyword return PSYC_PARSE_INDEX_ERROR_STRUCT; default: // should not be reached return PSYC_PARSE_INDEX_ERROR; } case PSYC_INDEX_PART_DICT_LENGTH: PSYC_INDEX_PART_DICT_LENGTH: switch (parse_length((ParseState*)state, &state->elemlen)) { case PARSE_SUCCESS: // length is complete state->elemlen_found = 1; state->elem_parsed = 0; idx->length = state->elemlen; idx->data = NULL; if (state->buffer.data[state->cursor] != ' ') return 
PSYC_PARSE_INDEX_ERROR_DICT_LENGTH; state->part = PSYC_INDEX_PART_DICT; ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_INSUFFICIENT); break; case PARSE_INSUFFICIENT: // length is incomplete return PSYC_PARSE_DICT_INSUFFICIENT; case PARSE_ERROR: // no length state->part = PSYC_INDEX_PART_DICT; break; default: // should not be reached return PSYC_PARSE_INDEX_ERROR; } // fall thru case PSYC_INDEX_PART_DICT: if (state->elemlen_found) { switch (parse_binary((ParseState*)state, state->elemlen, idx, &state->elem_parsed)) { case PARSE_SUCCESS: if (idx->length == state->elem_parsed) ret = PSYC_PARSE_INDEX_DICT; else ret = PSYC_PARSE_INDEX_DICT_END; break; case PARSE_INCOMPLETE: if (idx->length == state->elem_parsed) ret = PSYC_PARSE_INDEX_DICT_START; else ret = PSYC_PARSE_INDEX_DICT_CONT; break; default: // should not be reached return PSYC_PARSE_INDEX_ERROR_DICT; } } else { switch (parse_until((ParseState*)state, '}', idx)) { case PARSE_SUCCESS: ret = PSYC_PARSE_INDEX_DICT; break; case PARSE_INSUFFICIENT: return PSYC_PARSE_INDEX_INSUFFICIENT; default: // should not be reached return PSYC_PARSE_INDEX_ERROR_DICT; } } state->part = PSYC_INDEX_PART_TYPE; state->cursor++; return ret; } return PSYC_PARSE_INDEX_ERROR; // should not be reached }
/**
 * Parse dictionary.
 *
 * dict		= [ type ] *dict-item
 * dict-item	= "{" ( dict-key / length SP OCTET) "}"
 *		  ( type [ SP dict-value ] / [ length ] [ ":" type ] [ SP *OCTET ] )
 * dict-key	= %x00-7C / %x7E-FF	; any byte except "}"
 * dict-value	= %x00-7A / %x7C-FF	; any byte except "{"
 *
 * NOTE(review): the code below expects '=' in front of the value type, which
 * the grammar above does not show -- confirm which one is authoritative.
 *
 * Each call parses one piece of the dictionary starting at state->cursor and
 * reports through the return code what was stored in type or elem;
 * state->part records where the next call resumes.
 *
 * @param state  parser state; cursor, startc, part and the elemlen fields
 *               are updated
 * @param type   receives the dict type or a value type
 * @param elem   receives a key or a value
 * @return PSYC_PARSE_DICT_END when the cursor is at or past the end of the
 *         buffer on entry, otherwise one of the PSYC_PARSE_DICT_* codes.
 */
PsycParseDictRC
psyc_parse_dict (PsycParseDictState *state, PsycString *type, PsycString *elem)
{
    ParseRC ret;

    if (state->cursor >= state->buffer.length)
        return PSYC_PARSE_DICT_END;

    state->startc = state->cursor;

    switch (state->part) {
    case PSYC_DICT_PART_START:
        type->length = elem->length = 0;
        type->data = elem->data = NULL;

        state->part = PSYC_DICT_PART_TYPE;
        // fall thru

    case PSYC_DICT_PART_TYPE:
        switch (parse_keyword((ParseState*)state, type)) {
        case PARSE_SUCCESS: // end of keyword
            state->part = PSYC_DICT_PART_KEY_START;
            return PSYC_PARSE_DICT_TYPE;
        case PARSE_INSUFFICIENT: // end of buffer
            return PSYC_PARSE_DICT_END;
        case PARSE_ERROR: // no keyword
            state->part = PSYC_DICT_PART_KEY_START;
            break;
        default: // should not be reached
            return PSYC_PARSE_DICT_ERROR;
        }
        // fall thru

    case PSYC_DICT_PART_KEY_START:
        if (state->buffer.data[state->cursor] != '{')
            return PSYC_PARSE_DICT_ERROR_KEY_START;

        type->length = elem->length = 0;
        type->data = elem->data = NULL;

        state->elem_parsed = 0;
        state->elemlen_found = 0;

        state->part = PSYC_DICT_PART_KEY_LENGTH;
        ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_INSUFFICIENT);
        // fall thru

    case PSYC_DICT_PART_KEY_LENGTH:
        switch (parse_length((ParseState*)state, &state->elemlen)) {
        case PARSE_SUCCESS: // length is complete
            state->elemlen_found = 1;
            state->elem_parsed = 0;
            elem->length = state->elemlen;
            elem->data = NULL;

            // a length must be followed by a space
            if (state->buffer.data[state->cursor] != ' ')
                return PSYC_PARSE_DICT_ERROR_KEY_LENGTH;

            state->part = PSYC_DICT_PART_KEY;
            ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_INSUFFICIENT);
            break;
        case PARSE_INSUFFICIENT: // length is incomplete
            return PSYC_PARSE_DICT_INSUFFICIENT;
        case PARSE_ERROR: // no length
            state->part = PSYC_DICT_PART_KEY;
            break;
        default: // should not be reached
            return PSYC_PARSE_DICT_ERROR;
        }
        // fall thru

    case PSYC_DICT_PART_KEY:
        if (state->elemlen_found) {
            // key with explicit length: read exactly elemlen bytes
            switch (parse_binary((ParseState*)state, state->elemlen, elem,
                                 &state->elem_parsed)) {
            case PARSE_SUCCESS:
                if (elem->length == state->elem_parsed)
                    ret = PSYC_PARSE_DICT_KEY;
                else
                    ret = PSYC_PARSE_DICT_KEY_END;
                break;
            case PARSE_INCOMPLETE:
                if (elem->length == state->elem_parsed)
                    ret = PSYC_PARSE_DICT_KEY_START;
                else
                    ret = PSYC_PARSE_DICT_KEY_CONT;
                break;
            default: // should not be reached
                return PSYC_PARSE_DICT_ERROR;
            }
        } else {
            // key without length: read up to the closing brace
            switch (parse_until((ParseState*)state, '}', elem)) {
            case PARSE_SUCCESS:
                ret = PSYC_PARSE_DICT_KEY;
                break;
            case PARSE_INSUFFICIENT:
                return PSYC_PARSE_DICT_INSUFFICIENT;
            default: // should not be reached
                return PSYC_PARSE_DICT_ERROR;
            }
        }

        state->part = PSYC_DICT_PART_VALUE_START;
        state->startc = state->cursor;
        return ret;

    case PSYC_DICT_PART_VALUE_START:
        /*
         * The key must be terminated by '}'.  This has to be an `if`: written
         * as `switch (cond) return ...;` the return has no case label and is
         * never executed, so the check would silently be skipped.
         */
        if (state->buffer.data[state->cursor] != '}')
            return PSYC_PARSE_DICT_ERROR_VALUE_START;

        type->length = elem->length = 0;
        type->data = elem->data = NULL;

        state->elem_parsed = 0;
        state->elemlen_found = 0;

        state->part = PSYC_DICT_PART_VALUE_TYPE;
        ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_VALUE_LAST);
        // fall thru

    case PSYC_DICT_PART_VALUE_TYPE:
        if (state->buffer.data[state->cursor] == '=') {
            ADVANCE_CURSOR_OR_RETURN(PSYC_PARSE_DICT_INSUFFICIENT);

            switch (parse_keyword((ParseState*)state, type)) {
            case PARSE_SUCCESS:
                switch (state->buffer.data[state->cursor]) {
                case ':': // a length follows the type
                    state->part = PSYC_DICT_PART_VALUE_LENGTH;
                    ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_VALUE_LAST);
                    break;
                case ' ': // the value follows the type directly
                    state->part = PSYC_DICT_PART_VALUE;
                    ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_VALUE_LAST);
                    goto PSYC_DICT_PART_VALUE;
                case '{': // no value, the next key starts here
                    state->part = PSYC_DICT_PART_KEY_START;
                    return PSYC_PARSE_DICT_VALUE;
                default:
                    return PSYC_PARSE_DICT_ERROR_VALUE_TYPE;
                }
                break;
            case PARSE_INSUFFICIENT: // end of buffer
                return PSYC_PARSE_DICT_VALUE_LAST;
            case PARSE_ERROR:
                return PSYC_PARSE_DICT_ERROR_VALUE_TYPE;
            default: // should not be reached
                return PSYC_PARSE_DICT_ERROR;
            }
        }
        // fall thru

    case PSYC_DICT_PART_VALUE_LENGTH:
        switch (parse_length((ParseState*)state, &state->elemlen)) {
        case PARSE_SUCCESS: // length is complete
            state->elemlen_found = 1;
            state->elem_parsed = 0;
            elem->length = state->elemlen;
            elem->data = NULL;
            break;
        case PARSE_INSUFFICIENT: // length is incomplete
            return PSYC_PARSE_DICT_INSUFFICIENT;
        case PARSE_ERROR: // no length
            break;
        default: // should not be reached
            return PSYC_PARSE_DICT_ERROR;
        }

        switch (state->buffer.data[state->cursor]) {
        case ' ':
            state->part = PSYC_DICT_PART_VALUE;
            ADVANCE_STARTC_OR_RETURN(PSYC_PARSE_DICT_VALUE_LAST);
            break;
        case '{':
            state->part = PSYC_DICT_PART_KEY_START;
            return PSYC_PARSE_DICT_VALUE;
        default:
            return PSYC_PARSE_DICT_ERROR_VALUE_LENGTH;
        }
        // fall thru

    case PSYC_DICT_PART_VALUE:
    PSYC_DICT_PART_VALUE:
        if (state->elemlen_found) {
            // value with explicit length: read exactly elemlen bytes
            switch (parse_binary((ParseState*)state, state->elemlen, elem,
                                 &state->elem_parsed)) {
            case PARSE_SUCCESS:
                if (elem->length == state->elem_parsed)
                    ret = PSYC_PARSE_DICT_VALUE;
                else
                    ret = PSYC_PARSE_DICT_VALUE_END;
                break;
            case PARSE_INCOMPLETE:
                if (elem->length == state->elem_parsed)
                    ret = PSYC_PARSE_DICT_VALUE_START;
                else
                    ret = PSYC_PARSE_DICT_VALUE_CONT;
                break;
            default: // should not be reached
                return PSYC_PARSE_DICT_ERROR;
            }
        } else {
            // value without length: read up to the start of the next key
            switch (parse_until((ParseState*)state, '{', elem)) {
            case PARSE_SUCCESS:
                ret = PSYC_PARSE_DICT_VALUE;
                break;
            case PARSE_INSUFFICIENT:
                return PSYC_PARSE_DICT_VALUE_LAST;
            default: // should not be reached
                return PSYC_PARSE_DICT_ERROR;
            }
        }

        state->part = PSYC_DICT_PART_KEY_START;
        return ret;
    }

    return PSYC_PARSE_DICT_ERROR; // should not be reached
}
/**
 * Parse a notation declaration and return tuple ("notation", body).
 *
 * Ten characters are skipped at the cursor (presumably the "<!NOTATION"
 * opener) and the body is whatever parse_until (">") yields.
 */
tree
xml_html_parser::parse_notation () {
  s += 10;
  string body= parse_until (">");
  return tuple ("notation", body);
}
/**
 * Parse an attribute-list declaration and return tuple ("attlist", body).
 *
 * Nine characters are skipped at the cursor (presumably the "<!ATTLIST"
 * opener) and the body is whatever parse_until (">") yields.
 */
tree
xml_html_parser::parse_attlist () {
  s += 9;
  string body= parse_until (">");
  return tuple ("attlist", body);
}
/**
 * Parse an element declaration and return tuple ("element", body).
 *
 * Nine characters are skipped at the cursor (presumably the "<!ELEMENT"
 * opener) and the body is whatever parse_until (">") yields.
 */
tree
xml_html_parser::parse_element () {
  s += 9;
  string body= parse_until (">");
  return tuple ("element", body);
}
/**
 * Parse a CDATA section and return tuple ("cdata", text).
 *
 * Nine characters are skipped at the cursor (presumably the "<![CDATA["
 * opener) and the text is whatever parse_until ("]]>") yields.
 */
tree
xml_html_parser::parse_cdata () {
  s += 9;
  string text= parse_until ("]]>");
  return tuple ("cdata", text);
}
/**
 * Parse a comment and return tuple ("comment", text).
 *
 * Four characters are skipped at the cursor (presumably the "<!--" opener)
 * and the text is whatever parse_until ("-->") yields.
 */
tree
xml_html_parser::parse_comment () {
  s += 4;
  string text= parse_until ("-->");
  return tuple ("comment", text);
}
/*
 * Parse a file name from *s.
 *
 * Leading spaces are skipped first.  A name that starts with a double or
 * single quote is read with parse_delimited_text(); otherwise the name is
 * read with parse_until() using "\n" as the stop set.
 */
static char *parse_filename(const char **s)
{
    skip_spaces(s);

    const char first = **s;
    const int quoted = (first == '"' || first == '\'');

    return quoted ? parse_delimited_text(s) : parse_until(s, "\n");
}
/*
 * Parse a shell command from *s: skip leading spaces, then return what
 * parse_until() reads with "\n" as the stop set.
 */
static char *parse_shellcmd(const char **s)
{
    char *command;

    skip_spaces(s);
    command = parse_until(s, "\n");
    return command;
}