/**
 * Replace the text held by a node's token, feeding the input through
 * myhtml_data_process() / myhtml_data_process_end().
 *
 * When the node has no token, one is taken from the tree's token pool and
 * cleaned before it is attached to the node. The token string is then
 * created (first use), re-created (existing buffer smaller than `length`),
 * or rewound to length 0 for reuse.
 *
 * @param tree      tree providing the token pool and the character allocator
 * @param node      node to update; NULL makes the function return NULL
 * @param text      input bytes
 * @param length    number of bytes in `text`
 * @param encoding  encoding of `text`; values at or above
 *                  MyHTML_ENCODING_LAST_ENTRY are rejected
 * @return pointer to the node's token string, or NULL on a rejected argument
 *         or a failed token allocation
 */
myhtml_string_t * myhtml_node_text_set_with_charef(myhtml_tree_t* tree, myhtml_tree_node_t *node, const char* text, size_t length, myhtml_encoding_t encoding)
{
    if(node == NULL)
        return NULL;

    if(encoding >= MyHTML_ENCODING_LAST_ENTRY)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t mcstatus;
        myhtml_token_node_t *new_token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &mcstatus);

        /* Attach the token only once it is known to be usable, so a failed
         * allocation never leaves the node pointing at an uncleaned token. */
        if(mcstatus || new_token == NULL)
            return NULL;

        myhtml_token_node_clean(new_token);
        node->token = new_token;
    }

    myhtml_string_t *str = &node->token->str;

    /* Same capacity for a first allocation and for a regrow; the original
     * regrow path asked for exactly `length` bytes, unlike the first one. */
    const size_t capacity = length + 2;

    if(str->data == NULL) {
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else if(str->size < length) {
        mchar_async_free(tree->mchar, str->node_idx, str->data);
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else {
        /* Existing buffer is large enough: reuse it from the start. */
        str->length = 0;
    }

    myhtml_data_process_entry_t proc_entry;
    myhtml_data_process_entry_clean(&proc_entry);

    proc_entry.encoding = encoding;
    myhtml_encoding_result_clean(&proc_entry.res);

    myhtml_data_process(&proc_entry, str, text, length);
    myhtml_data_process_end(&proc_entry, str);

    /* The text no longer maps onto a range of the original input buffer. */
    node->token->raw_begin  = 0;
    node->token->raw_length = 0;

    return str;
}
/**
 * Replace the text held by a node's token with a copy of `text`.
 *
 * When the node has no token, one is taken from the tree's token pool and
 * cleaned before it is attached to the node. The token string is then
 * created (first use), re-created (existing buffer smaller than `length`),
 * or rewound to length 0 for reuse. Input that is not UTF-8 is appended
 * through myhtml_string_append_with_convert_encoding(); UTF-8 input is
 * appended as-is.
 *
 * @param tree      tree providing the token pool and the character allocator
 * @param node      node to update; NULL makes the function return NULL
 * @param text      input bytes
 * @param length    number of bytes in `text`
 * @param encoding  encoding of `text`; values at or above
 *                  MyHTML_ENCODING_LAST_ENTRY are rejected
 * @return pointer to the node's token string, or NULL on a rejected argument
 *         or a failed token allocation
 */
myhtml_string_t * myhtml_node_text_set(myhtml_tree_t* tree, myhtml_tree_node_t *node, const char* text, size_t length, myhtml_encoding_t encoding)
{
    if(node == NULL)
        return NULL;

    if(encoding >= MyHTML_ENCODING_LAST_ENTRY)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t mcstatus;
        myhtml_token_node_t *new_token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &mcstatus);

        /* Attach the token only once it is known to be usable, so a failed
         * allocation never leaves the node pointing at an uncleaned token. */
        if(mcstatus || new_token == NULL)
            return NULL;

        myhtml_token_node_clean(new_token);
        node->token = new_token;
    }

    myhtml_string_t *str = &node->token->my_str_tm;

    /* Same capacity for a first allocation and for a regrow; the original
     * regrow path asked for exactly `length` bytes, unlike the first one. */
    const size_t capacity = length + 2;

    if(str->data == NULL) {
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else if(str->size < length) {
        mchar_async_free(tree->mchar, str->node_idx, str->data);
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else {
        /* Existing buffer is large enough: reuse it from the start. */
        str->length = 0;
    }

    if(encoding != MyHTML_ENCODING_UTF_8) {
        myhtml_string_append_with_convert_encoding(str, text, length, encoding);
    }
    else {
        myhtml_string_append(str, text, length);
    }

    /* The token now spans the whole of its own string. */
    node->token->begin  = 0;
    node->token->length = str->length;

    return str;
}
/**
 * Replace the text held by a node's token, feeding the input through
 * myhtml_string_append_charef() / myhtml_string_append_charef_end().
 *
 * When the node has no token, one is taken from the tree's token pool and
 * cleaned before it is attached to the node. The token string is then
 * created (first use), re-created (existing buffer smaller than `length`),
 * or rewound to length 0 for reuse.
 *
 * @param tree      tree providing the token pool and the character allocator
 * @param node      node to update; NULL makes the function return NULL
 * @param text      input bytes
 * @param length    number of bytes in `text`
 * @param encoding  encoding of `text`; values at or above
 *                  MyHTML_ENCODING_LAST_ENTRY are rejected
 * @return pointer to the node's token string, or NULL on a rejected argument
 *         or a failed token allocation
 */
myhtml_string_t * myhtml_node_text_set_with_charef(myhtml_tree_t* tree, myhtml_tree_node_t *node, const char* text, size_t length, myhtml_encoding_t encoding)
{
    if(node == NULL)
        return NULL;

    if(encoding >= MyHTML_ENCODING_LAST_ENTRY)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t mcstatus;
        myhtml_token_node_t *new_token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &mcstatus);

        /* Attach the token only once it is known to be usable, so a failed
         * allocation never leaves the node pointing at an uncleaned token. */
        if(mcstatus || new_token == NULL)
            return NULL;

        myhtml_token_node_clean(new_token);
        node->token = new_token;
    }

    myhtml_string_t *str = &node->token->my_str_tm;

    /* Same capacity for a first allocation and for a regrow; the original
     * regrow path asked for exactly `length` bytes, unlike the first one. */
    const size_t capacity = length + 2;

    if(str->data == NULL) {
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else if(str->size < length) {
        mchar_async_free(tree->mchar, str->node_idx, str->data);
        myhtml_string_init(tree->mchar, tree->mchar_node_id, str, capacity);
    }
    else {
        /* Existing buffer is large enough: reuse it from the start. */
        str->length = 0;
    }

    /* Positional initializer kept exactly as in the original; only the last
     * member (the encoding) is non-zero. */
    myhtml_string_char_ref_chunk_t str_chunk = {0, 0, 0, {0}, false, encoding};
    myhtml_encoding_result_clean(&str_chunk.res);

    myhtml_string_append_charef(&str_chunk, str, text, length);
    myhtml_string_append_charef_end(&str_chunk, str);

    /* The token now spans the whole of its own string. */
    node->token->begin  = 0;
    node->token->length = str->length;

    return str;
}
/*
 * Worker entry point: builds the string data for one queued token.
 *
 *  - text and comment tokens get a fresh string filled from the queued text
 *    range (through the charef variant for RCDATA/CDATA/DATA tokens);
 *  - tokens with attributes get an empty token string and one string per
 *    attribute, holding the lowercased name followed by the value;
 *  - every other token gets an empty string and no attributes.
 *
 * MyHTML_TOKEN_TYPE_DONE is set on the token as the very last step.
 */
void myhtml_parser_worker(mythread_id_t thread_id, mythread_queue_node_t *qnode)
{
    myhtml_token_node_t *tok = qnode->token;
    size_t node_id = qnode->tree->async_args[thread_id].mchar_node_id;

    if(tok->tag_ctx_idx == MyHTML_TAG__TEXT || tok->tag_ctx_idx == MyHTML_TAG__COMMENT)
    {
        myhtml_string_init(qnode->tree->mchar, node_id, &tok->my_str_tm, (qnode->length + 2));

        tok->begin      = tok->my_str_tm.length;
        tok->length     = qnode->length;
        tok->attr_first = NULL;
        tok->attr_last  = NULL;

        if(tok->type & (MyHTML_TOKEN_TYPE_RCDATA | MyHTML_TOKEN_TYPE_CDATA | MyHTML_TOKEN_TYPE_DATA)) {
            tok->length = myhtml_parser_add_text_with_charef(qnode->tree, &tok->my_str_tm, qnode->text, qnode->begin, qnode->length);
        }
        else {
            tok->length = myhtml_parser_add_text(qnode->tree, &tok->my_str_tm, qnode->text, qnode->begin, qnode->length);
        }
    }
    else if(tok->attr_first)
    {
        /* The token itself carries no text. */
        tok->my_str_tm.data     = NULL;
        tok->my_str_tm.mchar    = NULL;
        tok->my_str_tm.node_idx = 0;
        tok->my_str_tm.length   = 0;
        tok->my_str_tm.size     = 0;

        tok->begin  = 0;
        tok->length = 0;

        for(myhtml_token_attr_t *cur = tok->attr_first; cur; cur = cur->next)
        {
            myhtml_string_init(qnode->tree->mchar, node_id, &cur->entry, (cur->name_length + cur->value_length + 8));

            /* name_begin/value_begin switch from offsets into the queued
             * text to offsets into the attribute's own string. */
            if(cur->name_length) {
                size_t src_begin = cur->name_begin;

                cur->name_begin  = cur->entry.length;
                cur->name_length = myhtml_parser_add_text_lowercase(qnode->tree, &cur->entry, qnode->text, src_begin, cur->name_length);
            }

            if(cur->value_length) {
                size_t src_begin = cur->value_begin;

                cur->value_begin  = cur->entry.length;
                cur->value_length = myhtml_parser_add_text_with_charef(qnode->tree, &cur->entry, qnode->text, src_begin, cur->value_length);
            }
        }
    }
    else
    {
        tok->begin      = 0;
        tok->length     = 0;
        tok->attr_first = NULL;
        tok->attr_last  = NULL;

        tok->my_str_tm.data     = NULL;
        tok->my_str_tm.mchar    = NULL;
        tok->my_str_tm.node_idx = 0;
        tok->my_str_tm.length   = 0;
        tok->my_str_tm.size     = 0;
    }

    /* Must stay last: the token is complete only from this point on. */
    tok->type |= MyHTML_TOKEN_TYPE_DONE;
}
/*
 * Worker entry point: turns the raw range of one queued token into strings.
 *
 * The tree's before-token callback (if set) runs first and the after-token
 * callback (if set) runs last, after MyHTML_TOKEN_TYPE_DONE has been set.
 * When the tree has MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN set, no
 * string processing happens between the two callbacks.
 *
 * Otherwise:
 *  - text and comment tokens get a fresh string filled from their raw range
 *    (through the charef variant for DATA/RCDATA/CDATA tokens; DATA tokens
 *    additionally set emit_null_char);
 *  - tokens with attributes get a cleaned token string plus a key and a
 *    value string per attribute;
 *  - every other token gets a cleaned string and no attributes.
 */
void myhtml_parser_worker(mythread_id_t thread_id, mythread_queue_node_t *qnode)
{
    myhtml_tree_t* tree = qnode->tree;
    myhtml_token_node_t* tok = qnode->token;

    /* Read the flag before the callback runs, exactly as the original did. */
    const int skip_processing = (qnode->tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN) != 0;

    if(tree->callback_before_token)
        tree->callback_before_token_ctx = tree->callback_before_token(tree, tok, tree->callback_before_token_ctx);

    if(!skip_processing)
    {
        size_t mchar_node_id = qnode->tree->async_args[thread_id].mchar_node_id;

        if(tok->tag_id == MyHTML_TAG__TEXT || tok->tag_id == MyHTML_TAG__COMMENT)
        {
            myhtml_string_init(tree->mchar, mchar_node_id, &tok->str, (tok->raw_length + 1));

            tok->attr_first = NULL;
            tok->attr_last  = NULL;

            myhtml_data_process_entry_t proc_entry;
            myhtml_data_process_entry_clean(&proc_entry);
            proc_entry.encoding = tree->encoding;

            const int is_data = (tok->type & MyHTML_TOKEN_TYPE_DATA) != 0;

            if(is_data)
                proc_entry.emit_null_char = true;

            if(is_data || (tok->type & (MyHTML_TOKEN_TYPE_RCDATA | MyHTML_TOKEN_TYPE_CDATA))) {
                myhtml_parser_token_data_to_string_charef(tree, &tok->str, &proc_entry, tok->raw_begin, tok->raw_length);
            }
            else {
                myhtml_parser_token_data_to_string(tree, &tok->str, &proc_entry, tok->raw_begin, tok->raw_length);
            }
        }
        else if(tok->attr_first)
        {
            myhtml_string_clean_all(&tok->str);

            /* One processing entry is shared by every attribute of the token. */
            myhtml_data_process_entry_t proc_entry;
            myhtml_data_process_entry_clean(&proc_entry);
            proc_entry.encoding = tree->encoding;

            for(myhtml_token_attr_t *cur = tok->attr_first; cur; cur = cur->next)
            {
                if(cur->raw_key_length) {
                    myhtml_string_init(tree->mchar, mchar_node_id, &cur->key, (cur->raw_key_length + 1));
                    myhtml_parser_token_data_to_string_lowercase(tree, &cur->key, &proc_entry, cur->raw_key_begin, cur->raw_key_length);
                }
                else {
                    myhtml_string_clean_all(&cur->key);
                }

                if(cur->raw_value_length) {
                    myhtml_string_init(tree->mchar, mchar_node_id, &cur->value, (cur->raw_value_length + 1));

                    /* Never reset here, so it stays set for later attributes. */
                    proc_entry.is_attributes = true;
                    myhtml_parser_token_data_to_string_charef(tree, &cur->value, &proc_entry, cur->raw_value_begin, cur->raw_value_length);
                }
                else {
                    myhtml_string_clean_all(&cur->value);
                }
            }
        }
        else
        {
            tok->attr_first = NULL;
            tok->attr_last  = NULL;

            myhtml_string_clean_all(&tok->str);
        }
    }

    /* Mark the token complete before handing it to the after-callback. */
    tok->type |= MyHTML_TOKEN_TYPE_DONE;

    if(tree->callback_after_token)
        tree->callback_after_token_ctx = tree->callback_after_token(tree, tok, tree->callback_after_token_ctx);
}
/*
 * Worker entry point: builds the string data for one queued token.
 *
 *  - text and comment tokens get a fresh string: a single '\0' byte for
 *    NULL tokens, otherwise the queued text range (through the charef
 *    variant for DATA/RCDATA/CDATA tokens);
 *  - tokens with attributes get an empty token string and one string per
 *    attribute, holding the lowercased name followed by the value; each
 *    attribute's namespace is set to MyHTML_NAMESPACE_HTML;
 *  - every other token gets an empty string and no attributes.
 *
 * MyHTML_TOKEN_TYPE_DONE is set on the token as the very last step.
 */
void myhtml_parser_worker(mythread_id_t thread_id, mythread_queue_node_t *qnode)
{
    myhtml_token_node_t *tok = qnode->token;
    size_t node_id = qnode->tree->async_args[thread_id].mchar_node_id;

    if(tok->tag_ctx_idx == MyHTML_TAG__TEXT || tok->tag_ctx_idx == MyHTML_TAG__COMMENT)
    {
        myhtml_string_init(qnode->tree->mchar, node_id, &tok->my_str_tm, (qnode->length + 4));

        tok->begin      = tok->my_str_tm.length;
        tok->length     = qnode->length;
        tok->attr_first = NULL;
        tok->attr_last  = NULL;

        /*
         * For NULL token; a NULL token contains only one char == \0.
         * The further processing may be changed (in rules processing)
         * to 'REPLACEMENT CHARACTER' (U+FFFD).
         */
        if(tok->type & MyHTML_TOKEN_TYPE_NULL) {
            tok->length            = 1;
            tok->my_str_tm.length  = 1;
            tok->my_str_tm.data[0] = '\0';
        }
        else if(tok->type & (MyHTML_TOKEN_TYPE_DATA | MyHTML_TOKEN_TYPE_RCDATA | MyHTML_TOKEN_TYPE_CDATA)) {
            tok->length = myhtml_parser_add_text_with_charef(qnode->tree, &tok->my_str_tm, qnode->text, qnode->begin, qnode->length, false);
        }
        else {
            tok->length = myhtml_parser_add_text(qnode->tree, &tok->my_str_tm, qnode->text, qnode->begin, qnode->length);
        }
    }
    else if(tok->attr_first)
    {
        /* The token itself carries no text. */
        tok->my_str_tm.data     = NULL;
        tok->my_str_tm.mchar    = NULL;
        tok->my_str_tm.node_idx = 0;
        tok->my_str_tm.length   = 0;
        tok->my_str_tm.size     = 0;

        tok->begin  = 0;
        tok->length = 0;

        for(myhtml_token_attr_t *cur = tok->attr_first; cur; cur = cur->next)
        {
            myhtml_string_init(qnode->tree->mchar, node_id, &cur->entry, (cur->name_length + cur->value_length + 8));

            /* name_begin/value_begin switch from offsets into the queued
             * text to offsets into the attribute's own string. */
            if(cur->name_length) {
                size_t src_begin = cur->name_begin;

                cur->name_begin  = cur->entry.length;
                cur->name_length = myhtml_parser_add_text_lowercase(qnode->tree, &cur->entry, qnode->text, src_begin, cur->name_length);
            }

            if(cur->value_length) {
                size_t src_begin = cur->value_begin;

                cur->value_begin  = cur->entry.length;
                cur->value_length = myhtml_parser_add_text_with_charef(qnode->tree, &cur->entry, qnode->text, src_begin, cur->value_length, true);
            }

            cur->my_namespace = MyHTML_NAMESPACE_HTML;
        }
    }
    else
    {
        tok->begin      = 0;
        tok->length     = 0;
        tok->attr_first = NULL;
        tok->attr_last  = NULL;

        tok->my_str_tm.data     = NULL;
        tok->my_str_tm.mchar    = NULL;
        tok->my_str_tm.node_idx = 0;
        tok->my_str_tm.length   = 0;
        tok->my_str_tm.size     = 0;
    }

    /* Must stay last: the token is complete only from this point on. */
    tok->type |= MyHTML_TOKEN_TYPE_DONE;
}