Beispiel #1
0
static PyObject *constituents(PyObject *self, PyObject *args) {
    Dictionary    dict;
    Parse_Options opts;
    Sentence      sent;
    Linkage       linkage;
    CNode *       cn;

    /// Link counts
    int   num_linkages;


    const char *text;

    PyObject *output_list;

    if (!PyArg_ParseTuple(args, "s", &text))
        return NULL;

    opts = parse_options_create();
    parse_options_set_verbosity(opts, -1);

    setlocale(LC_ALL, "");
    dict = dictionary_create_default_lang();

    if (!dict) {
        PyErr_SetString(PyExc_RuntimeError, "Fatal error: Unable to open the dictionary");
        Py_INCREF(Py_None);
        return Py_None;
    }

    sent = sentence_create(text, dict);
    sentence_split(sent, opts);
    num_linkages = sentence_parse(sent, opts);

    if (num_linkages > 0) {
        linkage = linkage_create(0, sent, opts);

        cn = linkage_constituent_tree(linkage);
        output_list = build_tree(cn, linkage);
        if(output_list == Py_None) {
            Py_INCREF(output_list);
            return output_list;
        }
        linkage_free_constituent_tree(cn);

        linkage_delete(linkage);
    } else {
        sentence_delete(sent);
        dictionary_delete(dict);
        parse_options_delete(opts);

        Py_INCREF(Py_None);
        return Py_None;
    }
    sentence_delete(sent);
    dictionary_delete(dict);
    parse_options_delete(opts);

    return Py_BuildValue("S", output_list);
}
Beispiel #2
0
int sentence_parse(Sentence sent, Parse_Options opts)
{
	int rc;

	sent->num_valid_linkages = 0;

	/* If the sentence has not yet been split, do so now.
	 * This is for backwards compatibility, for existing programs
	 * that do not explicitly call the splitter.
	 */
	if (0 == sent->length)
	{
		rc = sentence_split(sent, opts);
		if (rc) return -1;
	}
	else
	{
		/* During a panic parse, we enter here a second time, with leftover
		 * garbage. Free it. We really should make the code that is panicking
		 * do this free, but right now, they have no API for it, so we do it
		 * as a favor. XXX FIXME someday. */
		free_sentence_disjuncts(sent);
	}

	/* Check for bad sentence length */
	if (MAX_SENTENCE <= sent->length)
	{
		prt_error("Error: sentence too long, contains more than %d words\n",
			MAX_SENTENCE);
		return -2;
	}

	resources_reset(opts->resources);

	/* Expressions were set up during the tokenize stage.
	 * Prune them, and then parse.
	 */
	expression_prune(sent, opts);
	print_time(opts, "Finished expression pruning");
	if (opts->use_sat_solver)
	{
		sat_parse(sent, opts);
	}
	else
	{
		classic_parse(sent, opts);
	}
	print_time(opts, "Finished parse");

	if ((verbosity > 0) &&
	   (PARSE_NUM_OVERFLOW < sent->num_linkages_found))
	{
		prt_error("Warning: Combinatorial explosion! nulls=%zu cnt=%d\n"
			"Consider retrying the parse with the max allowed disjunct cost set lower.\n"
			"At the command line, use !cost-max\n",
			sent->null_count, sent->num_linkages_found);
	}
	return sent->num_valid_linkages;
}
Beispiel #3
0
int sentence_parse(Sentence sent, Parse_Options opts)
{
	int rc;

	verbosity = opts->verbosity;
	debug = opts->debug;
	test = opts->test;

	sent->num_valid_linkages = 0;

	/* If the sentence has not yet been split, do so now.
	 * This is for backwards compatibility, for existing programs
	 * that do not explicitly call the splitter.
	 */
	if (0 == sent->length)
	{
		rc = sentence_split(sent, opts);
		if (rc) return -1;
	}

	/* Check for bad sentence length */
	if (MAX_SENTENCE <= sent->length)
	{
		prt_error("Error: sentence too long, contains more than %d words\n",
			MAX_SENTENCE);
		return -2;
	}

	/* Initialize/free any leftover garbage */
	free_sentence_disjuncts(sent);  /* Is this really needed ??? */
	resources_reset_space(opts->resources);

	if (resources_exhausted(opts->resources))
		return 0;

	/* Expressions were previously set up during the tokenize stage. */
	expression_prune(sent);
	print_time(opts, "Finished expression pruning");
	if (opts->use_sat_solver)
	{
		sat_parse(sent, opts);
	}
	else
	{
		chart_parse(sent, opts);
	}
	print_time(opts, "Finished parse");

	if ((verbosity > 0) &&
	   (PARSE_NUM_OVERFLOW < sent->num_linkages_found))
	{
		prt_error("WARNING: Combinatorial explosion! nulls=%zu cnt=%d\n"
			"Consider retrying the parse with the max allowed disjunct cost set lower.\n"
			"At the command line, use !cost-max\n",
			sent->null_count, sent->num_linkages_found);
	}
	return sent->num_valid_linkages;
}
Beispiel #4
0
// **************************************************************************
// HTTPヘッダを受信して解析する。
//
// 処理するのはGETのみ。GET以外のメソッドが来たらエラー
// 今のところ、URIとuser_agent、Range、Hostを解析。
// URIは、URIデコードもやる。
//
//	return: 0 		正常終了
//	return: 0以外 	エラー
// **************************************************************************
static int http_header_receive(int accept_socket, HTTP_RECV_INFO *http_recv_info_p)
{
	int result = 0;
	int	recv_len;
	unsigned char	line_buf[1024];	// 大きめに。
	unsigned char 	work_buf[1024];
	unsigned char 	work_buf2[1024];
	unsigned char 	split1[1024];
	unsigned char 	split2[1024];

	int		ret;
	int		i;
	int		j;

	// ================================
	// 1行づつ HTTPヘッダを受信
	// ================================
	for (i=0;;i++)
	{
		// 1行受信 実行。
		memset(line_buf, '\0', sizeof(line_buf));
		recv_len = line_receive(accept_socket, line_buf, sizeof(line_buf));

		// debug. 受信したヘッダ表示
		debug_log_output("'%s'(%d byte)\n", line_buf, recv_len );

		// 受信した内容をチェック。
		if ( i != 0 && recv_len == 0 ) // 空行検知。ヘッダ受信終了。
		{
			break;
		}
		else if ( recv_len < 0 ) // 受信失敗
		{
			return ( -1 );
		}

		// --------------------------
		// GETメッセージチェック
		// --------------------------
		if ( i == 0 ) // 1行目のみチェック
		{
			debug_log_output("URI Check start.'%s'\n", line_buf);

			// GET/POSTある?
			if (strncmp(line_buf, "GET ", 4) && strncmp(line_buf, "POST ", 5))
			{
				debug_log_output("'GET' or 'POST' not found. error.");
				return ( -1 );
			}
			strncpy(http_recv_info_p->request_method, line_buf, sizeof(http_recv_info_p->request_method));
			cut_after_character(http_recv_info_p->request_method, ' ');

			// 最初のスペースまでを削除。
			cut_before_character(line_buf, ' ');

			// 次にスペースが出てきたところの後ろまでを削除。
			cut_after_character(line_buf, ' ');

			// ===========================
			// GETオプション部解析
			// ===========================

			// REQUEST_URI用・Proxy用に値を保存
			strncpy(http_recv_info_p->request_uri, line_buf, sizeof(http_recv_info_p->request_uri));

			// '?'が存在するかチェック。
			if ( strchr(line_buf, '?') != NULL )
			{

				strncpy(work_buf, line_buf, sizeof(work_buf));

				// '?'より前をカット
				cut_before_character(work_buf, '?' );
				debug_log_output("work_buf = '%s'", work_buf );

				while ( 1 )
				{
					memset(split1, '\0', sizeof(split1));
					memset(split2, '\0', sizeof(split2));

					uri_decode(split1, sizeof(split1), work_buf, sizeof(work_buf) );
					strcpy(work_buf, split1);

					// 最初に登場する'&'で分割
					ret = sentence_split(work_buf, '&', split1, split2 );
					if ( ret == 0 ) // 分割成功
					{
						strncpy(work_buf, split2, sizeof(work_buf));
					}
					else if (strlen(work_buf) > 0) // まだwork_bufに中身ある?
					{
						strncpy( split1, work_buf, sizeof(split1));
						strncpy( work_buf, "", sizeof(work_buf));
					}
					else // 処理終了
					{
						break;
					}

					// -------------------------------------
					// GETした内容 解析開始
					// 超安直。いいのかこんな比較で。
					// -------------------------------------

					// URIデコード
					// uri_decode(work_buf2, sizeof(work_buf2), split1, sizeof(split1) );
					strcpy(work_buf2, split1);

					// "page="あるか調査。
					if (strncasecmp( work_buf2, "page=", strlen("page=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');
						if ( strlen(work_buf2) > 0 )
						{
							// 構造体に値を保存。
							http_recv_info_p->page = atoi(work_buf2);
						}

						continue;
					}

					// "menupage="あるか調査。
					if (strncasecmp( work_buf2, "menupage=", strlen("menupage=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');
						if ( strlen(work_buf2) > 0 )
						{
							// 構造体に値を保存。
							http_recv_info_p->menupage = atoi(work_buf2);
						}

						continue;
					}

					// "title=" DVD title selection
					if (strncasecmp( work_buf2, "title=", strlen("title=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');
						if ( strlen(work_buf2) > 0 )
						{
							// 構造体に値を保存。
							http_recv_info_p->title = atoi(work_buf2);
						}

						continue;
					}

					if (strncasecmp( work_buf2, "width=", strlen("width=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');
						if ( strlen(work_buf2) > 0 )
						{
							// 構造体に値を保存。
							global_param.target_jpeg_width = atoi(work_buf2);
						}

						continue;
					}

					if (strncasecmp( work_buf2, "height=", strlen("height=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');
						if ( strlen(work_buf2) > 0 )
						{
							// 構造体に値を保存。
							global_param.target_jpeg_height = atoi(work_buf2);
						}

						continue;
					}

					// "action="あるか調査。
					if (strncasecmp( work_buf2, "action=", strlen("action=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->action, work_buf2, sizeof(http_recv_info_p->action));
						continue;
					}

					// "type=" allplay list type
					if (strncasecmp( work_buf2, "type=movie", strlen("type=movie") ) == 0 )
					{
						http_recv_info_p->default_file_type = TYPE_MOVIE;
						continue;
					}
					if (strncasecmp( work_buf2, "type=music", strlen("type=music") ) == 0 )
					{
						http_recv_info_p->default_file_type = TYPE_MUSIC;
						continue;
					}
					if (strncasecmp( work_buf2, "type=photo", strlen("type=photo") ) == 0 )
					{
						http_recv_info_p->default_file_type = TYPE_JPEG;
						continue;
					}
					if (strncasecmp( work_buf2, "type=soundtrack", strlen("type=soundtrack") ) == 0 )
					{
						http_recv_info_p->default_file_type = TYPE_MUSICLIST;
						continue;
					}
					if (strncasecmp( work_buf2, "type=slideshow", strlen("type=slideshow") ) == 0 )
					{
						http_recv_info_p->default_file_type = TYPE_PLAYLIST;
						continue;
					}
					
					// "option="あるか調査
					if (strncasecmp( work_buf2, "option=", strlen("option=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->option, work_buf2, sizeof(http_recv_info_p->option));
						continue;
					}

					// "alias="あるか調査
					if (strncasecmp( work_buf2, "alias=", strlen("alias=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->alias, work_buf2, sizeof(http_recv_info_p->alias));

						if (strncasecmp(http_recv_info_p->alias, "movie", strlen("movie")) == 0)
							http_recv_info_p->default_file_type = TYPE_MOVIE;
						else if (strncasecmp(http_recv_info_p->alias, "music", strlen("music")) == 0)
							http_recv_info_p->default_file_type = TYPE_MUSIC;
						else if (strncasecmp(http_recv_info_p->alias, "photo", strlen("photo")) == 0)
							http_recv_info_p->default_file_type = TYPE_JPEG;
						else
							http_recv_info_p->default_file_type = TYPE_UNKNOWN;

						continue;
					}

					if (strncasecmp( work_buf2, "lsearch=", strlen("lsearch=") ) == 0 )
					{
						cut_before_character(work_buf2, '=');

						strcpy(http_recv_info_p->lsearch, work_buf2);
						strcpy(http_recv_info_p->recv_uri, work_buf2);
						continue;
					}

					// "search=
					if (strncasecmp( work_buf2, "search", strlen("search") ) == 0 )
					{
						if (strncasecmp(work_buf2, "search_movie", strlen("search_movie")) == 0) {
							http_recv_info_p->search_type = TYPE_MOVIE;
							strcpy(http_recv_info_p->search_str, "_movie");
							http_recv_info_p->default_file_type = TYPE_MOVIE;
							// printf("search movie for ");
						} else if (strncasecmp(work_buf2, "search_music", strlen("search_music")) == 0) {
							http_recv_info_p->search_type = TYPE_MUSIC;
							strcpy(http_recv_info_p->search_str, "_music");
							http_recv_info_p->default_file_type = TYPE_MUSIC;
							// printf("search music for ");
						} else if (strncasecmp(work_buf2, "search_photo", strlen("search_photo")) == 0) {
							http_recv_info_p->search_type = TYPE_JPEG;
							strcpy(http_recv_info_p->search_str, "_photo");
							http_recv_info_p->default_file_type = TYPE_JPEG;
							// printf("search photo for ");
						} else if (strncasecmp(work_buf2, "search_all", strlen("search_all")) == 0) {
							http_recv_info_p->search_type = TYPE_UNKNOWN;
							strcpy(http_recv_info_p->search_str, "_all");
							http_recv_info_p->default_file_type = TYPE_UNKNOWN;
							// printf("search all for ");
						} else
							continue;

						cut_before_character(work_buf2, '=');

						strncpy(http_recv_info_p->search, work_buf2, sizeof(http_recv_info_p->search));
						if (http_recv_info_p->search[0] == '\0')
							// everything qualifies
							strcpy(http_recv_info_p->search, ".*");

						// printf("%s\n", http_recv_info_p->search);
						continue;
					}

					// "sort="あるか調査
					if (strncasecmp( work_buf2, "sort=", strlen("sort=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->sort, work_buf2, sizeof(http_recv_info_p->sort));
						continue;
					}


					// "dvdopt="あるか調査
					if (strncasecmp( work_buf2, "dvdopt=", strlen("dvdopt=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->dvdopt, work_buf2, sizeof(http_recv_info_p->dvdopt));
						continue;
					}

					// "focus="あるか調査
					if (strncasecmp( work_buf2, "focus=", strlen("focus=") ) == 0 )
					{
						// = より前を削除
						cut_before_character(work_buf2, '=');

						// 構造体に値を保存。
						strncpy(http_recv_info_p->focus, work_buf2, sizeof(http_recv_info_p->focus));
						continue;
					}




				}
			}

			debug_log_output("http_recv_info_p->page = '%d'", http_recv_info_p->page);
			debug_log_output("http_recv_info_p->title = '%d'", http_recv_info_p->title);
			debug_log_output("http_recv_info_p->action = '%s'", http_recv_info_p->action);
			debug_log_output("http_recv_info_p->option = '%s'", http_recv_info_p->option);
			debug_log_output("http_recv_info_p->dvdopt = '%s'", http_recv_info_p->dvdopt);

			// URIデコード
			cut_after_character(line_buf, '?');
			uri_decode(work_buf, sizeof(work_buf), line_buf, sizeof(line_buf) );
			strncpy(line_buf, work_buf, sizeof(line_buf));
			debug_log_output("URI(decoded):'%s'\n", line_buf);

			convert_language_code(line_buf, work_buf, sizeof(work_buf), CODE_AUTO, CODE_EUC);
			debug_log_output("URI(decoded,euc,FYI):'%s'\n", work_buf);


			// 構造体に保存
			if (http_recv_info_p->lsearch[0] == '\0')
				strncpy(http_recv_info_p->recv_uri, line_buf, sizeof(http_recv_info_p->recv_uri));

			continue;

		}

		// User-agent切り出し
		if ( strncasecmp(line_buf, HTTP_USER_AGENT, strlen(HTTP_USER_AGENT) ) == 0 )
		{
			// ':'より前を切る
			cut_before_character(line_buf, ':');
			cut_first_character(line_buf, ' ');

			// 構造体に保存
			strncpy( http_recv_info_p->user_agent, line_buf, sizeof(http_recv_info_p->user_agent));

			// Set the skin name based on user agent, if desired
			for(j=0; j<global_param.alternate_skin_count; j++) {
				debug_log_output("Checking for '%s'", global_param.alternate_skin_match[j]);
				if(strstr(line_buf, global_param.alternate_skin_match[j]) != NULL) {
					strcpy(global_param.skin_name, global_param.alternate_skin_name[j]);
					debug_log_output("User agent matches alternate skin '%s'", global_param.skin_name); 
					break;
				}
			}
			continue;
		}

		// Rangeあるかチェック
		if ( strncasecmp(line_buf, HTTP_RANGE,	strlen(HTTP_RANGE) ) == 0 )
		{
			debug_log_output("%s Detect.\n", HTTP_RANGE);
			// ':' より前を切る。
			cut_before_character(line_buf, ':');
			cut_first_character(line_buf, ' ');

			// recv_range にRangeの中身保存
			strncpy(http_recv_info_p->recv_range, line_buf, sizeof(http_recv_info_p->recv_range));

			// '=' より前を切る
			cut_before_character(line_buf, '=');


			// '-'で前後に分割。
			sentence_split(line_buf, '-', work_buf, work_buf2);

			debug_log_output("work_buf='%s'\n", work_buf);
			debug_log_output("work_buf2='%s'\n", work_buf2);

			// 値を文字列→数値変換
			http_recv_info_p->range_start_pos  = strtoull(work_buf, NULL, 10);

			if ( strlen(work_buf2) > 0 )
			{
				http_recv_info_p->range_end_pos = strtoull(work_buf2, NULL, 10);
			}


			debug_log_output("range_start_pos=%d\n", http_recv_info_p->range_start_pos);
			debug_log_output("range_end_pos=%d\n", http_recv_info_p->range_end_pos);

			continue;
		}

		// Hostあるかチェック
		if ( strncasecmp(line_buf, HTTP_HOST,	strlen(HTTP_HOST) ) == 0 )
		{
			debug_log_output("%s Detect.\n", HTTP_HOST);
			// ':' より前を切る。
			cut_before_character(line_buf, ':');
			cut_first_character(line_buf, ' ');

			strncpy(http_recv_info_p->recv_host, line_buf, sizeof(http_recv_info_p->recv_host));
			if(NULL == strchr(http_recv_info_p->recv_host, ':')) {
				debug_log_output("CLIENT BUG: Host header field was missing port number - fixing");
				snprintf(http_recv_info_p->recv_host + strlen(http_recv_info_p->recv_host), sizeof(http_recv_info_p->recv_host)-1, ":%d", global_param.server_port);
				debug_log_output("%s '%s'", HTTP_HOST, http_recv_info_p->recv_host);
			}

			continue;
		}

		if (strncasecmp(line_buf, HTTP_AUTHORIZATION, strlen(HTTP_AUTHORIZATION)) == 0) {
			debug_log_output("%s Detect.\n", HTTP_AUTHORIZATION);
			// ':' より前を切る。
			cut_before_character(line_buf, ':');
			cut_first_character(line_buf, ' ');
			if (strncmp(line_buf, "Basic ", 6)) {
				debug_log_output("received '%s', is not supported.", line_buf);
				continue;
			}

			strncpy(http_recv_info_p->passwd, line_buf + 6, sizeof(http_recv_info_p->passwd));

			continue;
		}

		// Content-Lengthあるかチェック
		if ( strncasecmp(line_buf, HTTP_CONTENT_LENGTH_STR,	strlen(HTTP_CONTENT_LENGTH_STR) ) == 0 )
		{
			debug_log_output("%s Detect.\n", HTTP_CONTENT_LENGTH_STR);
			// ':' より前を切る。
			cut_before_character(line_buf, ':');
			cut_first_character(line_buf, ' ');
			/* recv_content_length は わざと long です. */
			http_recv_info_p->recv_content_length = strtol(line_buf, NULL, 10);
			debug_log_output("Content-Length: %ld", http_recv_info_p->recv_content_length);
			continue;
		}

	}

	return result;
}
Beispiel #5
0
static PyObject *domains(PyObject *self, PyObject *args) {
    Dictionary    dict;
    Parse_Options opts;
    Sentence      sent;
    Linkage       linkage;
    //CNode *       cn;

    /// Link counts
    int   num_linkages;
    int   links;
    int   i;
    int   j = 0;
    int num_domains;
    const char *text;

    PyObject *output_list;
    PyObject *temp;
    output_list = PyList_New(0);

    if (!PyArg_ParseTuple(args, "s", &text))
        return NULL;

    opts = parse_options_create();
    parse_options_set_verbosity(opts, -1);

    setlocale(LC_ALL, "");
    dict = dictionary_create_default_lang();

    if (!dict) {
        PyErr_SetString(PyExc_RuntimeError, "Fatal error: Unable to open the dictionary");
        Py_INCREF(Py_None);
        return Py_None;
    }

    sent = sentence_create(text, dict);
    sentence_split(sent, opts);
    num_linkages = sentence_parse(sent, opts);

    if (num_linkages > 0) {
        linkage = linkage_create(0, sent, opts);
        links = linkage_get_num_sublinkages(linkage);
        for(i=0; i<=links; i++) {
            num_domains = linkage_get_link_num_domains(linkage, i);
            const char **temp1 = linkage_get_link_domain_names(linkage, i);
            //for(j=0; j<=num_domains; j++){
            while(num_domains < j) {
                temp = PyString_FromString(temp1[j]);
                PyList_Append(output_list, temp);
                j++;
            }
            j = 0;
        }
        linkage_delete(linkage);
    } else {
        sentence_delete(sent);
        dictionary_delete(dict);
        parse_options_delete(opts);
        Py_INCREF(Py_None);
        return Py_None;
    }
    sentence_delete(sent);
    dictionary_delete(dict);
    parse_options_delete(opts);

    return Py_BuildValue("Si", output_list, num_domains);
}
Beispiel #6
0
/// This is the basic sentence dissection
static PyObject *sentence(PyObject *self, PyObject *args) {
    Dictionary    dict;
    Parse_Options opts;
    Sentence      sent;
    Linkage       linkage;
    Linkage       sub_linkage;
    char *        diagram;

    /// Link counts
    int   num_linkages;
    int   links;

    ///  Index's for the iterators
    int   link_idx;
    int   word_idx;
    int   num_words;
    long   span;
    long   sub_linkages;

    const char *text;
    const char *d_output;

    PyObject *output_list;
    PyObject *word_list;
    PyObject *word2_list;
    PyObject *span_list;
    PyObject *temp;
    PyObject *sublinkage_list;
    PyObject *_diagram;

    output_list = PyList_New(0);
    word_list   = PyList_New(0);
    word2_list  = PyList_New(0);
    sublinkage_list = PyList_New(0);

    span_list = PyList_New(0);

    if (!PyArg_ParseTuple(args, "s", &text))
        return NULL;

    opts = parse_options_create();
    parse_options_set_verbosity(opts, -1);
    parse_options_set_screen_width(opts, 50);

    setlocale(LC_ALL, "");
    dict = dictionary_create_default_lang();

    if (!dict) {
        PyErr_SetString(PyExc_RuntimeError, "Fatal error: Unable to open the dictionary");
        Py_INCREF(Py_None);
        return Py_None;
    }

    sent = sentence_create(text, dict);
    sentence_split(sent, opts);
    num_linkages = sentence_parse(sent, opts);

    if (num_linkages > 0) {
        linkage = linkage_create(0, sent, opts);
        /// Get the lengths of everything
        num_words = linkage_get_num_words(linkage);
        links = linkage_get_num_links(linkage);

        for(link_idx=0; link_idx < links; link_idx++) {
            PyObject *temp_subLen;

            diagram = linkage_print_diagram(linkage);
            _diagram = PyString_FromString(diagram);

            sub_linkage = linkage_create(link_idx, sent, opts);
            sub_linkages = linkage_get_num_sublinkages(linkage);

            temp_subLen = PyLong_FromLong(sub_linkages);
            linkage_delete(sub_linkage);
            PyList_Append(sublinkage_list, temp_subLen);

            span = linkage_get_link_length(linkage, link_idx);
            PyList_Append(span_list, PyInt_FromLong(span));

            PyObject *temp_list;
            temp_list = PyList_New(0);
            /// Sub Group these (left and right labels)
            const char *t1 = linkage_get_link_llabel(linkage, link_idx);
            temp = PyString_FromString(t1);
            PyList_Append(temp_list, temp);

            const char *t2 = linkage_get_link_rlabel(linkage, link_idx);
            temp = PyString_FromString(t2);
            PyList_Append(temp_list, temp);
            /// Then add to the main list
            PyList_Append(output_list, temp_list);


            /// Just the label
            const char *t3 = linkage_get_link_label(linkage, link_idx);
            temp = PyString_FromString(t3);
            PyList_Append(word2_list, temp);
        }

        for(word_idx=0; word_idx < num_words; word_idx++) {
            d_output = linkage_get_word(linkage, word_idx);
            PyObject *word;

            word = PyString_FromString(d_output);
            PyList_Append(word_list, word);
        }

        linkage_free_diagram(diagram);
        linkage_delete(linkage);

    } else {
        sentence_delete(sent);
        dictionary_delete(dict);
        parse_options_delete(opts);

        Py_INCREF(Py_None);
        return Py_None;
    }

    sentence_delete(sent);
    dictionary_delete(dict);
    parse_options_delete(opts);

    return Py_BuildValue("SSSSSS", word_list, span_list, output_list, word2_list, sublinkage_list, _diagram);
}
Beispiel #7
0
int main(int argc, char * argv[])
{
	FILE            *input_fh = stdin;
	Dictionary      dict;
	const char     *language = NULL;
	int             num_linkages;
	Label           label = NO_LABEL;
	Command_Options *copts;
	Parse_Options   opts;
	bool batch_in_progress = false;

	isatty_stdin = isatty(fileno(stdin));
	isatty_stdout = isatty(fileno(stdout));

#ifdef _WIN32
	/* If compiled with MSVC/MinGW, we still support running under Cygwin.
	 * This is done by checking running_under_cygwin to resolve
	 * incompatibilities. */
	const char *ostype = getenv("OSTYPE");
	if ((NULL != ostype) && (0 == strcmp(ostype, "cygwin")))
		running_under_cygwin = true;

	/* argv encoding is in the current locale. */
	argv = argv2utf8(argc);
	if (NULL == argv)
	{
		prt_error("Fatal error: Unable to parse command line\n");
		exit(-1);
	}

#ifdef _MSC_VER
	_set_printf_count_output(1); /* enable %n support for display_1line_help()*/
#endif /* _MSC_VER */

	win32_set_utf8_output();
#endif /* _WIN32 */

#if LATER
	/* Try to catch the SIGWINCH ... except this is not working. */
	struct sigaction winch_act;
	winch_act.sa_handler = winch_handler;
	winch_act.sa_sigaction = NULL;
	sigemptyset (&winch_act.sa_mask);
	winch_act.sa_flags = 0;
	sigaction (SIGWINCH, &winch_act, NULL);
#endif

	copts = command_options_create();
	if (copts == NULL || copts->panic_opts == NULL)
	{
		prt_error("Fatal error: unable to create parse options\n");
		exit(-1);
	}
	opts = copts->popts;

	setup_panic_parse_options(copts->panic_opts);
	copts->panic_mode = true;

	parse_options_set_max_parse_time(opts, 30);
	parse_options_set_linkage_limit(opts, 1000);
	parse_options_set_min_null_count(opts, 0);
	parse_options_set_max_null_count(opts, 0);
	parse_options_set_short_length(opts, 16);
	parse_options_set_islands_ok(opts, false);
	parse_options_set_display_morphology(opts, false);

	save_default_opts(copts); /* Options so far are the defaults */

	if ((argc > 1) && (argv[1][0] != '-')) {
		/* The dictionary is the first argument if it doesn't begin with "-" */
		language = argv[1];
	}

	for (int i = 1; i < argc; i++)
	{
		if (strcmp("--help", argv[i]) == 0)
		{
			print_usage(stdout, argv[0], copts, 0);
		}

		if (strcmp("--version", argv[i]) == 0)
		{
			printf("Version: %s\n", linkgrammar_get_version());
			printf("%s\n", linkgrammar_get_configuration());
			exit(0);
		}
	}

	/* Process command line variable-setting commands (only). */
	for (int i = 1; i < argc; i++)
	{
		if (argv[i][0] == '-')
		{
			const char *var = argv[i] + ((argv[i][1] != '-') ? 1 : 2);
			if ((var[0] != '!') && (0 > issue_special_command(var, copts, NULL)))
				print_usage(stderr, argv[0], copts, -1);
		}
		else if (i != 1)
		{
			prt_error("Fatal error: Unknown argument '%s'.\n", argv[i]);
			print_usage(stderr, argv[0], copts, -1);
		}
	}

	if (language && *language)
	{
		dict = dictionary_create_lang(language);
		if (dict == NULL)
		{
			prt_error("Fatal error: Unable to open dictionary.\n");
			exit(-1);
		}
	}
	else
	{
		dict = dictionary_create_default_lang();
		if (dict == NULL)
		{
			prt_error("Fatal error: Unable to open default dictionary.\n");
			exit(-1);
		}
	}

	/* Process the command line '!' commands */
	for (int i = 1; i < argc; i++)
	{
		if ((argv[i][0] == '-') && (argv[i][1] == '!'))
		{
			if (0 > issue_special_command(argv[i]+1, copts, dict))
				print_usage(stderr, argv[0], copts, -1);
		}
	}

	check_winsize(copts);

	prt_error("Info: Dictionary version %s, locale %s\n",
		linkgrammar_get_dict_version(dict),
		linkgrammar_get_dict_locale(dict));
	prt_error("Info: Library version %s. Enter \"!help\" for help.\n",
		linkgrammar_get_version());

	/* Main input loop */
	while (true)
	{
		char *input_string;
		Sentence sent = NULL;

		/* Make sure stderr is shown even when MSVC binary runs under
		 * Cygwin/MSYS pty (in that case it is fully buffered(!)). */
		fflush(stderr);

		verbosity = parse_options_get_verbosity(opts);
		debug = parse_options_get_debug(opts);
		test = parse_options_get_test(opts);

		input_string = fget_input_string(input_fh, stdout, /*check_return*/false);
		check_winsize(copts);

		if (NULL == input_string)
		{
			if (ferror(input_fh))
				prt_error("Error: Read: %s\n", strerror(errno));

			if (input_fh == stdin) break;
			fclose (input_fh);
			input_fh = stdin;
			continue;
		}

		/* Discard whitespace characters from end of string. */
		for (char *p = &input_string[strlen(input_string)-1];
		     (p > input_string) && strchr(WHITESPACE, *p) ; p--)
		{
			*p = '\0';
		}

		/* If the input string is just whitespace, then ignore it. */
		if (strspn(input_string, WHITESPACE) == strlen(input_string))
			continue;

		char command = special_command(input_string, copts, dict);
		if ('e' == command) break;    /* It was an exit command */
		if ('c' == command) continue; /* It was another command */
		if (-1 == command) continue;  /* It was a bad command */

		/* We have to handle the !file command inline; it's too hairy
		 * otherwise ... */
		if ('f' == command)
		{
			char * filename = &input_string[strcspn(input_string, WHITESPACE)] + 1;
			int fnlen = strlen(filename);

			if (0 == fnlen)
			{
				prt_error("Error: Missing file name argument\n");
				continue;
			}

			if ('\n' == filename[fnlen-1]) filename[fnlen-1] = '\0';

			struct stat statbuf;
			if ((0 == stat(filename, &statbuf)) && statbuf.st_mode & S_IFDIR)
			{
				prt_error("Error: Cannot open %s: %s\n",
				        filename, strerror(EISDIR));
				continue;
			}

			input_fh = fopen(filename, "r");

			if (NULL == input_fh)
			{
				prt_error("Error: Cannot open %s: %s\n", filename, strerror(errno));
				input_fh = stdin;
				continue;
			}
			continue;
		}


		if (!copts->batch_mode) batch_in_progress = false;
		if ('\0' != test[0])
		{
			/* In batch mode warn only once.
			 * In auto-next-linkage mode don't warn at all. */
			if (!batch_in_progress && !auto_next_linkage_test(test))
			{
				fflush(stdout);
				/* Remind the developer this is a test mode. */
				prt_error("Warning: Tests enabled: %s\n", test);
				if (copts->batch_mode) batch_in_progress = true;
			}
		}

		if (copts->echo_on)
		{
			printf("%s\n", input_string);
		}

		if (copts->batch_mode || auto_next_linkage_test(test))
		{
			label = strip_off_label(input_string);
		}

		// Post-processing-based pruning will clip away connectors
		// that we might otherwise want to examine. So disable PP
		// pruning in this situation.
		if (copts->display_bad)
			parse_options_set_perform_pp_prune(opts, false);
		else
			parse_options_set_perform_pp_prune(opts, true);

		sent = sentence_create(input_string, dict);

		if (sentence_split(sent, opts) < 0)
		{
			sentence_delete(sent);
			sent = NULL;
			continue;
		}

		if (0 != copts->display_wordgraph)
		{
			const char *wg_display_flags = ""; /* default flags */
			switch (copts->display_wordgraph)
			{
				case 1:     /* default flags */
					break;
				case 2:     /* subgraphs with a legend */
					wg_display_flags = "sl";
					break;
				case 3:
					{
						/* Use esoteric flags from the test user variable. */
						const char *s = test_enabled(test, "wg");
						if ((NULL != s) && (':' == s[0])) wg_display_flags = s;
					}
					break;
				default:
					prt_error("Warning: wordgraph=%d: Unknown value, using 1\n",
								 copts->display_wordgraph);
					copts->display_wordgraph = 1;
			}
			sentence_display_wordgraph(sent, wg_display_flags);
		}

		/* First parse with the default disjunct_cost as set by the library
		 * (currently 2.7). Usually parse here with no null links.
		 * However, if "-test=one-step-parse" is used and we are said to
		 * parse with null links, allow parsing here with null links too. */
		bool one_step_parse = !copts->batch_mode && copts->allow_null &&
		                    test_enabled(test, "one-step-parse");
		int max_null_count = one_step_parse ? sentence_length(sent) : 0;

		parse_options_set_min_null_count(opts, 0);
		parse_options_set_max_null_count(opts, max_null_count);
		parse_options_reset_resources(opts);

		num_linkages = sentence_parse(sent, opts);

		/* num_linkages is negative only on a hard-error;
		 * typically, due to a zero-length sentence.  */
		if (num_linkages < 0)
		{
			sentence_delete(sent);
			sent = NULL;
			continue;
		}

#if 0
		/* Try again, this time omitting the requirement for
		 * definite articles, etc. This should allow for the parsing
		 * of newspaper headlines and other clipped speech.
		 *
		 * XXX Unfortunately, this also allows for the parsing of
		 * all sorts of ungrammatical sentences which should not
		 * parse, and leads to bad parses of many other unparsable
		 * but otherwise grammatical sentences.  Thus, this trick
		 * pretty much fails; we leave it here to document the
		 * experiment.
		 */
		if (num_linkages == 0)
		{
			parse_options_set_disjunct_cost(opts, 4.5);
			num_linkages = sentence_parse(sent, opts);
			if (num_linkages < 0) continue;
		}
#endif /* 0 */

		/* Try using a larger list of disjuncts */
		/* XXX FIXME: the lg_expand_disjunct_list() routine is not
		 * currently a part of the public API; it should be made so,
		 * or this expansion idea should be abandoned... not sure which.
		 */
		if ((num_linkages == 0) && parse_options_get_use_cluster_disjuncts(opts))
		{
			int expanded;
			if (verbosity > 0) fprintf(stdout, "No standard linkages, expanding disjunct set.\n");
			parse_options_set_disjunct_cost(opts, 3.9);
			expanded = lg_expand_disjunct_list(sent);
			if (expanded)
			{
				num_linkages = sentence_parse(sent, opts);
			}
			if (0 < num_linkages) printf("Got One !!!!!!!!!!!!!!!!!\n");
		}

		/* If asked to show bad linkages, then show them. */
		if ((num_linkages == 0) && (!copts->batch_mode))
		{
			if (copts->display_bad)
			{
				num_linkages = sentence_num_linkages_found(sent);
			}
		}

		/* Now parse with null links */
		if (!one_step_parse && num_linkages == 0 && !copts->batch_mode)
		{
			if (verbosity > 0) fprintf(stdout, "No complete linkages found.\n");

			if (copts->allow_null)
			{
				/* XXX should use expanded disjunct list here too */
				parse_options_set_min_null_count(opts, 1);
				parse_options_set_max_null_count(opts, sentence_length(sent));
				num_linkages = sentence_parse(sent, opts);
			}
		}

		if (verbosity > 0)
		{
			if (parse_options_timer_expired(opts))
				fprintf(stdout, "Timer is expired!\n");

			if (parse_options_memory_exhausted(opts))
				fprintf(stdout, "Memory is exhausted!\n");
		}

		if ((num_linkages == 0) &&
			copts->panic_mode &&
			parse_options_resources_exhausted(opts))
		{
			/* print_total_time(opts); */
			batch_errors++;
			if (verbosity > 0) fprintf(stdout, "Entering \"panic\" mode...\n");
			/* If the parser used was the SAT solver, set the panic parser to
			 * it too.
			 * FIXME? Currently, the SAT solver code is not too useful in
			 * panic mode since it doesn't handle parsing with null words, so
			 * using the regular parser in that case could be beneficial.
			 * However, this currently causes a crash due to a memory
			 * management mess. */
			parse_options_set_use_sat_parser(copts->panic_opts,
				parse_options_get_use_sat_parser(opts));
			parse_options_reset_resources(copts->panic_opts);
			parse_options_set_verbosity(copts->panic_opts, verbosity);
			(void)sentence_parse(sent, copts->panic_opts);
			if (verbosity > 0)
			{
				if (parse_options_timer_expired(copts->panic_opts))
					fprintf(stdout, "Panic timer is expired!\n");
			}
		}

		if (verbosity > 1) parse_options_print_total_time(opts);

		const char *rc = "";
		if (copts->batch_mode)
		{
			batch_process_some_linkages(label, sent, copts);
		}
		else
		{
			rc = process_some_linkages(input_fh, sent, copts);
		}

		fflush(stdout);
		sentence_delete(sent);
		sent = NULL;

		if ((NULL == rc) && (input_fh == stdin)) break;
	}

	if (copts->batch_mode)
	{
		/* print_time(opts, "Total"); */
		fprintf(stderr,
				"%d error%s.\n", batch_errors, (batch_errors==1) ? "" : "s");
	}

	/* Free stuff, so that mem-leak detectors don't complain. */
	command_options_delete(copts);
	dictionary_delete(dict);

	printf ("Bye.\n");
	return 0;
}
Beispiel #8
0
 SPOTriplets NLP::sentence2triplets ( const char* sentence )
 {
   // vector of triplets
   SPOTriplets triplets;

   #ifdef DEBUG
     std::cout << "The sentence: " << sentence << std::endl;
   #endif
   // creates a Sentence from the input char*
   Sentence sent = sentence_create ( sentence, dict_ );
   #ifdef DEBUG
     std::cout << "Sentence created" << std::endl;
   #endif
   // tokenizes the sentence
   sentence_split ( sent, parse_opts_ );
   #ifdef DEBUG
     std::cout << "Sentence splitted" << std::endl;
   #endif
   // searches for all possible linkages
   int num_linkages = sentence_parse ( sent, parse_opts_ );
   #ifdef DEBUG
     std::cout << "Sentence parsed" << std::endl;
     std::cout << "Number of linkages: " << num_linkages << std::endl;
   #endif

   // just one triplet
   SPOTriplet triplet;

   // if there is any linkage in the sentence
   if( num_linkages > 0 )
   {
     // create the linkage
     Linkage linkage = linkage_create ( 0, sent, parse_opts_ );

     #ifdef DEBUG
       // prints the sentence's diagram
       std::cout << "The diagram: " << std::endl;
       char *diagram = linkage_print_diagram(linkage, true, 800);
       std::cout << diagram << std::endl;
       linkage_free_diagram( diagram );
       // end print diagram
     #endif

     std::vector<std::string> labels;

     // 1. find the S_link
     // S* except there is an SJ* because then S* except Spx
     // two cases: there is SJ* and there is not SJ*

     // TODO: VJlp VJrp same as SJ but to predications
     // TODO: SFut SFst what the f**k?                                     ###FIXED###
     // TODO: His form was shining like the light not working              ###FIXED###
     // TODO: Car is mine not working                                      ###FIXED###
     // TODO: The little brown bear has eaten all of the honey not working ###FIXED###

     // REGEXES
     std::regex SJ_( "SJ.*" );
     std::regex VJ_( "VJ.*");
     std::regex subject( "(Ss.*)|(SFut)|(Sp\*.*)" );
     std::regex Spx( "Spx.*" );
     // TODO:fix theese initializer list not allowed                       ###FIXED###
     std::regex predicate( "(Pv.*)|(Pg.*)|(PP.*)|(I.*)|(TO)|(MVi.*)" );
     // TODO: make one from theese // (Sp.*)|(Ss.*)                        ###FIXED###
     std::regex noun_adject_object ( "(O.*)|(Os.*)|(Op.*)|(MVpn.*)|(Pa.*)|(MVa.*)" );
     std::regex preposition ( "(MVp.*)|(Pp.*)|(OF)|(TO)" );
     std::regex prep_object ( "(J.*)|(TI)|(I.*)|(ON)" );
     // TODO: problems with matching!! Pg*!!                               ###FIXED###
     // TODO: problems with matching!! Mvp.*!!                             ###FIXED###

     bool s_found = false;
     bool p_found = false;
     bool o_found = false;
     bool SJ = false;

     // search for SJ.s labels
     for( auto label: labels )
     {
       if( std::regex_match( label, SJ_ ) )
       {
         SJ = true;
         break;
       }
     }

     // multiple subject in the sentence
     if( SJ )
     {
       // SPls left -> first subject
       // SPrs right -> second subject
       // Spx right -> predicate
       // SJ-s are multiple subjects
       std::string temp;
       // go through every linkage
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get their label
         std::string l = linkage_get_link_label( linkage, i );
         // if there is an SJl* label
         if( std::regex_match( l, std::regex( "SJl.*" ) ) )
         {
           // SJls left side
           triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           triplet.cut( triplet.s );
           temp = triplet.s + " ";
           // and word
           triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.cut( triplet.s );
           temp += triplet.s + " ";

           // find SJr*
           for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
           {
             std::string m = linkage_get_link_label( linkage, j );
             if( std::regex_match( m, std::regex( "SJr.*" ) ) )
             {
               triplet.s = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
               triplet.cut();
               temp += triplet.s;
               triplet.s = temp;

               s_found = true;
               #ifdef DEBUG
                 std::cout << "Subject found: " << triplet.s << std::endl;
               #endif
               break;
             } // if
           } // for
           break;
         } // if
       } // for

       // now we have the subject

       // find Spx and its right side will be the starter predicate
       std::string current_word;
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         std::string l = linkage_get_link_label( linkage, i );
         if( std::regex_match( l, std::regex( "Spx.*" ) ) )
         {
           triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
         }
       }
       // from now all the same as on the else branch !!!!

       bool predicate_match = false;

       // search for the linkage that has triplet.s as left!
       do
       {
         predicate_match = false;

         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // every linkage's left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           // every linkage's label
           std::string l = linkage_get_link_label( linkage, i );

           if( std::regex_match( l, predicate ) && word_i == current_word )
           {
             // found predicate
             triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             current_word = triplet.p;
             predicate_match = true;
             break;
           }
         }
       }
       while( predicate_match );

       // we now have the predicate too
       // TODO: multiple predicates!
       p_found = true;
       #ifdef DEBUG
         std::cout << "Predicate found: " << triplet.p << std::endl;
       #endif

       // ###COPY BEGIN###

       // search for noun object or adjective object
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get every linkage label
         std::string l = linkage_get_link_label( linkage, i );
         // get the left word of every linkage
         std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
         // if thete is a label that match AND its left word is the predicate
         if( std::regex_match( l, noun_adject_object ) && triplet.p == l_word )
         {
           // then the object is that linkage's right word
           triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.cut( triplet.o );
           o_found = true;
           #ifdef DEBUG
             std::cout << "Adjective or noun object found: " << triplet.o << std::endl;
           #endif
         } // if
       } // for

       // still not found object, then search for preposition
       if( !o_found )
       {
         // go through every linkage
         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // get the linkage's label
           std::string l = linkage_get_link_label( linkage, i );
           // and left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           // if there is a linkage which is a preposition and its left word is the predicate
           if( std::regex_match( l, preposition ) && triplet.p == word_i )
           {
             // found preposition
             // search for prep_object
             // then the temp will contain the preposition label's right word
             std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             #ifdef DEBUG
               std::cout << "Preposition found! and its rigth word is: " << temp << std::endl;
             #endif

             for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
             {
               // every linkages
               std::string m = linkage_get_link_label( linkage, j );
               // every left word
               std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );

               // if there is a label with match and its left is exactly the preposition's right
               if( std::regex_match( m, prep_object ) && temp == word_j )
               {
                 triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
                 triplet.cut(triplet.o);

                 triplet.o += " ";
                 // save o
                 std::string temp = triplet.o;

                 triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
                 triplet.cut(triplet.o);
                 temp += triplet.o;

                 triplet.o = temp;
                 o_found = true;
                 #ifdef DEBUG
                   std::cout << "Object found: " << triplet.o << std::endl;
                 #endif
               } // if( std::regex_match( m, prep_object ) && temp == word_j ) END
             } // for J END
           } // if( std::regex_match( l, preposition ) && triplet.p == word_i ) END
         } // for I END
       } // if( !o_found ) END

       if( s_found && p_found && o_found )
       {
         // TODO: cut the words itself not the whole triplet
         // have to cut every word itself
         // triplet.cut();
         triplet.cut(triplet.s);
         triplet.cut(triplet.p);
         triplets.push_back( triplet );
         s_found = false;
         p_found = false;
         o_found = false;
       }
       // ###COPY END###
     }
     else // only one subject
     {
       // except Spx!!!
       // S left -> subject
       // S right -> predicate at first
       // if the word next to S right, is an element of Pv*, Pg* PP*, I*, TO, MVi*
       // then the new predicate will be that word

       std::string current_word;

       // search for subject (S_link)
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get the linkage's label
         std::string l = linkage_get_link_label( linkage, i );

         if( std::regex_match( l, subject ) )
         {
           // subject found
           triplet.s = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
           s_found = true;
           current_word = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           triplet.p = current_word;
           #ifdef DEBUG
             std::cout << "Subject found: " << triplet.s << std::endl;
           #endif
           break;
         }
       }

       if( s_found )
       {
         bool predicate_match = false;

         // search for the linkage that has triplet.s as left!
         do
         {
           predicate_match = false;

           for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
           {
             // every linkage's left word
             std::string l_word = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
             // every linkage's label
             std::string l = linkage_get_link_label( linkage, i );

             if( std::regex_match( l, predicate ) && l_word == current_word )
             {
               // found predicate
               triplet.p = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
               current_word = triplet.p;
               predicate_match = true;
               break;
             }
           } // for END
         } while( predicate_match );

         p_found = true;
         #ifdef DEBUG
           std::cout << "Predicate found: " << triplet.p << std::endl;
         #endif
       } // if( s_found ) END

       // subject and predicate found
       // search for object

       // from k to linkage_get_num_links( linkage )
       // if there is any of the noun, adjective od preposition object then that
       // label's right will give the object.

       // !!! search only between labels that has triplet.p as left word !!!!!

       // search for noun object or adjective objects
       // go through all links
       for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
       {
         // get every linkage label
         std::string l = linkage_get_link_label( linkage, i );
         // get the left word of every linkage
         std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );
         // if thete is a label that match AND its left word is the predicate
         if( std::regex_match( l, noun_adject_object ) && triplet.p == word_i )
         {
           // then the object is that linkage's right word
           triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
           o_found = true;
           triplet.cut(triplet.o);
           #ifdef DEBUG
             std::cout << "Adjective or noun object found: " << triplet.o << std::endl;
           #endif
         } // if END
       } // for END

       // still not found object, then search for preposition
       if( !o_found )
       {
         // go through every linkage
         for( int i = 0; i < linkage_get_num_links( linkage ); ++i )
         {
           // get the linkage's label
           std::string l = linkage_get_link_label( linkage, i );
           // and left word
           std::string word_i = linkage_get_word( linkage, linkage_get_link_lword( linkage, i ) );

           // if there is a linkage which is a preposition and its left word is the predicate
           if( std::regex_match( l, preposition ) && triplet.p == word_i )
           {
             // found preposition
             // search for prep_object
             // then the temp will contain the preposition label's right word
             std::string temp = linkage_get_word( linkage, linkage_get_link_rword( linkage, i ) );
             #ifdef DEBUG
               std::cout << "Preposition found! and its rigth word is: " << temp << std::endl;
             #endif

             // start search from there
             for( int j = 0; j < linkage_get_num_links( linkage ); ++j )
             {
               // every linkages
               std::string m = linkage_get_link_label( linkage, j );
               // every left word
               std::string word_j = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
               #ifdef DEBUG
                 if( std::regex_match( m, prep_object ) )
                     std::cout << m << " DOES match to (J.*)|(TI)|(I.*)|(ON)" << std::endl;
               #endif

               // if there is a label with match and its left is exactly the preposition's right
               if( std::regex_match( m, prep_object ) && temp == word_j )
               {
                 triplet.o = linkage_get_word( linkage, linkage_get_link_lword( linkage, j ) );
                 triplet.cut(triplet.o);

                 triplet.o += " ";
                 // save o
                 std::string temp = triplet.o;

                 triplet.o = linkage_get_word( linkage, linkage_get_link_rword( linkage, j ) );
                 triplet.cut(triplet.o);
                 temp += triplet.o;

                 triplet.o = temp;
                 #ifdef DEBUG
                   std::cout << "Object found: " << triplet.o << std::endl;
                 #endif
                 o_found = true;
               }
             } // for
           } // if
         } // for
       } // if( o_found ) END

       if( s_found && p_found && o_found )
       {
         // TODO: cut the words itself not the whole triplet ###FIXED###
         // have to cut every word itself
         // triplet.cut();

         triplet.cut(triplet.s);
         triplet.cut(triplet.p);
         triplets.push_back( triplet );
         s_found = false;
         p_found = false;
         o_found = false;
       }

     } // end else

     linkage_delete ( linkage );
   } // if( num_linkages > 0 ) END
Beispiel #9
0
// **************************************************************************
// HTTPヘッダを受信して解析する。
//
// 処理するのはGETのみ。GET以外のメソッドが来たらエラー
// 今のところ、URIとuser_agent、Range、Hostを解析。
// URIは、URIデコードもやる。
//
//	return: 0 		正常終了
//	return: 0以外 	エラー
// **************************************************************************
static int http_header_receive(int accept_socket, HTTP_RECV_INFO *http_recv_info_p)
{
    int result = 0;
    int	recv_len;
    char	line_buf[1024];	// 大きめに。
    char 	work_buf[1024];
    char 	work_buf2[1024];
    char 	split1[1024];
    char 	split2[1024];
    int		ret;
    int		i;
    // ================================
    // 1行づつ HTTPヘッダを受信
    // ================================
    for (i=0;;i++){
        // 1行受信 実行。
        recv_len = line_receive(accept_socket, line_buf, sizeof(line_buf));
        // 受信した内容をチェック。
        if ( recv_len == 0 ){ // 空行検知。ヘッダ受信終了。
            break;
        }else if ( recv_len < 0 ){ // 受信失敗
            return ( -1 );
        }
        // debug. 受信したヘッダ表示
        debug_log_output("'%s'(%d byte)\n", line_buf, recv_len );
        // --------------------------
        // GETメッセージチェック
        // --------------------------
        if ( i == 0 ){ // 1行目のみチェック
            debug_log_output("%d:URI Check start.'%s'\n", accept_socket,line_buf);
            // GETある?
            if       ( strstr(line_buf, "GET") != NULL ){
                http_recv_info_p->isGet = 1;
            }else if ( strstr(line_buf, "HEAD") != NULL ){
                http_recv_info_p->isGet = 2;
            }else if ( strstr(line_buf, "POST") != NULL ){
                http_recv_info_p->isGet = 3;
            }else{
                debug_log_output("'GET' not found. error.%d",accept_socket);
                return ( -1 );
            }
            // 最初のスペースまでを削除。
            cut_before_character(line_buf, ' ');
            // 次にスペースが出てきたところの後ろまでを削除。
            cut_after_character(line_buf, ' ');
            // ===========================
            // GETオプション部解析
            // ===========================
            // REQUEST_URI用・Proxy用に値を保存
            strncpy(http_recv_info_p->request_uri, line_buf, sizeof(http_recv_info_p->request_uri));
            // '?'が存在するかチェック。
            if ( strchr(line_buf, '?') != NULL ){
                strncpy(work_buf, line_buf, sizeof(work_buf));
                // '?'より前をカット
                cut_before_character(work_buf, '?' );
                while ( 1 ){
                    memset(split1, 0, sizeof(split1));
                    memset(split2, 0, sizeof(split2));
                    // 最初に登場する'&'で分割
                    ret = sentence_split(work_buf, '&', split1, split2 );
                    if ( ret == 0 ){ // 分割成功
                        strncpy(work_buf, split2, sizeof(work_buf));
                    }else if (strlen(work_buf) > 0){ // まだwork_bufに中身ある?
                        strncpy( split1, work_buf, sizeof(split1));
                        strncpy( work_buf, "", sizeof(work_buf));
                    }else{ // 処理終了
                        break;
                    }
                    // -------------------------------------
                    // GETした内容 解析開始
                    // 超安直。いいのかこんな比較で。
                    // -------------------------------------
                    // URIデコード
                    uri_decode(work_buf2, sizeof(work_buf2), split1, sizeof(split1) );
                    // "action="あるか調査。
                    if (strncasecmp( work_buf2, "action=", strlen("action=") ) == 0 ){
                        // = より前を削除
                        cut_before_character(work_buf2, '=');
                        // 構造体に値を保存。
                        strncpy(http_recv_info_p->action, work_buf2, sizeof(http_recv_info_p->action));
                        continue;
                    }
                }
            }
            debug_log_output("http_recv_info_p->action = '%s'", http_recv_info_p->action);
            // URIデコード
            cut_after_character(line_buf, '?');
            uri_decode(work_buf, sizeof(work_buf), line_buf, sizeof(line_buf) );
            strncpy(line_buf, work_buf, sizeof(line_buf));
            debug_log_output("URI(decoded):'%s'\n", line_buf);
            convert_language_code(line_buf, work_buf, sizeof(work_buf), CODE_AUTO, CODE_EUC);
            debug_log_output("URI(decoded,euc,FYI):'%s'\n", work_buf);
            // 構造体に保存
            strncpy(http_recv_info_p->recv_uri, line_buf, sizeof(http_recv_info_p->recv_uri));
            //httpから始まってる場合には、http://以降の最初の'/'の前でカット
            if( strncmp(http_recv_info_p->recv_uri,"http://",7)==0){
                char* ptr = strstr(http_recv_info_p->recv_uri+7,"/");
                if( ptr ){
                    strcpy(http_recv_info_p->recv_uri,ptr);
                }
            }
            continue;
        }
        // User-agent切り出し
        if ( strncasecmp(line_buf, HTTP_USER_AGENT, strlen(HTTP_USER_AGENT) ) == 0 ){
            debug_log_output("User-agent: Detect.\n");
            // ':'より前を切る
            cut_before_character(line_buf, ':');
            cut_first_character(line_buf, ' ');
            // 構造体に保存
            strncpy( http_recv_info_p->user_agent, line_buf, sizeof(http_recv_info_p->user_agent));
            continue;
        }
        // Rangeあるかチェック
        if ( strncasecmp(line_buf, HTTP_RANGE,	strlen(HTTP_RANGE) ) == 0 ){
            debug_log_output("%s Detect.\n", HTTP_RANGE);
            // ':' より前を切る。
            cut_before_character(line_buf, ':');
            cut_first_character(line_buf, ' ');
            // recv_range にRangeの中身保存
            strncpy(http_recv_info_p->recv_range, line_buf, sizeof(http_recv_info_p->recv_range));
            // '=' より前を切る
            cut_before_character(line_buf, '=');
            // '-'で前後に分割。
            sentence_split(line_buf, '-', work_buf, work_buf2);
            debug_log_output("wrok_buf='%s'\n", work_buf);
            debug_log_output("wrok_buf2='%s'\n", work_buf2);
            // 値を文字列→数値変換
            http_recv_info_p->range_start_pos  = strtoull(work_buf, NULL, 10);
            if ( strlen(work_buf2) > 0 ){
                http_recv_info_p->range_end_pos = strtoull(work_buf2, NULL, 10);
            }
            debug_log_output("range_start_pos=%d\n", http_recv_info_p->range_start_pos);
            debug_log_output("range_end_pos=%d\n", http_recv_info_p->range_end_pos);
            continue;
        }
        // Hostあるかチェック
        if ( strncasecmp(line_buf, HTTP_HOST,	strlen(HTTP_HOST) ) == 0 ){
            // ':' より前を切る。
            cut_before_character(line_buf, ':');
            cut_first_character(line_buf, ' ');
            strncpy(http_recv_info_p->recv_host, line_buf, sizeof(http_recv_info_p->recv_host));
            debug_log_output("%s Detect. %s '%s'", HTTP_HOST, HTTP_HOST, http_recv_info_p->recv_host);
            continue;
        }
        // Content-Lengthあるかチェック
        if ( strncasecmp(line_buf, HTTP_CONTENT_LENGTH1, strlen(HTTP_CONTENT_LENGTH1) ) == 0 ){
            // ':' より前を切る。
            cut_before_character(line_buf, ':');
            cut_first_character(line_buf, ' ');
            strncpy(http_recv_info_p->content_length, line_buf, sizeof(http_recv_info_p->content_length));
            debug_log_output("%s Detect. %s '%s'", HTTP_CONTENT_LENGTH1, HTTP_CONTENT_LENGTH1, http_recv_info_p->content_length);
            continue;
        }
        // Content-TYPEあるかチェック
        if ( strncasecmp(line_buf, HTTP_CONTENT_TYPE1, strlen(HTTP_CONTENT_TYPE1) ) == 0 ){
            // ':' より前を切る。
            cut_before_character(line_buf, ':');
            cut_first_character(line_buf, ' ');
            strncpy(http_recv_info_p->content_type, line_buf, sizeof(http_recv_info_p->content_type));
            debug_log_output("%s Detect. %s '%s'", HTTP_CONTENT_TYPE1, HTTP_CONTENT_TYPE1, http_recv_info_p->content_type);
            continue;
        }
    }
    return result;
}