Beispiel #1
0
/**
 * Produces a copy of the reading `o`, preferring an object handed back by
 * pool_get(pool_readings) over a fresh heap allocation.
 *
 * When pool_get yields nothing, the result is simply `new Reading(o)`.
 * Otherwise the fields listed below are overwritten on the recycled object;
 * matched_target / matched_tests are cleared rather than copied, and
 * `number` is offset by 100 relative to the source.
 *
 * If the copy ends up with a non-null `next`, that reading is copied through
 * a recursive call so the copy gets its own `next` chain.
 */
Reading *alloc_reading(const Reading& o) {
	Reading *copy = pool_get(pool_readings);
	if (copy == nullptr) {
		// Nothing to recycle: let the copy constructor do all the work.
		return new Reading(o);
	}

	// Flags.
	copy->mapped = o.mapped;
	copy->deleted = o.deleted;
	copy->noprint = o.noprint;
	copy->matched_target = false;
	copy->matched_tests = false;

	// Identity and hashes.
	copy->baseform = o.baseform;
	copy->hash = o.hash;
	copy->hash_plain = o.hash_plain;
	copy->number = o.number + 100;

	// Bloom filters.
	copy->tags_bloom = o.tags_bloom;
	copy->tags_plain_bloom = o.tags_plain_bloom;
	copy->tags_textual_bloom = o.tags_textual_bloom;

	// Links to other objects.
	copy->mapping = o.mapping;
	copy->parent = o.parent;
	copy->next = o.next;

	// Containers.
	copy->hit_by = o.hit_by;
	copy->tags_list = o.tags_list;
	copy->tags = o.tags;
	copy->tags_plain = o.tags_plain;
	copy->tags_textual = o.tags_textual;
	copy->tags_numerical = o.tags_numerical;

	// Replace the borrowed `next` pointer with a copy of that reading.
	if (Reading *tail = copy->next) {
		copy->next = alloc_reading(*tail);
	}
	return copy;
}
Beispiel #2
0
/**
 * Forwards to alloc_reading() with the given cohort pointer and returns
 * whatever reading that call yields.
 */
Reading *Reading::allocateReading(Cohort *p) {
	Reading *fresh = alloc_reading(p);
	return fresh;
}
Beispiel #3
0
/**
 * Forwards to alloc_reading() with the reading to copy and returns the
 * resulting pointer.
 */
Reading *Reading::allocateReading(const Reading& r) {
	Reading *duplicate = alloc_reading(r);
	return duplicate;
}
/**
 * Splits `cohort` into one cohort per wordform-tag level found along its
 * reading / sub-reading chains.
 *
 * Not split (the returned vector holds only `cohort`) when:
 *  - fewer main readings carry a wordform-tag than there are main readings
 *    (a warning is printed if at least one did), or
 *  - two readings yield different wordforms for the same new cohort, or
 *  - no new cohort was produced at all.
 *
 * Otherwise, for every reading the chain r, r->next, ... is walked. Each
 * sub-reading with a wordform-tag starts the next cohort position; a copy of
 * it (made with alloc_reading) is appended to that cohort after the
 * wordform-tag hash and the original cohort's wordform hash have been removed
 * from the copy's tags_list / tags.
 *
 * Side effects: every newly allocated cohort is appended to cohort->parent
 * and takes a number from gWindow->cohort_counter.
 * NOTE(review): in the "ambiguous wordform-tags" bail-out the cohorts created
 * so far are not removed from cohort->parent again -- confirm that is intended.
 *
 * @param cohort  cohort to split; its readings are read but not modified here.
 * @return the resulting cohorts in reversed creation order (the cohort built
 *         from the top-level readings comes last), or just `cohort`.
 */
std::vector<Cohort*> MweSplitApplicator::splitMwe(Cohort* cohort) {
	constexpr UChar rtrimblank[] = { ' ', '\n', '\r', '\t', 0 };
	constexpr UChar textprefix[] = { ':', 0 };
	std::vector<Cohort*> cos;
	size_t n_wftags = 0;
	size_t n_goodreadings = 0;
	for (auto rter1 : cohort->readings) {
		if (maybeWfTag(rter1) != nullptr) {
			++n_wftags;
		}
		++n_goodreadings;
	}

	if (n_wftags < n_goodreadings) {
		if (n_wftags > 0) {
			u_fprintf(ux_stderr, "WARNING: Line %u: Some but not all main-readings of %S had wordform-tags (not completely mwe-disambiguated?), not splitting.\n", cohort->line_number, cohort->wordform->tag.c_str());
			// We also don't split if wordform-tags were only on sub-readings, but should we warn on such faulty input?
		}
		cos.push_back(cohort);
		return cos;
	}
	for (auto r : cohort->readings) {
		// Starts at max() so that the first ++pos wraps around to 0.
		size_t pos = std::numeric_limits<size_t>::max();
		Reading* prev = nullptr; // prev == nullptr || prev->next == rNew (or a ->next of rNew)
		for (auto sub = r; sub; sub = sub->next) {
			const Tag* wfTag = maybeWfTag(sub);
			if (wfTag == nullptr) {
				// Every main reading has a wordform-tag at this point (checked
				// above), so prev was set by an earlier iteration of this chain.
				// Advance along the copied chain in step with `sub`.
				prev = prev->next;
			}
			else {
				++pos;
				// Make sure a cohort exists for this position.
				while (cos.size() < pos + 1) {
					Cohort* fresh = alloc_cohort(cohort->parent);
					fresh->global_number = gWindow->cohort_counter++;
					cohort->parent->appendCohort(fresh);
					cos.push_back(fresh);
				}
				Cohort* c = cos[pos];

				// Assumes the tag is long enough to end in '>"' -- presumably
				// guaranteed by maybeWfTag; TODO confirm.
				const size_t wfEnd = wfTag->tag.size() - 3; // index before the final '>"'
				// One past the last non-blank character at or before wfEnd
				// (0 when there is none, since npos + 1 wraps to 0).
				const size_t i = 1 + wfTag->tag.find_last_not_of(rtrimblank, wfEnd);
				// Wordform with the trailing blanks before the closing '>"' dropped.
				const UString wf = wfTag->tag.substr(0, i) + wfTag->tag.substr(wfEnd + 1);
				if (c->wordform != nullptr && wf != c->wordform->tag) {
					u_fprintf(ux_stderr, "WARNING: Line %u: Ambiguous wordform-tags for same cohort, '%S' vs '%S', not splitting.\n", numLines, wf.c_str(), c->wordform->tag.c_str());
					cos.clear();
					cos.push_back(cohort);
					return cos;
				}
				c->wordform = addTag(wf);
				if (i < wfEnd + 1) {
					// The blanks that were trimmed off become the cohort's text.
					c->text = textprefix + wfTag->tag.substr(i, wfEnd + 1 - i);
				}

				Reading* rNew = alloc_reading(*sub);
				// Strip the wordform-tag and the original cohort's wordform from
				// the copy. The hash is taken by value and the index only moves
				// on when nothing was erased: erasing invalidates references
				// into the vector and shifts the following element into slot t.
				const auto parentWfHash = rNew->parent->wordform->hash;
				for (size_t t = 0; t < rNew->tags_list.size();) {
					const auto tagHash = rNew->tags_list[t];
					if (tagHash == wfTag->hash || tagHash == parentWfHash) {
						rNew->tags_list.erase(rNew->tags_list.begin() + t);
						rNew->tags.erase(tagHash);
					}
					else {
						++t;
					}
				}
				c->appendReading(rNew);
				rNew->parent = c;

				if (prev != nullptr) {
					// The previous copy still carries a copied tail that starts
					// at this sub-reading; drop it, since rNew takes its place
					// in the next cohort.
					free_reading(prev->next);
					prev->next = nullptr;
				}
				prev = rNew;
			}
		}
	}
	if (cos.empty()) {
		u_fprintf(ux_stderr, "WARNING: Line %u: Tried splitting %S, but got no new cohorts; shouldn't happen.\n", numLines, cohort->wordform->tag.c_str());
		cos.push_back(cohort);
	}
	// The last word forms are the top readings:
	cos[0]->text = cohort->text;
	std::reverse(cos.begin(), cos.end());
	return cos;
}