Exemplo n.º 1
0
/**
 * Cleans a tab-separated labeled data file.
 *
 * Skips the three whitespace-separated header tokens of `input`, then reads
 * it line by line. Each line is split on tabs; the third field is passed
 * through processText() and removeStopwords(), and the first three fields
 * are written, tab-separated, to `output`. Records are separated by '\n'
 * with no trailing newline after the last one. A percentage progress
 * indicator is printed to std::cout while the file is processed.
 *
 * Lines that do not contain at least three tab-separated fields are skipped
 * instead of being indexed out of bounds.
 *
 * @param input  path of the labeled file to read.
 * @param output path of the cleaned file to write.
 */
void Misc::limpiarLabeled(const std::string& input, const std::string& output) {
	std::ifstream labeled(input.c_str());
	std::ofstream cleanLabeled(output.c_str());
	if (!labeled || !cleanLabeled) {
		// Report the failure instead of silently printing "100%".
		std::cerr << "ERROR: no se pudo abrir '" << input << "' o '" << output << "'\n";
		return;
	}

	std::string header, line;
	std::vector<std::string> lineVec;

	// Measure the real size of the input so the progress indicator works for
	// any file, not only for one with a specific, hard-coded byte count.
	labeled.seekg(0, std::ios::end);
	const std::streamoff bytesTotales = labeled.tellg();
	labeled.seekg(0, std::ios::beg);
	std::streamoff bytesLeidos = 0;

	std::cout << "LIMPIANDO LABELED...\n";
	// Header: three tokens, then the rest of the header line terminator.
	labeled >> header >> header >> header;
	labeled.ignore(1,'\n');

	bool wroteRecord = false;
	while (labeled.peek() != EOF) {
		std::getline(labeled, line);
		// NOTE(review): std::getline has already consumed the '\n', so this
		// discards the first character of the following line. Kept because the
		// original did it and the input format is not visible here -- confirm
		// that this is intended.
		labeled.ignore(1, '\n');
		bytesLeidos += static_cast<std::streamoff>(line.size()) + 1 + labeled.gcount();

		lineVec = split(line, '\t');
		if (lineVec.size() >= 3) {
			// Separator goes before every record except the first, so the
			// output never ends with a dangling newline.
			if (wroteRecord)
				cleanLabeled << '\n';
			processText(lineVec[2]);
			removeStopwords(lineVec[2]);
			cleanLabeled << lineVec[0] << '\t' << lineVec[1] << '\t' << lineVec[2];
			wroteRecord = true;
		}

		if (bytesTotales > 0) {
			int porcentaje = static_cast<int>(bytesLeidos * 100.0 / bytesTotales);
			if (porcentaje > 100)
				porcentaje = 100;
			std::cout << "\r" << porcentaje << "%";
		}
	}
	std::cout << "\r100%\n";
}
Exemplo n.º 2
0
// virtual
// Fills in the label column of every row in pDataUnlabeled by letting each
// row's neighbors vote. Both datasets must have the same number of columns,
// with continuous features followed by exactly one nominal label.
//
// The rows of both datasets are temporarily gathered (unlabeled rows first)
// into one dataset over which a neighbor finder is built. Then, repeatedly:
//   1. every still-unlabeled row gets a candidate label and a confidence,
//   2. the candidates are sorted by confidence,
//   3. the rows at the end of the sorted list receive their labels and are
//      removed from the list,
// until no unlabeled rows remain.
void GNeighborTransducer::transduce(GData* pDataLabeled, GData* pDataUnlabeled, int labelDims)
{
	// Check the preconditions
	if(labelDims != 1)
		ThrowError("Only 1 nominal label is supported");
	if(!pDataLabeled->relation()->areNominal(pDataLabeled->relation()->size() - 1, 1))
		ThrowError("Only nominal labels are supported");
	if(!pDataLabeled->relation()->areContinuous(0, pDataLabeled->relation()->size() - 1))
		ThrowError("Only continuous features are supported");
	if(pDataLabeled->cols() != pDataUnlabeled->cols())
		ThrowError("relations don't match");

	const size_t unlabeledCount = pDataUnlabeled->rows();
	const size_t labeledCount = pDataLabeled->rows();

	// Gather every row into one dataset. The unlabeled rows go first, so a row
	// index below unlabeledCount refers to the same row in pDataUnlabeled.
	// The holder is there so the borrowed rows are released, not deleted.
	GData dataAll(pDataLabeled->relation());
	dataAll.reserve(labeledCount + unlabeledCount);
	GReleaseDataHolder hDataAll(&dataAll);
	for(size_t u = 0; u < unlabeledCount; u++)
		dataAll.takeRow(pDataUnlabeled->row(u));
	for(size_t l = 0; l < labeledCount; l++)
		dataAll.takeRow(pDataLabeled->row(l));
	const size_t totalCount = dataAll.rows();

	// Describe only the feature columns to the neighbor finder
	int featureDims = pDataLabeled->cols() - labelDims;
	sp_relation pRelInputs = new GUniformRelation(featureDims, 0);
	dataAll.setRelation(pRelInputs);

	// Build the neighbor finder
	GNeighborFinder* pNF;
	if(m_intrinsicDims == 0)
		pNF = new GNeighborFinderCacheWrapper(new GKdTree(&dataAll, 0, m_friendCount, NULL, true), true);
	else
		pNF = new GManifoldNeighborFinder(
			&dataAll,
			m_friendCount, // littleK
			m_friendCount * 4, // bigK
			m_intrinsicDims, // intrinsicDims
			m_alpha, // alpha
			m_beta, // beta
			false, // prune?
			m_pRand);
	Holder<GNeighborFinder> hNF(pNF);
	GTEMPBUF(size_t, friends, m_friendCount);
	int labelValues = pDataLabeled->relation()->valueCount(featureDims);
	GTEMPBUF(double, votes, labelValues);

	// Work list: one row per unlabeled pattern, holding
	// [0] pattern index, [1] most likely label, [2] confidence
	GBitTable labeled(unlabeledCount);
	GData labelList(3);
	labelList.newRows(unlabeledCount);
	for(size_t u = 0; u < unlabeledCount; u++)
		labelList.row(u)[0] = u;

	const size_t maxCount = MAX(static_cast<size_t>(5), labeledCount / 5);
	while(labelList.rows() > 0)
	{
		// Pass 1: score each pattern that is still waiting for a label
		const size_t pending = labelList.rows();
		for(size_t r = 0; r < pending; r++)
		{
			double* pEntry = labelList.row(r);
			const size_t pattern = static_cast<size_t>(pEntry[0]);
			pNF->neighbors(friends, pattern);

			// Tally the votes. Originally-labeled neighbors count 1.0, neighbors
			// labeled in an earlier pass count 0.6, and unlabeled ones abstain.
			GVec::setAll(votes, 0.0, labelValues);
			for(int k = 0; k < m_friendCount; k++)
			{
				const size_t friendIndex = friends[k];
				if(friendIndex >= totalCount)
					continue; // not a valid row index
				double weight;
				if(friendIndex >= unlabeledCount)
					weight = 1.0;
				else if(labeled.bit(friendIndex))
					weight = 0.6;
				else
					continue;
				const int friendLabel = static_cast<int>(dataAll.row(friendIndex)[featureDims]);
				if(friendLabel >= 0 && friendLabel < labelValues)
					votes[friendLabel] += weight;
			}
			int winner = GVec::indexOfMax(votes, labelValues, m_pRand);
			double confidence = votes[winner];

			// Scale the confidence down for each neighbor that disagrees:
			// by 0.5 for an originally-labeled one, by 0.8 for one labeled earlier.
			for(int k = 0; k < m_friendCount; k++)
			{
				const size_t friendIndex = friends[k];
				if(friendIndex >= totalCount)
					continue;
				double penalty;
				if(friendIndex >= unlabeledCount)
					penalty = 0.5;
				else if(labeled.bit(friendIndex))
					penalty = 0.8;
				else
					continue;
				if(static_cast<int>(dataAll.row(friendIndex)[featureDims]) != winner)
					confidence *= penalty;
			}
			pEntry[1] = winner;
			pEntry[2] = confidence;
		}

		// Pass 2: order by the confidence column (presumably ascending, since
		// the most confident entries are then taken from the end of the list)
		labelList.sort(2);

		// Pass 3: commit labels from the end of the list. The counter is allowed
		// to reach maxCount, so up to maxCount + 1 rows are committed per pass.
		for(size_t taken = 0; labelList.rows() > 0 && taken <= maxCount; taken++)
		{
			const size_t last = labelList.rows() - 1;
			double* pEntry = labelList.row(last);
			const size_t pattern = static_cast<size_t>(pEntry[0]);
			const int chosen = static_cast<int>(pEntry[1]);
			pDataUnlabeled->row(pattern)[featureDims] = chosen;
			labeled.set(pattern);
			labelList.deleteRow(last);
		}
	}
}