/**
 * Copies the probability table of a node into a two-dimensional array.
 *
 * The node's flat matrix is cut into consecutive runs of
 * GetNumberOfOutcomes() elements; each run becomes one row of the result,
 * so the result has GetSize() / GetNumberOfOutcomes() rows.
 *
 * Follows the error convention of the learning routines in this file: if the
 * node cannot be found, or it has no matrix / no outcomes, a message is
 * printed and the program exits with status 1 (previously this dereferenced
 * a null pointer or divided by zero).
 *
 * @param network    network that contains the node
 * @param childName  identifier of the node whose table is copied
 * @return the table, one row per run of outcomes
 */
vf2D getCPTArray(DSL_network &network, string &childName) {
    const int childIdx = network.FindNode(childName.c_str());
    DSL_node *node = network.GetNode(childIdx);
    if (childIdx < 0 || !node) {
        cout << "Cannot find node " << childName << "... exiting." << endl;
        exit(1);
    }

    DSL_nodeDefinition *def = node->Definition();
    const DSL_Dmatrix *matrix = def->GetMatrix();
    const int colSize = def->GetNumberOfOutcomes();
    if (!matrix || colSize <= 0) {
        cout << "Node " << childName << " has no probability table... exiting." << endl;
        exit(1);
    }
    const DSL_Dmatrix &cpt = *matrix;

    const int rowSize = cpt.GetSize() / colSize;
    vf2D cpt2(rowSize, vf1D(colSize, 0.0));

    // Only whole rows are copied, so a matrix whose size is not a multiple of
    // colSize can never write past the last row.
    const int elemCount = rowSize * colSize;
    for (int elemIdx = 0; elemIdx < elemCount; ++elemIdx) {
        cpt2[elemIdx / colSize][elemIdx % colSize] = cpt[elemIdx];
    }
    return cpt2;
}
/**
 * Computes one weight per row of a node's probability table.
 *
 * For every row (a run of GetNumberOfOutcomes() matrix elements) the flat
 * index of the row's first element is converted to coordinates, and the
 * weight is the product, over all parents, of the parent's matrix entry at
 * the parent's coordinate for that row.
 *
 * NOTE(review): the parent matrix is indexed with a single coordinate, which
 * presumably assumes every parent is a root node whose matrix holds only its
 * prior distribution -- confirm against the networks used.
 *
 * Follows the error convention of the learning routines in this file: if the
 * node cannot be found, or it has no matrix / no outcomes, a message is
 * printed and the program exits with status 1 (previously this dereferenced
 * a null pointer, divided by zero, or looped forever on a zero step).
 *
 * @param network    network that contains the node
 * @param childName  identifier of the node whose rows are weighted
 * @return one weight per table row, in row order
 */
vf1D getWeights(DSL_network &network, string &childName) {
    const int childIdx = network.FindNode(childName.c_str());
    DSL_node *node = network.GetNode(childIdx);
    if (childIdx < 0 || !node) {
        cout << "Cannot find node " << childName << "... exiting." << endl;
        exit(1);
    }

    DSL_nodeDefinition *def = node->Definition();
    const DSL_Dmatrix *matrix = def->GetMatrix();
    const int colSize = def->GetNumberOfOutcomes();
    if (!matrix || colSize <= 0) {
        cout << "Node " << childName << " has no probability table... exiting." << endl;
        exit(1);
    }
    const DSL_Dmatrix &cpt = *matrix;

    const DSL_intArray &parents = network.GetParents(node->Handle());
    const int parentCount = parents.NumItems();

    const int rowSize = cpt.GetSize() / colSize;
    vf1D weights(rowSize, 0.0);

    DSL_intArray coords;
    for (int rowIdx = 0; rowIdx < rowSize; ++rowIdx) {
        // Coordinates of the first element of the row identify the parent
        // configuration the row belongs to.
        cpt.IndexToCoordinates(rowIdx * colSize, coords);

        double mult = 1.0;
        for (int parentIdx = 0; parentIdx < parentCount; ++parentIdx) {
            DSL_node *parentNode = network.GetNode(parents[parentIdx]);
            const DSL_Dmatrix &parent_cpt = *parentNode->Definition()->GetMatrix();
            mult *= parent_cpt[coords[parentIdx]];
        }
        weights[rowIdx] = mult;
    }
    return weights;
}
int main(int argc, char* argv[]) { ios_base::sync_with_stdio(0); string data_infile = string(argv[1]); string network_infile = string(argv[2]); string child_name = string("C1"); if (argv[3] == string("EM") || argv[3] == string("Smile")) { DSL_network net = LearnParamsEM(data_infile, network_infile, child_name); int childIdx = net.FindNode(child_name.c_str()); DSL_node* childNode = net.GetNode(childIdx); if (argv[3] == string("Smile")) { childNode->ChangeType(DSL_NOISY_MAX); } //printCPT(childNode); } else if (argv[3] == string("Naive")) { DSL_network net = LearnParamsNaive(data_infile, network_infile, child_name); //int childIdx = net.FindNode(child_name.c_str()); //DSL_node* childNode = net.GetNode(childIdx); //printCPT(childNode); } else if (argv[3] == string("Gauss")) { DSL_network net = LearnParamsGaussJordan(data_infile, network_infile, child_name); //int childIdx = net.FindNode(child_name.c_str()); //DSL_node* childNode = net.GetNode(childIdx); //printCPT(childNode); } else if (argv[3] == string("NvGJ")) { DSL_network netGJ = LearnParamsGaussJordan(data_infile, network_infile, child_name); DSL_network netNaive = LearnParamsNaive(data_infile, network_infile, child_name); DSL_network netEM = LearnParamsEM(data_infile, network_infile, child_name); DSL_network netOriginal = OpenNetwork(network_infile); vf1D weights = getWeights(netOriginal, child_name); vf2D cptGJ = getCPTArray(netGJ, child_name); vf2D cptNaive = getCPTArray(netNaive, child_name); vf2D cptEM = getCPTArray(netEM, child_name); vf2D cptOriginal = getCPTArray(netOriginal, child_name); cout << "Gauss:" << EuclidianDistance( cptOriginal, cptGJ, weights) << endl; cout << "Naive:" << EuclidianDistance( cptOriginal, cptNaive, weights) << endl; cout << "EM:" << EuclidianDistance( cptOriginal, cptEM, weights) << endl; //cout << "Euclidian distance (orig vs GJ): " << EuclidianDistance(original, netGJ, child_name) << endl; //cout << "Euclidian distance (orig vs Naive): " << EuclidianDistance(original, 
netNaive, child_name) << endl; //int childIdx = net.FindNode(child_name.c_str()); } return 0; }
void generateDatafile(int n, string inname) { DSL_network theNet; theNet.ReadFile(inname.c_str()); int cancer = theNet.FindNode("LungCancer"); DSL_node *node = theNet.GetNode(cancer); cptMap cpt1 = get_cptmap(node); keyMap km1; km1[string("Smoker")] = string("True"); km1[string("Genetic")] = string("True"); km1[string("CoalWorker")] = string("True"); km1[string("BadDiet")] = string("True"); printMap(km1); cout << "SIZE: "<< cpt1.size() << endl; cout << cpt1[km1]["True"]; }
/**
 * Learns the parameters of a network from a data file with DSL_em.
 *
 * Steps: read the data set, read the network (XDSL format), change the child
 * node's type to DSL_CPT, match the data set against the network, then run
 * EM with uniformized and randomized parameters, seed 0 and an equivalent
 * sample size of 1.
 *
 * Any failure prints a message and terminates the program with exit(1).
 * A child node that cannot be found is now reported the same way
 * (previously it was dereferenced unchecked), and the error text filled in
 * by MatchNetwork is printed when matching fails.
 *
 * @param data_infile     path of the data file
 * @param network_infile  path of the network file
 * @param child_name      identifier of the node converted to DSL_CPT
 * @return the network with learned parameters
 */
DSL_network LearnParamsEM(string data_infile, string network_infile, string child_name) {
    DSL_dataset ds;
    if (ds.ReadFile(data_infile.c_str()) != DSL_OKAY) {
        cout << "Cannot read data file... exiting." << endl;
        exit(1);
    }

    DSL_network originalNet;
    if (originalNet.ReadFile(network_infile.c_str(), DSL_XDSL_FORMAT) != DSL_OKAY) {
        cout << "Cannot read network... exiting." << endl;
        exit(1);
    }

    const int childIdx = originalNet.FindNode(child_name.c_str());
    DSL_node *childNode = originalNet.GetNode(childIdx);
    if (childIdx < 0 || !childNode) {
        cout << "Cannot find child node... exiting." << endl;
        exit(1);
    }
    childNode->ChangeType(DSL_CPT);

    vector<DSL_datasetMatch> matches;
    string err;
    if (ds.MatchNetwork(originalNet, matches, err) != DSL_OKAY) {
        cout << "Cannot match network... exiting." << endl;
        if (!err.empty()) {
            cout << err << endl;
        }
        exit(1);
    }

    DSL_em em;
    em.SetUniformizeParameters(true);
    em.SetRandomizeParameters(true);
    em.SetSeed(0);
    em.SetEquivalentSampleSize(1);
    if (em.Learn(ds, originalNet, matches) != DSL_OKAY) {
        cout << "Cannot learn parameters... exiting." << endl;
        exit(1);
    }
    return originalNet;
}
/**
 * Loads the data set and the network and precomputes the bookkeeping used
 * for learning the parameters of a noisy-MAX child node.
 *
 * Fills, in order: the data set, the network, the data/network matches and
 * the two lookup tables built from them, the child node and its noisy-MAX
 * definition, the parent handles and dimensions, the parent outcome
 * strengths, the distinguished state of every parent, the per-parent column
 * offsets (plus one final offset for the LEAK column), the length of one
 * parameter row and the minimal number of parameters.
 *
 * Any failure prints a message and terminates the program with exit(1).
 * A child node that cannot be found is now reported the same way
 * (previously it was dereferenced unchecked).
 *
 * NOTE(review): childMAXDefinition is allocated with new here; confirm that
 * the class releases it.
 * NOTE(review): distinguishedStates is indexed by parent index but is not
 * sized in this constructor; confirm it is a map or is sized elsewhere.
 *
 * @param data_infile     path of the data file
 * @param network_infile  path of the network file (XDSL format)
 * @param child_name      identifier of the noisy-MAX child node
 */
LearningInfo(string data_infile, string network_infile, string child_name) {
    if (dataSet.ReadFile(data_infile.c_str()) != DSL_OKAY) {
        cout << "Cannot read data file... exiting." << endl;
        exit(1);
    }
    if (originalNet.ReadFile(network_infile.c_str(), DSL_XDSL_FORMAT) != DSL_OKAY) {
        cout << "Cannot read network... exiting." << endl;
        exit(1);
    }
    string err;
    if (dataSet.MatchNetwork(originalNet, matches, err) != DSL_OKAY) {
        cout << "Cannot match network... exiting." << endl;
        exit(1);
    }

    // Lookup tables in both directions between network nodes and data columns.
    for(unsigned int i=0 ; i < matches.size() ; ++i) {
        matchNetToData[matches[i].node] = matches[i].column;
        matchDataToNet[matches[i].column] = matches[i].node;
    }

    childIdx = originalNet.FindNode(child_name.c_str());
    childNode = originalNet.GetNode(childIdx);
    if (childIdx < 0 || !childNode) {
        cout << "Cannot find child node... exiting." << endl;
        exit(1);
    }
    if (childNode->Definition()->GetType() != (DSL_CHANCE | DSL_DISCRETE | DSL_NOISY_MAX) ) {
        cout << "Child should be a NoisyMAX... exiting" << endl; // alternatively, change it to noisy-max manually
        exit(1);
    }
    childMAXDefinition = new DSL_noisyMAX(*(childNode->Definition()));

    DSL_intArray &parents = originalNet.GetParents(childNode->Handle());
    numberOfParents = parents.NumItems();
    parentIndices = vector<int>(numberOfParents, 0);
    for(int i=0; i<numberOfParents; ++i)
        parentIndices[i] = parents[i];

    childDimension = childNode->Definition()->GetNumberOfOutcomes();
    parentDimensions = vector<int>(numberOfParents, 0);
    sumParentDimensions = 0;
    parentOutcomesStrengths = vector<DSL_intArray>(numberOfParents);
    minimalNumberOfParameters = 1; // minimal number of unique parameters to calculate (count leak right away)
    for(int parentIdx = 0 ; parentIdx < numberOfParents ; ++parentIdx) {
        DSL_node *parentNode = originalNet.GetNode(parentIndices[parentIdx]);
        // parent dimension is equal to the number of outcomes
        sumParentDimensions += (parentDimensions[parentIdx] = parentNode->Definition()->GetNumberOfOutcomes());
        parentOutcomesStrengths[parentIdx] = childMAXDefinition->GetParentOutcomeStrengths(parentIdx);
        // (each parent dimension reduced by one) because we don't count distinguished states of parents
        minimalNumberOfParameters += parentDimensions[parentIdx] - 1;
        distinguishedStates[parentIdx] = parentOutcomesStrengths[parentIdx][parentDimensions[parentIdx] - 1];
    }

    int sumOffset = 0;
    parameterRowOffset = vi1D(numberOfParents + 1, 0); // +1 so we know the offset for LEAK column
    for(int parentIdx = 0; parentIdx < numberOfParents ; ++parentIdx) {
        parameterRowOffset[parentIdx] = sumOffset;
        sumOffset += parentDimensions[parentIdx] - 1;
    }
    parameterRowOffset[numberOfParents] = sumOffset;

    parametersRowLength = minimalNumberOfParameters;
    minimalNumberOfParameters *= (childDimension - 1); // number of unique rows, last row is always 1.0 - sum
}
/**
 * Creates one tabular CPD per SMILE node and attaches it to the PNL network.
 *
 * For every node the domain is the node's parents (in the order reported by
 * the DSL network, because the probabilities follow that ordering) followed
 * by the node itself. The SMILE matrix is copied to a float array; every run
 * of "number of own states" values that does not sum to exactly 1 after the
 * double-to-float conversion is renormalised.
 *
 * Changes relative to the previous version:
 *  - the temporary arrays are released on every path, including the early
 *    return taken when CTabularCPD::Create fails (they used to leak there);
 *  - a node or matrix that cannot be obtained is reported instead of being
 *    dereferenced through an invalid pointer;
 *  - a run that sums to zero is left untouched instead of being divided by
 *    zero.
 *
 * NOTE(review): parent entries of the domain are DSL handles while the node's
 * own entry is the loop index i; this presumably relies on handles matching
 * the order of theIds -- confirm.
 *
 * @param dslNet   source SMILE network
 * @param pnlBNet  destination PNL network; its factors are allocated here
 */
void DSLPNLConverter::CreateFactors(DSL_network& dslNet, CBNet* pnlBNet)
{
    int j, k;

    // Read number of nodes in the net
    const int numberOfNodes = dslNet.GetNumberOfNodes();

    // This is a way PNL likes it
    pnlBNet->AllocFactors();

    for (int i = 0; i < numberOfNodes; i++)
    {
        // Look the node up once; the handle is needed for parents and CPT.
        const int nodeHandle = dslNet.FindNode(theIds[i]);
        DSL_node* dslNode = dslNet.GetNode(nodeHandle);

        // Read CPT from SMILE (done before any allocation so that a failure
        // needs no cleanup)
        DSL_Dmatrix* dslMatrix = NULL;
        if (dslNode != NULL)
            dslNode->Definition()->GetDefinition(&dslMatrix);
        if (dslMatrix == NULL)
        {
            std::cout << "We got a problem with reading CPT" << std::endl;
            return;
        }

        // Get parents of the ith node
        // IMPORTANT -- we should preserve order from DSL_network, since
        // probabilities will be according DSL ordering
        DSL_intArray dslParents;
        dslParents = dslNet.GetParents(nodeHandle);

        // establish sizes and allocate memory
        const int numberOfNodesInDomain = dslParents.NumItems() + 1;
        int* domain = new int[numberOfNodesInDomain];
        CNodeType** nodeTypes = new CNodeType*[numberOfNodesInDomain];

        // establish members of the domain
        for (j = 0; j < numberOfNodesInDomain - 1; j++)
            domain[j] = dslParents[j];
        domain[numberOfNodesInDomain - 1] = i;

        // Fill up node types
        for (j = 0; j < numberOfNodesInDomain; j++)
            nodeTypes[j] = const_cast <CNodeType*> (pnlBNet->GetNodeType(domain[j]));

        // Alloc space for CPT
        const int sizeOfCPT = dslMatrix->GetSize();
        float* flatCPT = new float[sizeOfCPT];

        // Here we convert 'copy' numbers from SMILE to PNL
        // The painful part is convert double to float.
        // Additionally we check if after conversion they sum-up to 1
        const int numberOfMyStates = nodeTypes[numberOfNodesInDomain - 1]->GetNodeSize();
        const int iterations = sizeOfCPT / numberOfMyStates;
        for (j = 0; j < iterations; j++)
        {
            float* row = flatCPT + j * numberOfMyStates;
            float sum = 0.0f;
            for (k = 0; k < numberOfMyStates; k++)
            {
                row[k] = static_cast <float> (dslMatrix->Subscript(j * numberOfMyStates + k));
                sum += row[k];
            }
            // Renormalise only when there is something to divide by.
            if (sum != 1.0f && sum != 0.0f)
            {
                for (k = 0; k < numberOfMyStates; k++)
                    row[k] /= sum;
            }
        }

#ifdef DSLPNL_DEBUG
        std::cerr << "Node "<< i << " domain : ";
        for (j = 0; j < numberOfNodesInDomain; j++)
        {
            std::cerr << domain[j] << " ";
        }
        std::cerr << std::endl;
        for (j = 0; j < sizeOfCPT; j++)
            std::cerr << flatCPT[j] << " ";
        std::cerr << std::endl;
#endif

        CModelDomain* pMD = pnlBNet->GetModelDomain();
        CCPD* pnlCPD = CTabularCPD::Create(domain, numberOfNodesInDomain, pMD, flatCPT);
        const bool created = (pnlCPD != NULL);
        if (created)
            pnlBNet->AttachFactor(pnlCPD);
        else
            std::cout << "We got a problem with creating CPD" << std::endl;

        // Single cleanup point shared by the success and the failure path.
        delete[] nodeTypes;
        delete[] domain;
        delete[] flatCPT;

        if (!created)
            return;
    }
}
/**
 * Learns the parameters of a network from a data set with EM and writes the
 * result next to the input network with a "res_" file-name prefix.
 *
 * Matching of data columns to network nodes:
 *  - for every '_' in the column id, the text before it is tried as a node
 *    id; on the first hit the text after that '_' is parsed with atoi and
 *    stored as the slice;
 *  - otherwise the whole column id is tried as a node id with slice 0;
 *  - columns that match no node are dropped.
 * For every matched column the data-set state indices are then remapped to
 * the indices of the equally named node outcomes (-1 when no outcome has
 * that name).
 *
 * Changes relative to the previous version:
 *  - the node-id prefix is a std::string; the malloc'd buffer used before
 *    leaked whenever the prefix was not a node id;
 *  - a record value outside the state-name table is left unchanged instead
 *    of indexing past the end of the remap table;
 *  - the unused GetTemporalOrders query was removed.
 *
 * NOTE(review): the result of WriteFile is still not checked; a failed write
 * returns the path of a file that was not produced.
 *
 * @param theNet       path of the network file (XDSL format)
 * @param trainingSet  path of the data file
 * @return the path the learned network was written to
 */
string dynamicEMTraining(string theNet, string trainingSet){
    // open the data set:
    DSL_dataset ds;
    if (ds.ReadFile(trainingSet.c_str()) != DSL_OKAY) {
        cout << "Cannot read data file... exiting." << endl;
        exit(1);
    }

    // open the network:
    DSL_network net;
    if (net.ReadFile(theNet.c_str(), DSL_XDSL_FORMAT) != DSL_OKAY) {
        cout << "Cannot read network... exiting." << endl;
        exit(1);
    }

    // match the data set and the network (variables):
    vector<DSL_datasetMatch> dsMap(ds.GetNumberOfVariables());
    int varCnt = 0; // the number of variables occurring both in the data set and the network
    for (int i = 0; i < ds.GetNumberOfVariables(); i++) {
        const string id = ds.GetId(i);
        bool done = false;

        // Try every "<node id>_<slice>" split of the column id.
        for (string::size_type j = 0; j < id.size() && !done; j++) {
            if (id[j] != '_') {
                continue;
            }
            const string nodeId = id.substr(0, j);
            const int nodeHdl = net.FindNode(nodeId.c_str());
            if (nodeHdl >= 0) {
                dsMap[varCnt].node = nodeHdl;
                dsMap[varCnt].slice = atoi(id.c_str() + j + 1);
                dsMap[varCnt].column = i;
                varCnt++;
                done = true;
            }
        }

        // Fall back to the whole column id.
        if (!done) {
            const int nodeHdl = net.FindNode(id.c_str());
            if (nodeHdl >= 0) {
                dsMap[varCnt].node = nodeHdl;
                dsMap[varCnt].slice = 0;
                dsMap[varCnt].column = i;
                varCnt++;
            }
        }
    }
    dsMap.resize(varCnt);

    // match the data set and the network (states):
    for (vector<DSL_datasetMatch>::size_type i = 0; i < dsMap.size(); i++) {
        const DSL_datasetMatch &m = dsMap[i];
        const int colIdx = m.column;
        DSL_idArray* ids = net.GetNode(m.node)->Definition()->GetOutcomesNames();
        const DSL_datasetVarInfo &varInfo = ds.GetVariableInfo(colIdx);
        const vector<string> &stateNames = varInfo.stateNames;

        // stateMap[data-set state index] == node outcome index, or -1.
        vector<int> stateMap(stateNames.size(), -1);
        for (int j = 0; j < static_cast<int>(stateNames.size()); j++) {
            for (int k = 0; k < ids->NumItems(); k++) {
                const char* outcomeId = (*ids)[k];
                if (stateNames[j] == outcomeId) {
                    stateMap[j] = k;
                }
            }
        }

        for (int k = 0; k < ds.GetNumberOfRecords(); k++) {
            const int value = ds.GetInt(colIdx, k);
            if (value >= 0 && value < static_cast<int>(stateMap.size())) {
                ds.SetInt(colIdx, k, stateMap[value]);
            }
        }
    }

    // learn parameters:
    DSL_em em;
    if (em.Learn(ds, net, dsMap) != DSL_OKAY) {
        cout << "Cannot learn parameters... exiting." << endl;
        exit(1);
    }

    // Insert "res_" right after the last '/', or at the front when the path
    // has no '/' (npos + 1 wraps around to 0).
    theNet.insert(theNet.find_last_of('/') + 1, "res_");
    net.WriteFile(theNet.c_str(), DSL_XDSL_FORMAT);
    return theNet;
}