/*
 * Allocates a dictionary node labelled with the character c.
 * The returned node has no child, no sibling and an index of -1.
 * Memory comes from monmalloc().
 */
TypeDictNode *newDictNode(char c)
{
    TypeDictNode *node = (TypeDictNode *) monmalloc(sizeof(TypeDictNode));

    node->index = -1;
    node->sibling = NULL;
    node->child = NULL;
    node->symbol = c;
    return node;
}
/*
 * Scores a set of (decoded) sequences.
 *
 * For every symbol below dec->cardinal, counts the number of sequences it
 * occurs in. A symbol seen twice in the same sequence is discarded for good.
 * The result is the sum, over the symbols that were not discarded, of
 * count * (count - 1).
 */
double score(TypeSetOfSequences *dec)
{
    TypeNumber seq, *lastSeen, *count;
    TypePosition pos;
    int *repeated;
    TypeSymbol sym;
    double result;

    lastSeen = (TypeNumber *) monmalloc(dec->cardinal * sizeof(TypeNumber));
    count = (TypeNumber *) monmalloc(dec->cardinal * sizeof(TypeNumber));
    repeated = (int *) monmalloc(dec->cardinal * sizeof(int));
    for (sym = 0; sym < dec->cardinal; sym++) {
        lastSeen[sym] = -1;
        count[sym] = 0;
        repeated[sym] = 0;
    }

    for (seq = 0; seq < dec->number; seq++) {
        for (pos = 0; pos < dec->size[seq]; pos++) {
            /* Ignore symbols outside the alphabet and symbols already discarded. */
            if (dec->sequence[seq][pos] >= dec->cardinal || repeated[dec->sequence[seq][pos]])
                continue;
            if (lastSeen[dec->sequence[seq][pos]] == seq) {
                /* Second occurrence in the current sequence: discard the symbol. */
                repeated[dec->sequence[seq][pos]] = 1;
                continue;
            }
            lastSeen[dec->sequence[seq][pos]] = seq;
            count[dec->sequence[seq][pos]]++;
        }
    }

    result = 0;
    for (sym = 0; sym < dec->cardinal; sym++) {
        if (repeated[sym])
            continue;
        result += (pow(count[sym], 1) * (count[sym] - 1));
    }

    monfree((void *) repeated);
    monfree((void *) count);
    monfree((void *) lastSeen);
    return result;
}
/*
 * Alternative score of a set of (decoded) sequences.
 *
 * For every symbol below dec->cardinal, records the total number of
 * occurrences and the number of distinct sequences containing it. Symbols
 * occurring more than once, and fewer than three times the number of
 * sequences containing them, contribute the square root of their occurrence
 * count to the result.
 */
double scorebis(TypeSetOfSequences *dec)
{
    TypeNumber seq, *lastSeq, *inSequences;
    TypePosition pos, *occurrences;
    TypeSymbol sym;
    double result;

    lastSeq = (TypeNumber *) monmalloc(dec->cardinal * sizeof(TypeNumber));
    inSequences = (TypeNumber *) monmalloc(dec->cardinal * sizeof(TypeNumber));
    occurrences = (TypePosition *) monmalloc(dec->cardinal * sizeof(TypePosition));
    for (sym = 0; sym < dec->cardinal; sym++) {
        inSequences[sym] = 0;
        occurrences[sym] = 0;
        lastSeq[sym] = -1;
    }

    for (seq = 0; seq < dec->number; seq++) {
        for (pos = 0; pos < dec->size[seq]; pos++) {
            if (dec->sequence[seq][pos] >= dec->cardinal)
                continue;
            /* First time this symbol is met in the current sequence. */
            if (lastSeq[dec->sequence[seq][pos]] != seq) {
                lastSeq[dec->sequence[seq][pos]] = seq;
                inSequences[dec->sequence[seq][pos]]++;
            }
            occurrences[dec->sequence[seq][pos]]++;
        }
    }

    result = 0;
    for (sym = 0; sym < dec->cardinal; sym++) {
        if (occurrences[sym] > 1 && occurrences[sym] < 3 * inSequences[sym])
            result += (pow(occurrences[sym], 0.5));
    }

    monfree((void *) lastSeq);
    monfree((void *) occurrences);
    monfree((void *) inSequences);
    return result;
}
/*
 * Looks up name in the index through getIndexString() and returns the
 * resulting index. When the lookup made species->size grow, a private copy
 * of name is stored in species->name at that index, enlarging the name
 * table by INC_BUFFER_UTILS entries when needed.
 * Returns -1 for an empty name.
 */
int addIndex(char *name, TypeIndex *species)
{
    int previousSize, slot;

    if (name[0] == '\0')
        return -1;

    previousSize = species->size;
    slot = getIndexString(name, species->dict, &(species->size));
    if (species->size <= previousSize)
        return slot;    /* size did not grow: nothing to store */

    if (previousSize >= species->buffer) {
        species->buffer += INC_BUFFER_UTILS;
        species->name = (char **) monrealloc(species->name, species->buffer * sizeof(char *));
    }
    species->name[slot] = (char *) monmalloc((strlen(name) + 1) * sizeof(char));
    strcpy(species->name[slot], name);
    return slot;
}
int main(int argc, char **argv) { TypePosition orderstart=1, orderend=10; char option[256], inputFileName[SIZE_BUFFER_CHAR], outputFileName[SIZE_BUFFER_CHAR], bufferOutput[SIZE_BUFFER_CHAR], *table, outputFormat = 'r', typeDec = 'l', typeAlphabet = '?', typeCalc = 'g'; TypeSetOfSequences *set; TypeAlignment aln, atmp; int fixed = 0; double threshold = 0.001, tmin = 0.00000000001, tmax=0.1, tstep = 0.00001; double thre; TypeNumber n; TypeDistance distA, distB, distC; TypePosition l, tot; double corrAB, corrAC; TypeSetOfSequences *dec; TypeMarkovModel *model; FILE *fi, *fo; int i = 1, typeDist = 0; for(i=0; i<256; i++) option[i] = 0; for(i=1; i<argc && *(argv[i]) == '-'; i++) { int j; for(j=1; argv[i][j] != '\0'; j++) option[argv[i][j]] = 1; if(option['f']) { option['f'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &outputFormat) == 1) i++; } if(option['s']) { option['s'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeAlphabet) == 1) i++; } if(option['c']) { option['c'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeCalc) == 1) i++; } if(option['m']) { option['m'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%d", &typeDist) == 1) i++; if(typeDist >= MAX_FUNC) typeDist = 0; } if(option['t']) { option['t'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%lf", &threshold) == 1) i++; } if(option['h']) { printf("%s\n", HELPMESSAGE); exitProg(ExitOk, NULL); } } if (i>=argc || sscanf(argv[i++], "%s", inputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); if (i>=argc || sscanf(argv[i++], "%s", outputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); switch(typeAlphabet) { case 'd': table = (char*) monmalloc((strlen(DNA)+1)*sizeof(char)); strcpy(table, DNA); break; case 'r': table = (char*) monmalloc((strlen(RNA)+1)*sizeof(char)); strcpy(table, RNA); break; case 'p': table = (char*) monmalloc((strlen(PRO)+1)*sizeof(char)); strcpy(table, PRO); break; case '?': default: table = (char*) monmalloc(sizeof(char)); table[0] = '\0'; } if(fi = fopen(inputFileName, "r")) { aln = 
readAlignement(fi, table, typeAlphabet == '?'); switch(typeAlphabet) { case 'd': case 'r': aln.ambiguity = getXNAAmbiguity(); break; case 'p': aln.ambiguity = getProteinAmbiguity(); break; case '?': default: aln.ambiguity.number = 0; } aln.cardinal -= aln.ambiguity.number; fclose(fi); } else { exitProg(ErrorReading, inputFileName); } if(!(fo = fopen(outputFileName, "w"))) exitProg(ErrorWriting, outputFileName); fixAlignmentAmbiguity(&aln); set=toSequences(&aln); model = estimateMarkovModel(set); distA = computeWholeDistancePairAln(aln, computeNorm1Aln); dec = getDecodedFromThreshold(set, threshold, model); distB = computeWholeDistanceDec(dec); corrAB = computeCorrelation(distA, distB); //printf("%lE\n", corrAB); monfree((void*)distB.table); for(n=0; n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); distC = computeMSMDistance(set); corrAC = computeCorrelation(distA, distC); monfree((void*)distC.table); monfree((void*)distA.table); fprintf(fo, "%lE\t%lE\n", corrAB, corrAC); fprintf(stdout, "%lE\t%lE\n", corrAB, corrAC); monfree((void*)set->size); for(n=0; n<set->number; n++) monfree((void*)set->sequence[n]); monfree((void*)set->sequence); monfree((void*)set); fclose(fo); /* sprintf(bufferOutput, "%s_Ali.nex", outputFileName); if(!(fo = fopen(bufferOutput, "w"))) exitProg(ErrorWriting, bufferOutput); printDistanceNexus(fo, distA); fclose(fo); sprintf(bufferOutput, "%s_New.nex", outputFileName); if(!(fo = fopen(bufferOutput, "w"))) exitProg(ErrorWriting, bufferOutput); printDistanceNexus(fo, distB); fclose(fo); */ exitProg(ExitOk,NULL); return 0; }
/*
 * Prepares an empty index: no entry, a name table with room for
 * INC_BUFFER_UTILS pointers, and a dictionary reduced to a single node
 * created by newDictNode('x').
 */
void initIndex(TypeIndex *index)
{
    index->buffer = INC_BUFFER_UTILS;
    index->size = 0;
    index->name = (char **) monmalloc(index->buffer * sizeof(char *));
    index->dict = newDictNode('x');
}
int main(int argc, char **argv) { TypePosition orderstart=1, orderend=10; char option[256], inputFileName[SIZE_BUFFER_CHAR], outputFileName[SIZE_BUFFER_CHAR], bufferOutput[SIZE_BUFFER_CHAR], *table, outputFormat = 'r', typeDec = 'l', typeAlphabet = '?', typeCalc = 'g'; TypeSetOfSequences set; int fixed = 0; double threshold = 0.001; /* TypeDistFunction *distfunc[MAX_FUNC]= {computeProba, computeKullbackLeiber1, computePham, computeCommon, computeCommonBis, computeGillesPham, computeMatchesBis, computeMatches, computeAlex, computeAlexBis}; */ FILE *fi, *fo; int i = 1, typeDist = 0; for(i=0; i<256; i++) option[i] = 0; for(i=1; i<argc && *(argv[i]) == '-'; i++) { int j; for(j=1; argv[i][j] != '\0'; j++) option[argv[i][j]] = 1; if(option['f']) { option['f'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &outputFormat) == 1) i++; } if(option['s']) { option['s'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeAlphabet) == 1) i++; } if(option['c']) { option['c'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeCalc) == 1) i++; } if(option['m']) { option['m'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%d", &typeDist) == 1) i++; if(typeDist >= MAX_FUNC) typeDist = 0; } if(option['t']) { option['t'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%lf", &threshold) == 1) i++; } if(option['h']) { printf("%s\n", HELPMESSAGE); exitProg(ExitOk, NULL); } } if (i>=argc || sscanf(argv[i++], "%s", inputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); if (i>=argc || sscanf(argv[i++], "%s", outputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); switch(typeAlphabet) { case 'd': table = (char*) monmalloc((strlen(DNA)+1)*sizeof(char)); strcpy(table, DNA); break; case 'r': table = (char*) monmalloc((strlen(RNA)+1)*sizeof(char)); strcpy(table, RNA); break; case 'p': table = (char*) monmalloc((strlen(PRO)+1)*sizeof(char)); strcpy(table, PRO); break; case '?': default: table = (char*) monmalloc(sizeof(char)); table[0] = '\0'; } if(fi = fopen(inputFileName, "r")) { set = 
readSequencesFasta(fi, table, typeAlphabet == '?'); switch(typeAlphabet) { case 'd': case 'r': set.ambiguity = getXNAAmbiguity(); break; case 'p': set.ambiguity = getProteinAmbiguity(); break; case '?': default: set.ambiguity.number = 0; } set.cardinal -= set.ambiguity.number; fclose(fi); } else { exitProg(ErrorReading, inputFileName); } if(fo = fopen(outputFileName, "w")) { TypeDistance dist; fixSequencesAmbiguity(&set); dist = computeMSMDistance(&set); switch(outputFormat) { case 't': printDistanceTable(fo, dist); break; case 'r': printDistanceRaw(fo, dist); break; case 'p': printDistancePhylip(fo, dist); break; case 'n': printDistanceNexus(fo, dist); } fclose(fo); } else { exitProg(ErrorWriting, outputFileName); } exitProg(ExitOk,NULL); return 0; }
int main(int argc, char **argv) { TypePosition orderstart=1, orderend=10, length = 10; char option[256], inputFileName[SIZE_BUFFER_CHAR], outputFileName[SIZE_BUFFER_CHAR], bufferOutput[SIZE_BUFFER_CHAR], *table, outputFormat = 'r', typeDec = 'l', typeAlphabet = 'd', typeCalc = 'g'; TypeSetOfSequences set; int fixed = 0, flagThre = 1; double threshold = 0.001, tmin = -25, tmax = -3, tprec = 0.5; /* TypeDistFunction *distfunc[MAX_FUNC]= {computeProba, computeKullbackLeiber1, computePham, computeCommon, computeCommonBis, computeGillesPham, computeMatchesBis, computeMatches, computeAlex, computeAlexBis}; */ FILE *fi, *fo; int i = 1, typeDist = 0; for(i=0; i<256; i++) option[i] = 0; for(i=1; i<argc && *(argv[i]) == '-'; i++) { int j; for(j=1; argv[i][j] != '\0'; j++) option[argv[i][j]] = 1; if(option['f']) { option['f'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &outputFormat) == 1) i++; } if(option['s']) { option['s'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeAlphabet) == 1) i++; } if(option['c']) { option['c'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeCalc) == 1) i++; } if(option['m']) { option['m'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%d", &typeDist) == 1) i++; if(typeDist >= MAX_FUNC) typeDist = 0; } if(option['t']) { option['t'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%lf", &threshold) == 1) i++; flagThre = 1; } if(option['l']) { option['l'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%ld", &length) == 1) i++; flagThre = 0; } if(option['h']) { printf("%s\n", HELPMESSAGE); exitProg(ExitOk, NULL); } } if (i>=argc || sscanf(argv[i++], "%s", inputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); if (i>=argc || sscanf(argv[i++], "%s", outputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); switch(typeAlphabet) { case 'd': table = (char*) monmalloc((strlen(DNA)+1)*sizeof(char)); strcpy(table, DNA); break; case 'r': table = (char*) monmalloc((strlen(RNA)+1)*sizeof(char)); strcpy(table, RNA); break; case 'p': table = (char*) 
monmalloc((strlen(PRO)+1)*sizeof(char)); strcpy(table, PRO); break; case '?': default: table = (char*) monmalloc(sizeof(char)); table[0] = '\0'; } if(fi = fopen(inputFileName, "r")) { set = readSequencesFasta(fi, table, typeAlphabet == '?'); switch(typeAlphabet) { case 'd': case 'r': set.ambiguity = getXNAAmbiguity(); break; case 'p': set.ambiguity = getProteinAmbiguity(); break; case '?': default: set.ambiguity.number = 0; } set.cardinal -= set.ambiguity.number; fclose(fi); } else { exitProg(ErrorReading, inputFileName); } if(fo = fopen(outputFileName, "w")) { TypeSetOfSequences *dec; TypeDistance dist; double tmid, t, smax, tres, sc, scl, scr; TypeNumber n; TypeCodeScheme *scheme; TypeMarkovModel *model; fixSequencesAmbiguity(&set); scheme = (TypeCodeScheme*) monmalloc(sizeof(TypeCodeScheme)); scheme->suffixTree = getSuffixTree(&set); scheme->code = (TypePosition*) monmalloc(scheme->suffixTree->size*sizeof(TypePosition)); scheme->buffSize = INC_SIZE_CODE; scheme->lengthCode = (TypePosition*) monmalloc(scheme->buffSize*sizeof(TypePosition)); model = estimateMarkovModel(&set); while((tmax-tmin)>4*tprec) { tmid = (tmax+tmin)/2.; scheme->cardCode = 0; buildCodeThreshold(exp(tmid-3.*tprec/2.), scheme->suffixTree->root, 0, 1., model, scheme); dec = getDecodedFromScheme(scheme); scl = score(dec); for(n=0; n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); scheme->cardCode = 0; buildCodeThreshold(exp(tmid+3.*tprec/2.), scheme->suffixTree->root, 0, 1., model, scheme); dec = getDecodedFromScheme(scheme); scr = score(dec); for(n=0; n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); if(scl>scr) tmax = tmid+3.*tprec/2.; else tmin = tmid-3.*tprec/2.; } if(scl>scr) { smax = scl; tres = exp(tmid-3.*tprec/2.); } else { smax = scr; tres = exp(tmid+3.*tprec/2.); } scheme->cardCode = 0; 
buildCodeThreshold(exp(tmid), scheme->suffixTree->root, 0, 1., model, scheme); dec = getDecodedFromScheme(scheme); sc = score(dec); for(n=0; n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); if(sc>smax) { smax = scl; tres = exp(tmid); } printf("%.4lE\t%lf\n", tres, smax); scheme->cardCode = 0; buildCodeThreshold(tres, scheme->suffixTree->root, 0, 1., model, scheme); dec = getDecodedFromScheme(scheme); freeModel(model); freeScheme(scheme); dist = computeWholeDistanceDec(dec); switch(outputFormat) { case 't': printDistanceTable(fo, dist); break; case 'r': printDistanceRaw(fo, dist); break; case 'p': printDistancePhylip(fo, dist); break; case 'n': printDistanceNexus(fo, dist); } fclose(fo); } else { exitProg(ErrorWriting, outputFileName); } exitProg(ExitOk,NULL); return 0; }
int main(int argc, char **argv) { TypePosition orderstart=1, orderend=10; char option[256], inputFileName[SIZE_BUFFER_CHAR], outputFileName[SIZE_BUFFER_CHAR], bufferOutput[SIZE_BUFFER_CHAR], *table, outputFormat = 'r', typeDec = 'l', typeAlphabet = '?', typeCalc = 'g', type = 't'; TypeSetOfSequences *set, seq; TypeAlignment aln, atmp; int fixed = 0; double threshold = 0.001, tmin = 1E-20, tmax=0.1, tstep = 0.00001, qmin = -25, qmax = -3, qprec = 0.5; double thre; TypeNumber n; TypeDistance distA, distB; TypePosition l, tot, lmax = 50; TypeSuffixTree *suffixTree; TypeMarkovModel *model; TypeCodeScheme *scheme; /* TypeDistFunction *distfunc[MAX_FUNC]= {computeProba, computeKullbackLeiber1, computePham, computeCommon, computeCommonBis, computeGillesPham, computeMatchesBis, computeMatches, computeAlex, computeAlexBis}; */ FILE *fi, *fo; int i = 1, typeDist = 0; for(i=0; i<256; i++) option[i] = 0; for(i=1; i<argc && *(argv[i]) == '-'; i++) { int j; for(j=1; argv[i][j] != '\0'; j++) option[argv[i][j]] = 1; if(option['f']) { option['f'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &outputFormat) == 1) i++; } if(option['s']) { option['s'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeAlphabet) == 1) i++; } if(option['c']) { option['c'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &typeCalc) == 1) i++; } if(option['m']) { option['m'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%lf", &tmin) == 1) i++; if(typeDist >= MAX_FUNC) typeDist = 0; } if(option['t']) { option['t'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%lf", &threshold) == 1) i++; } if(option['y']) { option['y'] = 0; if((i+1)<argc && sscanf(argv[i+1], "%c", &type) == 1) i++; } if(option['h']) { printf("%s\n", HELPMESSAGE); exitProg(ExitOk, NULL); } } if (i>=argc || sscanf(argv[i++], "%s", inputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); if (i>=argc || sscanf(argv[i++], "%s", outputFileName) != 1) exitProg(ErrorArgument, HELPMESSAGE); switch(typeAlphabet) { case 'd': table = (char*) 
monmalloc((strlen(DNA)+1)*sizeof(char)); strcpy(table, DNA); break; case 'r': table = (char*) monmalloc((strlen(RNA)+1)*sizeof(char)); strcpy(table, RNA); break; case 'p': table = (char*) monmalloc((strlen(PRO)+1)*sizeof(char)); strcpy(table, PRO); break; case '?': default: table = (char*) monmalloc(sizeof(char)); table[0] = '\0'; } if(fi = fopen(inputFileName, "r")) { aln = readAlignement(fi, table, typeAlphabet == '?'); switch(typeAlphabet) { case 'd': case 'r': aln.ambiguity = getXNAAmbiguity(); break; case 'p': aln.ambiguity = getProteinAmbiguity(); break; case '?': default: aln.ambiguity.number = 0; } aln.cardinal -= aln.ambiguity.number; fclose(fi); } else { exitProg(ErrorReading, inputFileName); } fixAlignmentAmbiguity(&aln); set=toSequences(&aln); if(!(fo = fopen(outputFileName, "w"))) exitProg(ErrorWriting, outputFileName); distA = computeWholeDistancePairAln(aln, computeNorm1Aln); scheme = (TypeCodeScheme*) monmalloc(sizeof(TypeCodeScheme)); scheme->suffixTree = getSuffixTree(set); scheme->code = (TypePosition*) monmalloc(scheme->suffixTree->size*sizeof(TypePosition)); scheme->buffSize = INC_SIZE_CODE; scheme->lengthCode = (TypePosition*) monmalloc(scheme->buffSize*sizeof(TypePosition)); if(type == 't') { int l; model = estimateMarkovModel(set); // for(thre=tmin; thre<=tmax; thre *= 10.0) { for(l=tmin; l<=-1; l++) { double t; int k; thre = pow(10.0, (double) l); for(k=0; k<10; k++) { // for(t=thre; t<thre*10; t+=thre) { double corr, sc; TypeSetOfSequences *dec; t = ((double)k+1.)*thre; scheme->cardCode = 0; buildCodeThreshold(t, scheme->suffixTree->root, 0, 1., model, scheme); //printLengthDistribution(stdout, scheme->lengthCode,scheme->cardCode); dec = getDecodedFromScheme(scheme); //printf("cardinal dec = %ld\n", dec->cardinal); distB = computeWholeDistanceDec(dec); corr = computeCorrelation(distA, distB); monfree((void*)distB.table); sc = score(dec); printf("%lE\t%lf\t%.2lf\n", t, corr, sc); fprintf(fo, "%lE\t%lf\t%.2lf\n", t, corr, sc); for(n=0; 
n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); } } fprintf(stdout, "\n\n%.4lE\n\n", findMode(set, qmin, qmax, qprec, scheme, model)); freeModel(model); } else { for(l = lmax; l>=1; l--) { double corr; TypeSetOfSequences *dec; scheme->cardCode = 0; buildCodeLength(l, scheme->suffixTree->root, 0, scheme); //printLengthDistribution(stdout, scheme->lengthCode,scheme->cardCode); dec = getDecodedFromScheme(scheme); //printf("cardinal dec = %ld\n", dec->cardinal); distB = computeWholeDistanceDec(dec); corr = computeCorrelation(distA, distB); monfree((void*)distB.table); fprintf(fo, "%ld\t%lf\n", l, corr); fprintf(stdout, "%ld\t%lf\n", l, corr); for(n=0; n<dec->number; n++) monfree((void*) dec->sequence[n]); monfree((void*) dec->sequence); monfree((void*) dec->size); monfree((void*) dec); } } freeScheme(scheme); monfree((void*)distA.table); fprintf(stdout, "\n\n%ld\n\n", totalLength(*set)); monfree((void*)set->size); for(n=0; n<set->number; n++) monfree((void*)set->sequence[n]); monfree((void*)set->sequence); monfree((void*)set); fclose(fo); /* sprintf(bufferOutput, "%s_Ali.nex", outputFileName); if(!(fo = fopen(bufferOutput, "w"))) exitProg(ErrorWriting, bufferOutput); printDistanceNexus(fo, distA); fclose(fo); sprintf(bufferOutput, "%s_New.nex", outputFileName); if(!(fo = fopen(bufferOutput, "w"))) exitProg(ErrorWriting, bufferOutput); printDistanceNexus(fo, distB); fclose(fo); */ ; exitProg(ExitOk,NULL); return 0; }