int main( int argc, char *argv[] ) { int i ; int ret ; Alignments alignments ; Alignments clippedAlignments ; Blocks blocks ; Genome genome ; char *genomeFile = NULL ; if ( argc < 2 ) { printf( "%s", usage ) ; exit( 0 ) ; } minimumSupport = 2 ; minimumEffectiveLength = 200 ; kmerSize = 23 ; breakN = 1 ; minContigSize = 200 ; prefix = NULL ; VERBOSE = false ; outputConnectionSequence = false ; aggressiveMode = false ; for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( "-b", argv[i] ) ) { alignments.Open( argv[i + 1]) ; ++i ; } else if ( !strcmp( "-o", argv[i] ) ) { prefix = argv[i + 1] ; ++i ; } else if ( !strcmp( "-f", argv[i] ) ) { genomeFile = argv[i + 1] ; ++i ; } else if ( !strcmp( "-ms", argv[i] ) ) { minimumSupport = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-ml", argv[i] ) ) { minimumEffectiveLength = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-k", argv[i] ) ) { kmerSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-breakN", argv[i] ) ) { breakN = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-minContigSize", argv[i] ) ) { minContigSize = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( "-v", argv[i] ) ) { VERBOSE = true ; } else if ( !strcmp( "-cs", argv[i] ) ) { outputConnectionSequence = true ; } /*else if ( !strcmp( "-aggressive", argv[i] ) ) { aggressiveMode = true ; }*/ else if ( !strcmp( "-bc", argv[i] ) ) { // So far, assume the input is from BWA mem clippedAlignments.Open( argv[i + 1] ) ; clippedAlignments.SetAllowSupplementary( true ) ; ++i ; } else { fprintf( stderr, "Unknown parameter: %s\n", argv[i] ) ; exit( 1 ) ; } } if ( !alignments.IsOpened() ) { printf( "Must use -b to specify the bam file." ) ; return 0 ; } if ( prefix != NULL ) { char buffer[255] ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } else { char buffer[255] ; prefix = strdup( "rascaf" ) ; sprintf( buffer, "%s.out", prefix ) ; fpOut = fopen( buffer, "w" ) ; } if ( genomeFile != NULL ) { genome.Open( alignments, genomeFile ) ; alignments.Rewind() ; } if ( outputConnectionSequence == true && genomeFile == NULL ) { fprintf( stderr, "Must use -f to specify assembly file when using -cs\n" ) ; exit( EXIT_FAILURE ) ; } // 74619 //printf( "%c\n", genome.GetNucleotide( 74619, 4 ) ) ; //exit(0) ; // Build the graph ret = blocks.BuildExonBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d exon blocks.\n", ret ) ; if ( clippedAlignments.IsOpened() ) { fprintf( stderr, "Extend exon blocks with clipped alignments.\n" ) ; Blocks extendBlocks ; extendBlocks.BuildExonBlocks( clippedAlignments, genome ) ; clippedAlignments.Rewind() ; ret = blocks.ExtendExonBlocks( extendBlocks ) ; fprintf( stderr, "Found %d exon blocks after extension.\n", ret ) ; } blocks.GetAlignmentsInfo( alignments ) ; alignments.Rewind() ; ret = blocks.BuildGeneBlocks( alignments, genome ) ; alignments.Rewind() ; fprintf( stderr, "Found %d gene blocks.\n", ret ) ; blocks.BuildGeneBlockGraph( alignments ) ; if ( clippedAlignments.IsOpened() ) { blocks.AddGeneBlockGraphByClippedAlignments( clippedAlignments ) ; } // Cleaning blocks.CleanGeneBlockGraph( alignments, genome ) ; // Scaffolding Scaffold scaffold( blocks, genome ) ; //scaffold.Init( blocks ) ; int componentCnt = scaffold.BuildComponent() ; fprintf( stderr, "Found %d non-trivial gene block components.\n", componentCnt ) ; // Possible for parallelization for ( i = 0 ; i < componentCnt ; ++i ) { scaffold.ScaffoldComponent( i ) ; } scaffold.ScaffoldGenome() ; // Output the command line fprintf( fpOut, "command line: " ) ; char *fullpath = (char *)malloc( sizeof( char ) * 4096 ) ; for ( i = 0 ; i < argc ; ++i ) { char c = ' ' ; if ( i == argc - 1 ) c = '\n' ; if ( i > 0 && !strcmp( argv[i - 1], "-b" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else if ( i > 0 && !strcmp( argv[i - 1], "-f" ) ) { if ( realpath( argv[i], fullpath ) == NULL ) { fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ; exit( 1 ) ; } fprintf( fpOut, "%s%c", fullpath, c ) ; } else fprintf( fpOut, "%s%c", argv[i], c ) ; } free( fullpath ) ; scaffold.Output( fpOut, alignments ) ; return 0 ; }
int main( int argc, char *argv[] ) { int i, j, k ; FILE *fp ; char buffer[10100] ; std::vector<std::string> fields ; int binSize = 50 ; const char *backboneName = NULL ; // the compatilibity between the alignment and allele // The likelihood of this read from this allele struct _compatible **compatibility ; int **snpAllele ; // whether a snp showed up in the allele. Alignments alignments ; alignments.Open( argv[2] ) ; for ( i = 3 ; i < argc ; ++i ) { if ( !strcmp( argv[i], "-b" ) ) { backboneName = argv[i + 1] ; alignments.OnlyChrom( backboneName ) ; ++i ; } else { fprintf( stderr, "Unknown argument %s.\n", argv[i] ) ; exit( 1 ) ; } } // Parse the files associate with the snps // Firstly, read in the snp list have the information sprintf( buffer, "%s.snp", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { Split( buffer, '\t', fields ) ; if ( backboneName && strcmp( fields[2].c_str(), backboneName ) ) continue ; snpNameToId[ fields[0] ] = k ; struct _snpInfo info ; info.type = fields[1][0] ; if ( info.type == 'd' ) { info.position = atoi( fields[3].c_str() ) ; info.length = atoi( fields[4].c_str() ) ; } else if ( info.type == 'i' ) { info.position = atoi( fields[3].c_str() ) ; info.length = strlen( fields[4].c_str() ) ; } else { info.position = atoi( fields[3].c_str() ) ; // notice that the snp file is 0-based index. info.length = 1 ; info.nucleotide = fields[4][0] ; } snpInfo.push_back( info ) ; std::vector< int > tmpList ; snpLink.push_back( tmpList ) ; for ( int p = 0 ; p < info.length ; ++p ) { if ( info.type != 'i' || p == 0 ) { if ( positionToSnp.find( info.position + p ) == positionToSnp.end() ) { positionToSnp[ info.position + p] = tmpList ; } positionToSnp[ info.position + p].push_back( k ) ; } } ++k ; } fclose( fp ) ; // Read in the link file. Determine the id of alleles and the association // of alleles and snps. // TODO: obtain the length of each allele and take the length into account in the statistical model // Add the id for the backbound int backboneLength = 0 ; sprintf( buffer, "%s_backbone.fa", argv[1] ) ; fp = fopen( buffer, "r" ) ; /*for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; std::string backboneName( buffer + 1 ) ; alleleNameToId[ backboneName ] = 0 ; alleleIdToName.push_back( backboneName ) ;*/ bool start = false ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { if ( buffer[0] == '>' ) { for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i ) ; buffer[i] = '\0' ; if ( !strcmp( backboneName, buffer + 1 ) ) { start = true ; } else if ( start ) break ; } if ( start && buffer[0] != '>' ) { int len = strlen( buffer ) ; if ( buffer[len - 1 ] == '\n' ) backboneLength += len - 1 ; else backboneLength += len ; } } fclose( fp ) ; /*k = 0 ; if ( k == 0 ) { std::vector<int> tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; }*/ // scanning the link file sprintf( buffer, "%s.link", argv[1] ) ; fp = fopen( buffer, "r" ) ; k = 0 ; while ( fgets( buffer, sizeof( buffer ), fp ) ) { std::vector<std::string> tmpFields ; Split( buffer, '\t', tmpFields ) ; // skip the snps from other backbones if ( snpNameToId.find( tmpFields[0] ) == snpNameToId.end() ) continue ; int snpId = snpNameToId[ tmpFields[0] ] ; Split( tmpFields[1].c_str(), ' ', fields ) ; int size = fields.size() ; for ( i = 0 ; i < size ; ++i ) { if ( alleleNameToId.find( fields[i] ) == alleleNameToId.end() ) { //printf( "%s %d\n", fields[i].c_str(), k ) ; alleleNameToId[ fields[i] ] = k ; alleleIdToName.push_back( fields[i] ) ; std::vector<int> tmpList ; alleleSnpList.push_back( tmpList ) ; alleleLength.push_back( backboneLength ) ; ++k ; } int alleleId = alleleNameToId[ fields[i] ] ; //if ( snpId == 118 ) // printf( "%s: %s %d\n", tmpFields[0].c_str(), fields[i].c_str(), alleleId ) ; snpLink[ snpId ].push_back( alleleId ) ; alleleSnpList[ alleleId ].push_back( snpId ) ; if ( snpInfo[ snpId ].type == 'd' ) { alleleLength[ alleleId ] -= snpInfo[ snpId ].length ; } else if ( snpInfo[ snpId ].type == 'i' ) { alleleLength[ alleleId ] += snpInfo[ snpId ].length ; } } } fclose( fp ) ; int numOfAllele = alleleIdToName.size() ; int numOfSnps = snpLink.size() ; snpAllele = new int* [numOfSnps] ; for ( i = 0 ; i < numOfSnps ; ++i ) { snpAllele[i] = new int[numOfAllele] ; memset( snpAllele[i], 0, sizeof( int ) * numOfAllele ) ; } for ( i = 0 ; i < numOfSnps ; ++i ) { int size = snpLink[i].size() ; for ( j = 0 ; j < size ; ++j ) { snpAllele[i][ snpLink[i][j] ] = 1 ; } } // Compute the compatbility score for each alignment and the allele // Get the number of alignment int numOfAlignments = 0 ; while ( alignments.Next() ) ++numOfAlignments ; alignments.Rewind() ; compatibility = new struct _compatible*[numOfAlignments] ; for ( i = 0 ; i < numOfAlignments ; ++i ) { compatibility[i] = new struct _compatible[ numOfAllele ] ; for ( j = 0 ; j < numOfAllele ; ++j ) { compatibility[i][j].value = 0 ;//-log( (double)alleleLength[j] ) / log( 10.0 ); } } i = 0 ; bool *snpHit = new bool[ numOfSnps ] ; while ( alignments.Next() ) { struct _pair coord = alignments.segments[0] ; alignmentCoords.push_back( coord ) ; memset( snpHit, 0, sizeof( bool ) * numOfSnps ) ; Split( alignments.GetFieldZ( "Zs" ), ',', fields ) ; int size = fields.size() ; for ( k = 0 ; k < size ; ++k ) { std::vector<std::string> subfields ; Split( fields[k].c_str(), '|', subfields ) ; int snpId = snpNameToId[ subfields[2] ] ; snpHit[ snpId ] = true ; } for ( k = coord.a ; k <= coord.b ; ++k ) { int size = positionToSnp[k].size() ; for ( int l = 0 ; l < size ; ++l ) { // if this SNP is hit. Then other allele don't have this snp // will deduct its likelihood //TODO: the deduction can be based on the quality score of the read int tag = 0 ; int snpId = positionToSnp[k][l] ; if ( snpHit[ snpId ] ) { tag = 0 ; } else { // if this SNP is not hit, then every allele containing this snp // will deduct its likelihood tag = 1 ; } for ( j = 0 ; j < numOfAllele ; ++j ) { if ( snpAllele[ snpId ][j] == tag ) { int v = -2 ; //if ( snpInfo[ snpId ].type == 'd' || snpInfo[ snpId ].type == 'i' ) // v = -4 * snpInfo[ snpId ].length ; if ( snpInfo[ snpId ].type == 'd' && snpInfo[ snpId ].position < k && k != coord.a ) { // The penality has already been subtracted. v = 0 ; } compatibility[i][j].value += v ; /*if ( i == 8 && j == 78 ) { printf( "Bad snp %d: %d %d\n", tag, k, positionToSnp[k][l] ) ; }*/ } } } } ++i ; } //printf( "%d %d\n", numOfAlignments, numOfAllele ) ; // Now, let's consider every pair of alleles, and compute its log likelihood double **logLikelihood ; logLikelihood = new double *[ numOfAllele] ; for ( j = 0 ; j < numOfAllele ; ++j ) { logLikelihood[j] = new double[ numOfAllele ] ; //memset( logLikelihood[j], 0, sizeof( double ) * numOfAllele ) ; for ( k = 0 ; k < numOfAllele ; ++k ) logLikelihood[j][k] = 0 ; } int prevBin = -1 ; double assignJBin = 0 ; double assignKBin = 0 ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { double binAdjust = 0 ; double averageRead = ( (double)numOfAlignments ) / (double)( alleleLength[j] + alleleLength[k] ) * binSize ; for ( i = 0 ; i < numOfAlignments ; ++i ) { double vj = compatibility[i][j].value ; double vk = compatibility[i][k].value ; double weightJ = 0, weightK = 0 ; if ( vj == vk ) { weightJ = weightK = 0.5 ; } else if ( vj == vk + 2 ) { if ( vj == 0 ) { weightJ = 1 ; } else { weightJ = 0.99 ; weightK = 0.01 ; } } else if ( vk == vj + 2 ) { if ( vk == 0 ) { weightK = 1 ; } else { weightJ = 0.01 ; weightK = 0.99 ; } } else { if ( vk > vj ) weightK = 1 ; else weightJ = 1 ; } double l = weightJ * compatibility[i][j].value + weightK * compatibility[i][k].value ; if ( alignmentCoords[i].a / binSize != prevBin ) { if ( prevBin != -1 && ( assignJBin > averageRead + 4 * sqrt( averageRead ) || assignKBin > averageRead + 4 * sqrt( averageRead ) ) ) { //if ( j == 8 && k == 78 ) // printf( "%lf: %lf %lf %d %d\n", averageRead, assignJBin, assignKBin, alleleLength[j], alleleLength[k] ) ; binAdjust -= 4 ; } prevBin = alignmentCoords[i].a / binSize ; assignJBin = 0 ; assignKBin = 0 ; } assignJBin += weightJ ; assignKBin += weightK ; /*if ( j == 8 && k == 78 && l < 0 ) { printf( "Bad alignment %d (%s %s). %lf %lf: %lf\n", i, alleleIdToName[j].c_str(), alleleIdToName[k].c_str(), compatibility[i][j].value, compatibility[i][k].value, l ) ; }*/ logLikelihood[j][k] += l ; } logLikelihood[j][k] += ( -log( (double)alleleLength[j] ) / log(10.0 ) - log( (double)alleleLength[k] ) / log(10.0) ) ; logLikelihood[j][k] += binAdjust ; } } // Find the result double max ; int maxj = -1 ; int maxk = -1 ; std::vector< struct _result > results ; for ( j = 0 ; j < numOfAllele ; ++j ) { for ( k = j ; k < numOfAllele ; ++k ) { if ( maxj == -1 || logLikelihood[j][k] > max ) { maxj = j ; maxk = k ; max = logLikelihood[j][k] ; } struct _result r ; r.a = j ; r.b = k ; r.logLikelihood = logLikelihood[j][k] ; results.push_back( r ) ; } } //printf( "%s %s %lf\n", alleleIdToName[ maxj ].c_str(), alleleIdToName[ maxk ].c_str(), max) ; //printf( "%lf\n", logLikelihood[124][128] ) ; if ( results.size() == 0 ) { printf( "-1 -1 -1\n" ) ; exit( 1 ) ; } std::sort( results.begin(), results.end(), CompResult ) ; i = 0 ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; k = results.size() ; for ( i = 1 ; i < k ; ++i ) { if ( results[i].logLikelihood != results[0].logLikelihood ) break ; printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), results[i].logLikelihood ) ; } return 0 ; }