Exemplo n.º 1
0
// Returns the value that follows the option at argv[i] and advances i onto it.
// Exits with an error message when the option is the last argument: argv[argc]
// is NULL, so reading it would hand a null pointer to strcmp/atoi/Open.
static char *NextOptionValue( int argc, char *argv[], int &i )
{
	if ( i + 1 >= argc )
	{
		fprintf( stderr, "Option %s requires an argument.\n", argv[i] ) ;
		exit( EXIT_FAILURE ) ;
	}
	++i ;
	return argv[i] ;
}

// Entry point: parses the command line, builds exon and gene blocks from the
// alignments, scaffolds the gene block components and writes the result to
// "<prefix>.out" (prefix defaults to "rascaf").
//
// Options handled below:
//   -b FILE    alignment file (required)
//   -bc FILE   second alignment file (clipped alignments) used to extend the blocks
//   -f FILE    assembly file, required together with -cs
//   -o PREFIX  prefix of the output file
//   -ms, -ml, -k, -breakN, -minContigSize
//              integers stored in minimumSupport, minimumEffectiveLength,
//              kmerSize, breakN and minContigSize
//   -v         sets VERBOSE
//   -cs        sets outputConnectionSequence
//
// Prints the usage text and exits with status 0 when no argument is given;
// exits with a non-zero status on an unknown option, a missing option value,
// a missing -b, -cs without -f, or when a file cannot be opened/resolved.
int main( int argc, char *argv[] )
{
	int i ;
	int ret ;

	Alignments alignments ;
	Alignments clippedAlignments ;
	Blocks blocks ;
	Genome genome ;
	char *genomeFile = NULL ;
	
	if ( argc < 2 )
	{
		printf( "%s", usage ) ;
		exit( 0 ) ;
	}

	// Defaults; each may be overridden by the matching option below.
	minimumSupport = 2 ;
	minimumEffectiveLength = 200 ;
	kmerSize = 23 ;
	breakN = 1 ;
	minContigSize = 200 ;
	prefix = NULL ;
	VERBOSE = false ;
	outputConnectionSequence = false ;
	aggressiveMode = false ;

	for ( i = 1 ; i < argc ; ++i )
	{
		if ( !strcmp( "-b", argv[i] ) )
		{
			alignments.Open( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-o", argv[i] ) )
		{
			prefix = NextOptionValue( argc, argv, i ) ;
		}
		else if ( !strcmp( "-f", argv[i] ) )
		{
			genomeFile = NextOptionValue( argc, argv, i ) ;
		}
		else if ( !strcmp( "-ms", argv[i] ) )
		{
			minimumSupport = atoi( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-ml", argv[i] ) )
		{
			minimumEffectiveLength = atoi( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-k", argv[i] ) )
		{
			kmerSize = atoi( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-breakN", argv[i] ) )
		{
			breakN = atoi( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-minContigSize", argv[i] ) )
		{
			minContigSize = atoi( NextOptionValue( argc, argv, i ) ) ;
		}
		else if ( !strcmp( "-v", argv[i] ) )
		{
			VERBOSE = true ;
		}
		else if ( !strcmp( "-cs", argv[i] ) )
		{
			outputConnectionSequence = true ;
		}
		/*else if ( !strcmp( "-aggressive", argv[i] ) )
		{
			aggressiveMode = true ;
		}*/
		else if ( !strcmp( "-bc", argv[i] ) )
		{
			// So far, assume the input is from BWA mem
			clippedAlignments.Open( NextOptionValue( argc, argv, i ) ) ;
			clippedAlignments.SetAllowSupplementary( true ) ;
		}
		else
		{
			fprintf( stderr, "Unknown parameter: %s\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	// Validate the option combination before creating any output or doing any work.
	if ( !alignments.IsOpened() )
	{
		fprintf( stderr, "Must use -b to specify the bam file.\n" ) ;
		return EXIT_FAILURE ;
	}

	if ( outputConnectionSequence == true && genomeFile == NULL )
	{
		fprintf( stderr, "Must use -f to specify assembly file when using -cs\n" ) ;	
		exit( EXIT_FAILURE ) ;
	}

	if ( prefix == NULL )
	{
		prefix = strdup( "rascaf" ) ;
		if ( prefix == NULL )
		{
			fprintf( stderr, "Out of memory.\n" ) ;
			exit( EXIT_FAILURE ) ;
		}
	}

	// Open "<prefix>.out". The name buffer is sized from the prefix so that a
	// long -o value cannot overflow it.
	{
		const size_t nameSize = strlen( prefix ) + strlen( ".out" ) + 1 ;
		char *outputName = static_cast<char *>( malloc( nameSize ) ) ;
		if ( outputName == NULL )
		{
			fprintf( stderr, "Out of memory.\n" ) ;
			exit( EXIT_FAILURE ) ;
		}
		snprintf( outputName, nameSize, "%s.out", prefix ) ;
		fpOut = fopen( outputName, "w" ) ;
		if ( fpOut == NULL )
		{
			fprintf( stderr, "Failed to open output file %s.\n", outputName ) ;
			exit( EXIT_FAILURE ) ;
		}
		free( outputName ) ;
	}
	
	if ( genomeFile != NULL )
	{
		genome.Open( alignments, genomeFile ) ;
		alignments.Rewind() ;
	}

	// Build the graph
	ret = blocks.BuildExonBlocks( alignments, genome ) ;
	alignments.Rewind() ;
	fprintf( stderr, "Found %d exon blocks.\n", ret ) ;
	if ( clippedAlignments.IsOpened() )
	{
		fprintf( stderr, "Extend exon blocks with clipped alignments.\n" ) ;
		Blocks extendBlocks ;
		extendBlocks.BuildExonBlocks( clippedAlignments, genome ) ;
		clippedAlignments.Rewind() ;

		ret = blocks.ExtendExonBlocks( extendBlocks ) ;
		fprintf( stderr, "Found %d exon blocks after extension.\n", ret ) ;
	}

	blocks.GetAlignmentsInfo( alignments ) ;
	alignments.Rewind() ;

	ret = blocks.BuildGeneBlocks( alignments, genome ) ;
	alignments.Rewind() ;
	fprintf( stderr, "Found %d gene blocks.\n", ret ) ;
	
	blocks.BuildGeneBlockGraph( alignments ) ;
	if ( clippedAlignments.IsOpened() )
	{
		blocks.AddGeneBlockGraphByClippedAlignments( clippedAlignments ) ; 
	}
	
	// Cleaning
	blocks.CleanGeneBlockGraph( alignments, genome ) ;

	// Scaffolding
	Scaffold scaffold( blocks, genome ) ;
	int componentCnt = scaffold.BuildComponent() ;
	fprintf( stderr, "Found %d non-trivial gene block components.\n", componentCnt ) ;
	// Possible for parallelization
	for ( i = 0 ; i < componentCnt ; ++i )
	{
		scaffold.ScaffoldComponent( i ) ;
	}
	
	scaffold.ScaffoldGenome() ;
	
	// Output the command line. The values of -b and -f are written as resolved
	// absolute paths; every other argument is echoed unchanged.
	fprintf( fpOut, "command line: " ) ;
	for ( i = 0 ; i < argc ; ++i )
	{
		const char separator = ( i == argc - 1 ) ? '\n' : ' ' ;
		const bool isInputPath = i > 0 
			&& ( !strcmp( argv[i - 1], "-b" ) || !strcmp( argv[i - 1], "-f" ) ) ;

		if ( !isInputPath )
		{
			fprintf( fpOut, "%s%c", argv[i], separator ) ;
			continue ;
		}

		// Let realpath allocate the result so that no fixed-size buffer can be
		// too small for the resolved path.
		char *fullpath = realpath( argv[i], NULL ) ;
		if ( fullpath == NULL )
		{
			fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ;
			exit( 1 ) ;
		}
		fprintf( fpOut, "%s%c", fullpath, separator ) ;
		free( fullpath ) ;
	}
	scaffold.Output( fpOut, alignments ) ;
	return 0 ;
}
Exemplo n.º 2
0
int main( int argc, char *argv[] )
{
	int i, j, k ;	
	FILE *fp ;
	char buffer[10100] ;
	std::vector<std::string> fields ;
	int binSize = 50 ;
	const char *backboneName = NULL ;

	// the compatilibity between the alignment and allele
	// The likelihood of this read from this allele
	struct _compatible **compatibility ; 
	int **snpAllele ; // whether a snp showed up in the allele.

	Alignments alignments ;
	alignments.Open( argv[2] ) ;

	for ( i = 3 ; i < argc ; ++i )
	{
		if ( !strcmp( argv[i], "-b" ) )
		{
			backboneName = argv[i + 1] ;
			alignments.OnlyChrom( backboneName ) ;
			++i ;
		}
		else
		{
			fprintf( stderr, "Unknown argument %s.\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	// Parse the files associate with the snps
	// Firstly, read in the snp list have the information
	sprintf( buffer, "%s.snp", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;	
	k = 0 ;
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		Split( buffer, '\t', fields ) ;
		if ( backboneName && strcmp( fields[2].c_str(), backboneName ) )
			continue ;
		snpNameToId[ fields[0] ] = k ;
		struct _snpInfo info ;
		info.type = fields[1][0] ;
		if ( info.type == 'd' )
		{
			info.position = atoi( fields[3].c_str() ) ;
			info.length = atoi( fields[4].c_str() ) ;
		}
		else if ( info.type == 'i' )
		{
			info.position = atoi( fields[3].c_str() ) ;
			info.length = strlen( fields[4].c_str() ) ;
		}
		else
		{
			info.position = atoi( fields[3].c_str()	) ; // notice that the snp file is 0-based index.
			info.length = 1 ;
			info.nucleotide = fields[4][0] ;
		}
		snpInfo.push_back( info ) ;
		std::vector< int > tmpList ;
		snpLink.push_back( tmpList ) ;
		for ( int p = 0 ; p < info.length ; ++p )
		{
			if ( info.type != 'i' || p == 0 )
			{
				if ( positionToSnp.find( info.position + p ) == positionToSnp.end() )
				{
					positionToSnp[ info.position + p] = tmpList ;
				}
				positionToSnp[ info.position + p].push_back( k ) ;
			}
		}
		++k ;
	} 
	fclose( fp ) ;
	// Read in the link file. Determine the id of alleles and the association
	// of alleles and snps.
	// TODO: obtain the length of each allele and take the length into account in the statistical model
	// Add the id for the backbound
	int backboneLength = 0 ;
	sprintf( buffer, "%s_backbone.fa", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;
	/*for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i )
		;
	buffer[i] = '\0' ;
	std::string backboneName( buffer + 1 ) ;
	alleleNameToId[ backboneName ] = 0 ;
	alleleIdToName.push_back( backboneName ) ;*/
	bool start = false ;
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		if ( buffer[0] == '>' )
		{
			for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i )
				;
			buffer[i] = '\0' ;
			if ( !strcmp( backboneName, buffer + 1 ) )
			{
				start = true ;
			}
			else if ( start )
				break ;
		}
		if ( start && buffer[0] != '>' )
		{
			int len = strlen( buffer ) ;
			if ( buffer[len - 1 ] == '\n' )
				backboneLength += len - 1 ;
			else
				backboneLength += len ;
		}
	}
	fclose( fp ) ;
	/*k = 0 ;
	if ( k == 0 )
	{
		std::vector<int> tmpList ;
		alleleSnpList.push_back( tmpList ) ;
		alleleLength.push_back( backboneLength ) ;
	}*/


	// scanning the link file
	sprintf( buffer, "%s.link", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;
	k = 0 ; 
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		std::vector<std::string> tmpFields ;
		Split( buffer, '\t', tmpFields ) ;
		// skip the snps from other backbones
		if ( snpNameToId.find( tmpFields[0] ) == snpNameToId.end() )
			continue ;

		int snpId = snpNameToId[ tmpFields[0] ] ;
		Split( tmpFields[1].c_str(), ' ', fields ) ;
		int size = fields.size() ;
	
		for ( i = 0 ; i < size ; ++i )
		{
			if ( alleleNameToId.find( fields[i] ) == alleleNameToId.end() )
			{
				//printf( "%s %d\n", fields[i].c_str(), k ) ;
				alleleNameToId[ fields[i] ] = k ;
				alleleIdToName.push_back( fields[i] ) ;
				std::vector<int> tmpList ;
				alleleSnpList.push_back( tmpList ) ;
				alleleLength.push_back( backboneLength ) ;
				++k ;
			}

			int alleleId = alleleNameToId[ fields[i] ] ;
			//if ( snpId == 118 )
			//	printf( "%s: %s %d\n", tmpFields[0].c_str(), fields[i].c_str(), alleleId ) ;
			snpLink[ snpId ].push_back( alleleId ) ;
			alleleSnpList[ alleleId ].push_back( snpId ) ;
			if ( snpInfo[ snpId ].type == 'd' )
			{
				alleleLength[ alleleId ] -= snpInfo[ snpId ].length ;
			}
			else if ( snpInfo[ snpId ].type == 'i' )
			{
				alleleLength[ alleleId ] += snpInfo[ snpId ].length ;
			}
		}
	} 
	fclose( fp ) ;
	int numOfAllele = alleleIdToName.size() ;
	int numOfSnps = snpLink.size() ;
	snpAllele = new int* [numOfSnps] ;
	for ( i = 0 ; i < numOfSnps ; ++i )
	{
		snpAllele[i] = new int[numOfAllele] ;
		memset( snpAllele[i], 0, sizeof( int ) * numOfAllele ) ;
	}
	for ( i = 0 ; i < numOfSnps ; ++i )
	{
		int size = snpLink[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			snpAllele[i][ snpLink[i][j] ] = 1 ;
		}
	}

	// Compute the compatbility score for each alignment and the allele
 	// Get the number of alignment
	int numOfAlignments = 0 ;
	while ( alignments.Next() )
		++numOfAlignments ;
		
	alignments.Rewind() ;

	compatibility = new struct _compatible*[numOfAlignments] ;	
	for ( i = 0 ; i < numOfAlignments ; ++i )
	{
		compatibility[i] = new struct _compatible[ numOfAllele ] ;
		for ( j = 0 ; j < numOfAllele ; ++j )
		{
			compatibility[i][j].value = 0 ;//-log( (double)alleleLength[j] ) / log( 10.0 );
		}
	}

	i = 0 ; 
	bool *snpHit = new bool[ numOfSnps ] ;
	while ( alignments.Next() )
	{
		struct _pair coord = alignments.segments[0] ;
		alignmentCoords.push_back( coord ) ;
		memset( snpHit, 0, sizeof( bool ) * numOfSnps ) ;
		Split( alignments.GetFieldZ( "Zs" ), ',', fields ) ;
		int size = fields.size() ;
		for ( k = 0 ; k < size ; ++k )
		{
			std::vector<std::string> subfields ;
			Split( fields[k].c_str(), '|', subfields ) ;
			int snpId = snpNameToId[ subfields[2] ] ;	
			snpHit[ snpId ] = true ;
		}
		
		for ( k = coord.a ; k <= coord.b ; ++k )
		{
			int size = positionToSnp[k].size() ;
			for ( int l = 0 ; l < size ; ++l )
			{
				// if this SNP is hit. Then other allele don't have this snp 
				// will deduct its likelihood
				//TODO: the deduction can be based on the quality score of the read
				int tag = 0 ;
				int snpId = positionToSnp[k][l] ;

				if ( snpHit[ snpId ] )
				{
					tag = 0 ;
				}
				else
				{
				// if this SNP is not hit, then every allele containing this snp 
				// will deduct its likelihood
					tag = 1 ;
				}
				for ( j = 0 ; j < numOfAllele ; ++j )
				{
					if ( snpAllele[ snpId  ][j] == tag )
					{
						int v = -2 ;
						//if ( snpInfo[ snpId ].type == 'd' || snpInfo[ snpId ].type == 'i' )
						//	v = -4 * snpInfo[ snpId ].length ;	
						if ( snpInfo[ snpId ].type == 'd' && snpInfo[ snpId ].position < k 
							&& k != coord.a )
						{
							// The penality has already been subtracted.
							v = 0 ;
						}
						compatibility[i][j].value += v ;
						/*if ( i == 8 && j == 78 )
						{
							printf( "Bad snp %d: %d %d\n", tag, k, positionToSnp[k][l] ) ;
						}*/
					}
				}
			}
		}
		++i ;
	}
	//printf( "%d %d\n", numOfAlignments, numOfAllele ) ;
	// Now, let's consider every pair of alleles, and compute its log likelihood 
	double **logLikelihood ;
	logLikelihood = new double *[ numOfAllele] ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		logLikelihood[j] = new double[ numOfAllele ] ;
		//memset( logLikelihood[j], 0, sizeof( double ) * numOfAllele ) ;
		for ( k = 0 ; k < numOfAllele ; ++k )
			logLikelihood[j][k] = 0 ;
	}

	int prevBin = -1 ;
	double assignJBin = 0 ;
	double assignKBin = 0 ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		for ( k = j ; k < numOfAllele ; ++k )
		{
			double binAdjust = 0 ;
			double averageRead = ( (double)numOfAlignments ) / (double)( alleleLength[j] + alleleLength[k] ) * binSize ;
			for ( i = 0 ; i < numOfAlignments ; ++i )
			{
				double vj = compatibility[i][j].value ;
				double vk = compatibility[i][k].value ;
				double weightJ = 0, weightK = 0 ;
				if ( vj == vk )
				{
					weightJ = weightK = 0.5 ;
				}
				else if ( vj == vk + 2 )
				{
					if ( vj == 0 )
					{
						weightJ = 1 ;
					}
					else
					{
						weightJ = 0.99 ;
						weightK = 0.01 ;
					}
				}
				else if ( vk == vj + 2 )
				{
					if ( vk == 0 )
					{
						weightK = 1 ;
					}
					else
					{
						weightJ = 0.01 ;
						weightK = 0.99 ;
					}
				}
				else
				{
					if ( vk > vj )
						weightK = 1 ;
					else
						weightJ = 1 ;
				}
				
				double l = weightJ * compatibility[i][j].value + weightK * compatibility[i][k].value ;
				if ( alignmentCoords[i].a / binSize != prevBin )
				{
					if ( prevBin != -1 && 
						( assignJBin > averageRead + 4 * sqrt( averageRead ) 
							|| assignKBin > averageRead + 4 * sqrt( averageRead ) ) )
					{
						//if ( j == 8 && k == 78 )
						//	printf( "%lf: %lf %lf %d %d\n", averageRead, assignJBin, assignKBin, alleleLength[j], alleleLength[k] ) ;
						binAdjust -= 4 ;
					}
					prevBin = alignmentCoords[i].a / binSize ;
					assignJBin = 0 ;
					assignKBin = 0 ;
				}
				assignJBin += weightJ ;
				assignKBin += weightK ;
				
				/*if ( j == 8 && k == 78 && l < 0 )
				{
					printf( "Bad alignment %d (%s %s). %lf %lf: %lf\n", i,
						alleleIdToName[j].c_str(), alleleIdToName[k].c_str(),
						 compatibility[i][j].value, compatibility[i][k].value, l ) ;
				}*/
				logLikelihood[j][k] += l ;
			}
			logLikelihood[j][k] += ( -log( (double)alleleLength[j] ) / log(10.0 ) -
							log( (double)alleleLength[k] ) / log(10.0) ) ;
			logLikelihood[j][k] += binAdjust ;
		}
	}
	
	// Find the result
	double max ;
	int maxj = -1 ;
	int maxk = -1 ;
	std::vector< struct _result > results ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		for ( k = j ; k < numOfAllele ; ++k )
		{
			if ( maxj == -1 || logLikelihood[j][k] > max )
			{
				maxj = j ;
				maxk = k ;
				max = logLikelihood[j][k] ;
			}
			struct _result r ;
			r.a = j ;
			r.b = k ;
			r.logLikelihood = logLikelihood[j][k] ;
			results.push_back( r ) ;
		}
	}

	//printf( "%s %s %lf\n", alleleIdToName[ maxj ].c_str(), alleleIdToName[ maxk ].c_str(), max) ;
	//printf( "%lf\n", logLikelihood[124][128] ) ;	
	
	if ( results.size() == 0 )
	{
		printf( "-1 -1 -1\n" ) ;
		exit( 1 ) ;
	}
	std::sort( results.begin(), results.end(), CompResult ) ;	
	i = 0 ;
	printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), 
			results[i].logLikelihood ) ;
	k = results.size() ;
	for ( i = 1 ; i < k ; ++i )
	{
		if ( results[i].logLikelihood != results[0].logLikelihood )
			break ;
		printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), 
			results[i].logLikelihood ) ;
	}
	return 0 ;
}