//SequenceFinder.cpp //Coded by: Vishakh (vishakh@uci.edu) unless otherwise stated //Modified heavily by: Nicholas Urrea //ICS 175A, Winter 2004 //Project: Motif detection in yeast //Purpose: Calculates scores for substrings of gene families, finds sequences and constructs //probability matrices. #include "SequenceFinder.h" // FUNCTION: constructor // PURPOSE: Set default KMER and OCCURENCE LIMIT sizes // ADDED BY: Nicholas (as a default filter for results) SequenceFinder::SequenceFinder() { KMER_SIZE = 6; OCCURENCE_LIMIT = 0; } /* *FUNCTION: void assignScores(DTable& dt_big, DTable& dt_sml) *PURPOSE: Adds simple and binomial method scores to smaller table *REMARKS: Converts the given tables to vectors and uses those for all operations. Simplification of binomial distribution formula suggested by Nick. EDITED BY: Nicholas Urrea (tabulated score results, added poisson and made unused score results 1.0 since we determine good score results by low probability) EDITED BY: Joseph Bertolami (added progress bar link and support) */ #include "GuiSimple.h" void SequenceFinder::assignScores(DTable& dt_big, DTable& dt_sml) { //Convert tables to vectors cout<<"Flattening.."; table_sml = dt_sml.flatten(); table_big = dt_big.flatten(); cout<<"..flattened.."<=10 && i% (table_sml.size()/10) == 0 ) { cout << perc << "% "; perc+=10; } currPair = table_sml[i]; currSeq = currPair.getSequence(); // Process KMERS of exact size only // REMARKS: Fe{B} = expected_frequency = num of genes in big table / total num of sequence sizes if( currSeq.size() == KMER_SIZE ) { expected_frequency = (double) dt_big.getCount(currSeq) / BIG_LENGTH; occurences = currPair.getCount(); expected_occurences = expected_frequency * SML_LENGTH; // half frequency of symmetric kmers; dtable should not have processed 2x the amt if ( perm.getComplement( currSeq ) == currSeq ) { expected_frequency /= 2; } // compute over-represented k-mers with occurences greater than a specified limit if ( occurences > expected_occurences && occurences > OCCURENCE_LIMIT ) { table_sml[i].score1 = expected_occurences/occurences; table_sml[i].score2 = poissonDistribution( expected_occurences, occurences ); table_sml[i].score3 = binomialDistribution( occurences, SML_LENGTH, expected_frequency ); //(k,n,p) /* Sig - quick way to get top scores double bin_w=currSeq.size(); double bin_Npal=0; if(currSeq.size()%2==0) bin_Npal = pow(4.0,bin_w/2); double bin_D = pow(4.0, bin_w) - (pow(4.0, bin_w) - bin_Npal)/2; double bin_sig = 0.0 -(log10(table_sml[i].score2 * bin_D)); */ } // under-represented kmers - throw out else { table_sml[i].score1 = 1.0; table_sml[i].score2 = 1.0; table_sml[i].score3 = 1.0; } } // not of KMER_SIZE else { table_sml[i].score1 = 1.0; table_sml[i].score2 = 1.0; table_sml[i].score3 = 1.0; } } cout << "done." << endl; } /* *FUNCTION: void sort() *PURPOSE: Sorts the small table vector by either simple or binomial scores *REMARKS: */ void SequenceFinder::sortTable() { //using C++ STL to sort the table sort(table_sml.begin(), table_sml.end()); } /* *FUNCTION: void createMatrices(vector > genes); *PURPOSE: Gets a list of genes from the family being investigated and constructs probability matrices for the top sequences *REMARKS: EDITED BY: Joseph Bertolami (added Progress bar link and support) */ void SequenceFinder::createMatrices(vector > genes) { int total_progress = 0; //loop through top 10 seqeunces for(int topseq=0; topseq<10; topseq++) { DString seed = table_sml[topseq].getSequence(); ProbMatrix pm(seed); //use seed to construct a new probability matrix that will be constantly updated until it stabilizes //pm.print(); //iterate the probabilty matrix repeatedly for stabilization for(int iteration=0; iteration<5; iteration++) { vector bestMatches; //vector of best matches from each gene family //for each gene in the family, find the best match for the current matrix for(int i=0; ibestScore) { bestScore = currScore; bestMatch = currSeq; } } bestMatches.push_back(bestMatch); total_progress++; IncrementProgress(); } //we have best matches...now update the probability matrix //for(int i=0; i getTable() *PURPOSE: Returns the table of gene family sequences for the GUI *REMARKS: */ vector SequenceFinder::getTable() { return table_sml; } /* *FUNCTION: vector getMatrices() *PURPOSE: Returns probability matrices for the GUI *REMARKS: */ vector SequenceFinder::getMatrices() { return matrices; } /* FUNCTION: poissonDistribution() * PURPOSE: Calculate the poisson distribution * REMARKS: Poisson distribution P(N=k) => (e^-lambda * lambda^k) / k! where lambda = expected number of occurences during a given interval (bin_p * size) and k = the probability that there are exactly k occurences (bin_p) ADDED BY: Nicholas Urrea (nurrea@uci.edu) */ double SequenceFinder::poissonDistribution(double lambda, double poisson_k) { // heads up, k may be bigger than 170 and screw up the program! (171! = cannot compute) if( lambda > 170 ) return -1; return ( pow(2.7182818284590452353602875, -lambda) * pow(lambda, poisson_k) ) / factorial(poisson_k); } /* PURPOSE: calculate the inverse of log 10. REMARKS: to get inverse log of a number ie; 12.0301 you do the following: 1. calculate 10 to the power of the ceiling of given number (10^ 12) 2. calculate 10 to the power of the decimal part of given number (10^ 0.301) 3. multiply part 1 and 2 together (10^0.0301) * (10^ 12) to get answer ADDED BY: Nicholas Urrea (nurrea@uci.edu) */ double SequenceFinder::inverseLog10(double log) { return pow( 10.0, (double)log - ceil(log) ) * pow( 10.0, (double)ceil(log) ); } /* *FUNCTION: double logOfFactorial(double n) *PURPOSE: Return the log of the factorial of 'n' *REMARKS: This is based on the formula: * log n! = log(n) + log(n-1) + log(n-2) + ... + log(2) + log(1) EDITED BY: Nicholas Urrea (changed log() to log10 to keep log base-10 constant throughout calculations) */ double SequenceFinder::logOfFactorial(double n) { double ans = 0; for(double i=1; i<=n; i++) { ans += log10(i); } return ans; } /* FUNCTION: factorial(double n) PURPOSE: returns factorial of 'n' REMARKS: Factorial of n is n * n-1 * .... * 1 ADDED BY: Nicholas Urrea (nurrea@uci.edu) */ double SequenceFinder::factorial(double n) { double ans = 1; for(double i=1; i<=n; i++) { ans *= i; } return ans; } /* PURPOSE: Compute binomial distribution from 0 to k-1. REMARKS: IF the number was OVERLY represented, then the probability should be very tiny, because the probability of the number showing up drastically more often than its expected frequency (bin_p) should be tiny. Since the numbers for bin_n came to be too large in our data set, we had to sum the log of the binomial distribution. Below explains the process. p(k occurances) = C(n,k) p^k (1-p)^(n-k); but we need smaller numbers, thus: log[p(k occurances)] =>log[C(n,k)] + k*log(p) + (n-k)log(1-p) First we calculate log[C(n,k)] Since C(n,k) = n!/ (k! * (n-k)!), log[C(n,k)] = log(n!) - [log(k!) + log((n-k)!)] EDITED BY: Nicholas Urrea (due to tiny precision errors, summation had to be put on hold, also made sure every log was of base 10 and verified correctness) */ double SequenceFinder::binomialDistribution(double bin_k, double bin_n, double bin_p) { long double prob=0.0; // get log of binomial distribution probability, since factorial of bin_n are too big for STL C++ // sum each individual log from 0 to k-1, small prob = overly represented number = what we want for(double i = bin_k; i <= bin_k; i++) { prob += logOfFactorial( bin_n ) - ( logOfFactorial(i) + logOfFactorial( bin_n-i ) ); prob += i * log10( bin_p ); prob += (bin_n - i ) * log10( 1.0 - bin_p ); } return inverseLog10(prob); } /* *FUNCTION: printTopScores(int n) *PURPOSE: Prints the oligonucleotides with the top 'n' scores *REMARKS: EDITED BY: Nicholas Urrea (fixed format of printing a bit, and added the ability to prints out scores and not their reverse complements, if you remove the comments) */ void SequenceFinder::printTopScores(int n) { // Prints top scores WITH reverse complements cout<<"TOP SEQUENCES\nRank\tGene\tOcc\tExpected/Act\tPoisson\t\tBinomial"< 0 && table_sml[i].getSequence() == perm.getComplement( table_sml[i-1].getSequence() ) ) { rank--; } else { cout<= 6 && n <= 8 ) KMER_SIZE = n; else KMER_SIZE = 6; // default kmer size } /* FUNCTION: setOccurenceLimit(int n) PURPOSE: Sets a limit of occurence for a subsequence in order for it to be processed. The purpose of limitations are to speed up processing time and reduce the results to mostly good ones. Before an occurence of around 5, those KMers will rarely be gene regulators so limits are good. ADDED BY: Nicholas Urrea // */ void SequenceFinder::setOccurenceLimit(int n) { OCCURENCE_LIMIT = n; }