//dtable.cpp //Coded by Jeff Weiss (jmweiss@uci.edu) //ICS 175A, Winter 2004 //Project: Motif detection in yeast //Purpose: provides a hash table of genomicPairs, to allow for storage of genomic //sequences and their realted statistical values as they relate to this project #include "dtable.h" #include #include #include using namespace std; /* *FUNCTION: DTable *PURPOSE: copy constructor *REMARKS: make sure rhs is complete, no checking happens here */ DTable::DTable(const DTable & rhs) { MAX_TABLE_SIZE=LOCAL_MAX_TABLE_SIZE; cout<<"created"; totalSequenceCount=rhs.totalSequenceCount; totalUniqueSequences=rhs.totalUniqueSequences; genomicSequenceList = rhs.genomicSequenceList; sequenceRepresentedLength = rhs.sequenceRepresentedLength; numOfSequences = rhs.numOfSequences; } /* *FUNCTION: DTable *PURPOSE: constructor *REMARKS: */ DTable::DTable() { MAX_TABLE_SIZE=LOCAL_MAX_TABLE_SIZE; cout<<"created"; totalSequenceCount=0; totalUniqueSequences=0; genomicSequenceList.resize(MAX_TABLE_SIZE); sequenceRepresentedLength = 0; numOfSequences = 0; } /* *AUTHOR: Joseph Bertolami *FUNCTION: operator = *PURPOSE: assignment operator overload *REMARKS: make sure rhs is complete, no error checking happens here */ DTable DTable::operator = (const DTable & rhs) { MAX_TABLE_SIZE=LOCAL_MAX_TABLE_SIZE; totalSequenceCount=rhs.totalSequenceCount; totalUniqueSequences=rhs.totalUniqueSequences; genomicSequenceList = rhs.genomicSequenceList; sequenceRepresentedLength = rhs.sequenceRepresentedLength; numOfSequences = rhs.numOfSequences; return *this; } /* *FUNCTION: getSequenceRepresentedLength *PURPOSE: returns the length of genomic squence that all items in the table represent *REMARKS: this sqeuence can not be reconstructed from DTable, it is determined at DTable creation time */ int DTable::getSequenceRepresentedLength() { return sequenceRepresentedLength; } /* *FUNCTION: setSequenceRepresentedLength *PURPOSE: sets the length of the sequence represented by all the entries in DTable *REMARKS: */ void DTable::setSequenceRepresentedLength(int newLength) { sequenceRepresentedLength = newLength; } /* *FUNCTION: setNumSequences *PURPOSE: sets the number of sequences in DTable *REMARKS: */ void DTable::setNumOfSequences(int newLength) { numOfSequences = newLength; } /* *FUNCTION: getNumOfSequences *PURPOSE: gets the number of sequences in the DTable *REMARKS: */ int DTable::getNumOfSequences() { return numOfSequences; } /* *FUNCTION: getHash *PURPOSE: returns the hash value of a given string *REMARKS: */ int DTable::getHash(DString toHash) { int hashKey = 0; for( unsigned int i = 0; i < toHash.size(); i++ ) { hashKey += (toHash[i] * (i+1)); } return hashKey%MAX_TABLE_SIZE; // return (toHash.size())%MAX_TABLE_SIZE; } /* *FUNCTION: getCount *PURPOSE: gets the number of occurances of a given gentic sequence *REMARKS: returns 0 if sequence is not found in the table */ int DTable::getCount(DString toFind) { int currentHashValue = getHash(toFind); //if ( genomicSequenceList[currentHashValue].empty() ) // return 0; for (unsigned int i = 0; i < genomicSequenceList[currentHashValue].size(); i++) { if ( genomicSequenceList[currentHashValue][i].getSequence().strcmp(toFind) != 0) { //item found return genomicSequenceList[currentHashValue][i].getCount(); } } return 0; //default return, not found case } /* *FUNCTION: add *PURPOSE: adds a GenomicPair to the table *REMARKS: used mainly for internal functioanlity, such as when loading from file */ void DTable::add(GenomicPair newPair) { if ( newPair.getSequence().size()!=0 ) genomicSequenceList[getHash(newPair.getSequence())].push_back(newPair); //add to table } /* *FUNCTION: add *PURPOSE: will add specified DString to table, if DString already exists, it will increment the count *REMARKS: */ void DTable::add(DString toAdd) { /* insert genomic sequence toAdd into genomicSequenceList increment totalSequence count if sequence has not been encountered before, increment totalUniqueSequences */ int currentHashValue = getHash(toAdd); if ( genomicSequenceList.size() < currentHashValue ) { genomicSequenceList.resize(currentHashValue+1); } if ( genomicSequenceList[currentHashValue].size()==0) { // cout << "in empty"; /* case where hash value has not been encountered before we have enountered a new sequence, so add it to the proper point */ GenomicPair pairToAdd(toAdd); // cout<<"Inserting: "; // pairToAdd.print(); genomicSequenceList[currentHashValue].push_back(pairToAdd); totalUniqueSequences++; totalSequenceCount++; } else { /* hash value has been encountered before either there has been a hash collision or a sequence has been repeated */ bool sequenceExists=false; for (unsigned int i = 0; i < genomicSequenceList[currentHashValue].size() ; i++) { if ( genomicSequenceList[currentHashValue][i].getSequence().strcmp(toAdd)!=0 ) { genomicSequenceList[currentHashValue][i].incrementCount(); sequenceExists=true; totalSequenceCount++; // cout<<"Incremeting Count for:"; // genomicSequenceList[currentHashValue][i].print(); } } if (!sequenceExists) { /* sequence isnt found, we have a hash collision */ GenomicPair pairToAdd(toAdd); genomicSequenceList[currentHashValue].push_back(pairToAdd); totalUniqueSequences++; totalSequenceCount++; // cout<<"collision insertion"; // pairToAdd.print(); } } } /* *FUNCTION: flatten *PURPOSE: will turn the hash table into a vector of GenomicPair's *REMARKS: since this removes the time advantage we have of using a hash table, use this only when nessecary */ vector DTable::flatten() { vector returnMe; /* return an array of GenomicPairs, with each element in the array being a unique GenomicPair this is the only public way to enumerate the list of genetic sequences */ for ( unsigned int i = 0; i < genomicSequenceList.size(); i++ ) { for( unsigned int j = 0; j < genomicSequenceList[i].size(); j++ ) { returnMe.push_back(genomicSequenceList[i][j]); } } return returnMe; } /* *FUNCTION: print *PURPOSE: print contents of DTable to stdout *REMARKS: for testing/development purposes only, do not call in production code */ void DTable::print() { cout<toPrint=flatten(); for ( unsigned int i = 0 ; i < toPrint.size(); i++ ) { toPrint[i].print(); cout<>curLine; //read and throw away human-readable input line inputFile>>curLine; //this is MAX_TABLE_SIZE MAX_TABLE_SIZE=curLine; genomicSequenceList.clear(); //clear current table genomicSequenceList.resize(MAX_TABLE_SIZE); //resize table inputFile>>curLine; //this is sequenceRepresentedLength sequenceRepresentedLength=curLine; inputFile>>curLine; //this is totalSequenceCount totalSequenceCount = curLine; inputFile>>curLine; //this is totalUniqueSequences totalUniqueSequences=curLine; inputFile.get(); //clear newline character while(!inputFile.eof()) { int newCount; int newScore1,newScore2,newScore3; char ch; string newInput; //DString newSequence; while( inputFile.get(ch) ) //remove pair seperator { if ( ch != '\n' ) newInput+=ch; else break; } newInput.clear(); while( inputFile.get(ch) ) //get genome name { if ( ch != '\n' ) newInput+=ch; else break; } // cout<