Main Page | Class Hierarchy | Class List | File List | Class Members

gmcheck.hpp

00001 
00002 // Copyright (c) 2003, The Institute for Genomic Research (TIGR), Rockville,
00003 // Maryland, U.S.A.  All rights reserved.
00005 
00006 static int INTERNAL_INT_ID;
00007 static int INITIAL_INT_ID;
00008 static int TERMINAL_INT_ID;
00009 static int SINGLE_INT_ID;
00010 
00011 static bool checkForAdjacentStopCodon(const RegionPrediction& predReg, const RegionPrediction& itemReg, 
00012                                       const DnaStr& theSequence) {
00013   if(itemReg.getEnd5()-predReg.getEnd3() > 3) {
00014     char codon[3];
00015     int idx = predReg.getEnd3()+1;
00016     codon[0] = theSequence[idx];
00017     codon[1] = theSequence[idx+1];
00018     codon[2] = theSequence[idx+2];
00019     return theSequence.isStop(codon);
00020   }
00021   return false;
00022 }
00023 
00024 static bool isValidFrameFromLen(int start, int stop, int bp_need, int frame) {
00025   const int raw_len = stop-start+1;
00026   if(raw_len == 1) {
00027     if(bp_need == 1 && frame == 0) return true;
00028     if(bp_need == 2 && frame == 1) return true;
00029     if(bp_need == 3 && frame == 2) return true;
00030   } else if(raw_len == 2) {
00031     if(bp_need == 1 && frame == 2) return true;
00032     if(bp_need == 2 && frame == 0) return true;
00033     if(bp_need == 3 && frame == 1) return true;
00034   } else {
00035     int leading_bp = (raw_len-bp_need) % 3;
00036     int predFrame = AnnotationItem::leadingBp2Frame(leading_bp);
00037     if(predFrame == frame) return true;
00038   }
00039   return false;
00040 }
00041 
00042 static char justDontAskAboutThisOptimizationHackShit[3];
00043 static const char* extractCodonFrom3to5(const AnnotationItem& item, const UnitLoc& ul,
00044                                         const DnaStr& theSequence, int bp_need) {
00045   char* codon = justDontAskAboutThisOptimizationHackShit; 
00046   if(bp_need <= 0) return codon;
00047   const int start = item.getEnd5(), stop = item.getEnd3();
00048   int clen = stop-start+1, cnt = 0;
00049 
00050   if(!isValidFrameFromLen(start,stop,bp_need,ul.getFrame())) return NULL;
00051   while(bp_need > 0 && clen > 0) {
00052     --bp_need;
00053     codon[bp_need] = theSequence[(stop-cnt)];
00054     --clen;
00055     ++cnt;
00056   }
00057 
00058   if(bp_need == 0) return codon;
00059 
00060   assert(ul.getStrnd()==theSequence.getStrnd());
00061   const UnitScore& unit = item.getMatrix().getUnitScore(ul);
00062   const AnnotationItem* predCell = unit.getPredCell();
00063   if(predCell) {
00064      const AnnotationItem& predItem = *predCell;
00065      const UnitLoc& pul = unit.getPredUnitLoc();
00066      return extractCodonFrom3to5(predItem,pul,theSequence,bp_need);
00067   }
00068   return NULL;
00069 }
00070 
00075 static bool checkFirstCodonOfNewModel(const AnnotationItem& item, int itemType, int currFrame,
00076                    const AnnotationItem& pred, const UnitLoc& pl) {
00077   int predType = pl.getType();
00078   int predFrame = pl.getFrame();
00079   bool rghtBnd = pl.isRghtBnd();
00080   bool isBeginParse = pl.isBeginParse();
00081   //preCond
00082   assert(currFrame==0||currFrame==1);
00083   assert((item.length()==1&&currFrame==1) || (currFrame==0&&(item.length()==1||item.length()==2)));
00084   if( itemType == Internal::getInstance().getScoreIdx() || predType == Internal::getInstance().getScoreIdx()) 
00085     return false;
00086   if( item.getStrnd() == dsu::ePos && (itemType==Terminal::getInstance().getScoreIdx())) return false;
00087   if( item.getStrnd() == dsu::eNeg && (itemType==Initial::getInstance().getScoreIdx())) return false;
00088 
00089   bool isContig = genemodels::isContiguous(pred.getEnd3(),pred.getStrnd(),item.getEnd5(),item.getStrnd());
00090   // just lookoing to see if this is part of the 1st codon of a new model
00091 
00092   // if this is first frame, then this must be the
00093   // the start of the codon and so the predecessor has to be part 
00094   // of another exon
00095   if( currFrame==0 && !isContig ) return true;
00096 
00097   // if this is part of the frame 1
00098   // then the predecessor must be of length 1 and it must be the start
00099   // of the new model
00100   bool predIsStart = false;
00101   int exonCnt = 0;
00102   const UnitScore& us = pred.getMatrix().getUnitScore(UnitLoc(predType,predFrame,exonCnt,pred.getStrnd(),rghtBnd,isBeginParse));
00103   if(us.getPredCell()) {
00104     const AnnotationItem& predPred = *us.getPredCell();
00105     predIsStart = !genemodels::isContiguous(predPred.getEnd3(), us.getPredStrnd(),pred.getEnd5(),pred.getStrnd());
00106   } else {
00107     predIsStart = true;
00108   }
00109   if( currFrame==1 && pred.length()==1 && predIsStart ) return true;
00110 
00111   // don't worry about yet
00112   // currFrame==3
00113   return false;
00114 }
00115 
00119 static bool passSpecialLookAhead(const AnnotationItem& item, int itemType, int currFrame,
00120                    const AnnotationItem& pred, const UnitLoc& pul, bool newModel) {
00121   // PreCond
00122   int predType = pul.getType();
00123   assert((item.length()==1&&currFrame==1) || (currFrame==0&&(item.length()==1||item.length()==2)));
00124   bool isContig = genemodels::isContiguous(pred.getEnd3(),pred.getStrnd(),item.getEnd5(),item.getStrnd());
00125   const DnaStr& str = item.getSeq();
00126 
00127   // even if contig, if the contig is only 2bp, can assume that we can check next bp, cuase an exon has to at least be 3bp
00128   if( (isContig && pred.length()==1) || !isContig) {
00129     // make sure that pred's pred is not contig, otherwise it is a >2bp exon and can't assume next bp will be included
00130     const UnitScore& us = pred.getMatrix().getUnitScore(pul);
00131     if(us.getPredCell() || !isContig) {
00132       bool checkCodon = !isContig;
00133       if(us.getPredCell() && isContig) {
00134         const AnnotationItem& predPred = *us.getPredCell();
00135         checkCodon = !genemodels::isContiguous(predPred.getEnd3(),us.getPredStrnd(),pred.getEnd5(),pred.getStrnd());
00136       }
00137       char codon[3];
00138       bool isStop = false;
00139       if(checkCodon && currFrame == 1) {
00140       // check that we're not at end of sequence
00141         if(item.getEnd3()+1 >= (signed)item.getSeq().length()) return true;
00142         assert(us.getStrnd()==pred.getStrnd());
00143         codon[0] = str[pred.getEnd3()];
00144         codon[1] = str[item.getEnd5()];
00145         codon[2] = str[item.getEnd5()+1];
00146         isStop = str.isStop(codon);
00147       } else if(checkCodon && currFrame == 0) {
00148         // skip at the end
00149         if(item.getEnd5()+2 >= (signed)str.length()) return true;
00150         codon[0] = str[item.getEnd5()];
00151         codon[1] = str[item.getEnd5()+1];
00152         codon[2] = str[item.getEnd5()+2];
00153         isStop = str.isStop(codon);
00154       }
00155       if(checkCodon) {
00156         if( !newModel ) {
00157           if(item.getStrnd()==dsu::ePos && 
00158              // model could end hear in which case this would be a valid terminal or single ending on a stop
00159              (predType != Terminal::getInstance().getScoreIdx() || predType != Single::getInstance().getScoreIdx()) && 
00160              (itemType != Terminal::getInstance().getScoreIdx() || itemType != Single::getInstance().getScoreIdx()) && isStop) {
00161             return false;
00162           } 
00163           else if(item.getStrnd()==dsu::eNeg && isStop)
00164             return false;
00165         } else if( newModel ) {
00166           // if neg strnd, can start off with stop, so nothing to check
00167           assert(itemType != Internal::getInstance().getScoreIdx());
00168           if(dsu::ePos==item.getStrnd())
00169             assert(itemType == Initial::getInstance().getScoreIdx() || itemType == Single::getInstance().getScoreIdx());
00170           else if(dsu::eNeg==item.getStrnd()) {
00171             assert(itemType == Terminal::getInstance().getScoreIdx() || itemType == Single::getInstance().getScoreIdx());
00172           }
00173           // positives start off with an an initial so we can check for stop
00174           bool isAdjStop = checkForAdjacentStopCodon(pred.getRegionPrediction(),item.getRegionPrediction(),str);
00175           if(item.getStrnd() == dsu::ePos && isStop) {
00176             return false;
00177             // negatives start off with an an terminal so we can check for no stop
00178           } else if( item.getStrnd() == dsu::eNeg && (!isStop && !isAdjStop) )
00179             return false;
00180         }
00181       }
00182     }
00183   }
00184   // made it through all the special checks
00185   return true;
00186 }
00187 
00188 static bool predEndsModel(const AnnotationItem& predItem, const UnitLoc& pul, int len) {
00189   const UnitScore& pc = predItem.getMatrix().getUnitScore(pul);
00190   if( len == predItem.length() ) return true;
00191   else if(pc.getPredCell()) {
00192     const AnnotationItem& prev = *pc.getPredCell();
00193     if( prev.getStrnd() != predItem.getStrnd() ||
00194        !genemodels::isContiguous(prev.getEnd3(),pc.getPredStrnd(),predItem.getEnd5(),predItem.getStrnd()) ) {
00195       return false;
00196     } else { 
00197       const UnitLoc& npc = pc.getPredUnitLoc();
00198       return predEndsModel(prev,npc,len-predItem.length());
00199     }
00200   } else {
00201     return false;
00202   }
00203 }
00204 
00205 
00210 bool doProcess(const AnnotationItem& item, const AnnotationItem& predItem, const UnitLoc& pul) {
00211   bool isContig =  genemodels::isContiguous(predItem.getEnd3(), predItem.getStrnd(), item.getEnd5(), item.getStrnd());
00212   if(isContig) return true;
00213 
00214   // bp 3
00215   /* check if the were connecting with a predecessor that was scored/valid
00216   **/
00217   const UnitScore& unit = predItem.getMatrix().getUnitScore(pul);
00218   if(!unit.isValid()) return false;
00219   if(predItem.length() >= 3) return true;
00220 
00221   // bp 2
00222   const AnnotationItem* predPredIter = unit.getPredCell();
00223   if(!predPredIter) return false;
00224   const AnnotationItem& predPredItem = *predPredIter;
00225   int end3 = predPredItem.getEnd3();
00226   // contiguous
00227   if(end3+1 != predItem.getEnd5()) return false;
00228   if( predPredItem.length() + predItem.length() >= 3) return true;
00229 
00230   // bp 1
00231   const UnitScore& nextUnit = 
00232     predPredItem.getMatrix().getUnitScore(unit.getPredUnitLoc());
00233   if(!nextUnit.isValid()) return false;
00234 
00235   const AnnotationItem* nextPredIter = nextUnit.getPredCell();
00236   if(!nextPredIter) return false;
00237   const AnnotationItem& nextPredItem = *nextPredIter;
00238   int nextEnd3 = nextPredItem.getEnd3();
00239   if(nextEnd3+1 != end3) return false;
00240   return true;
00241 }
00242 
00243 static int currentFrame2NumNeed(int frm) {
00244   switch(frm) {
00245   case 2: return 2;
00246   case 1: return 1;
00247   case 0: return 0;
00248   default:
00249     assert(0);
00250     return -1;
00251   }
00252 }
00253 
00254 static int getProperFrameForCurrFrameOne(int len) {
00255   int predFrame = -1;
00256   const int curr_frame = 0;
00257   if(len>=3) {
00258     int leading_bp = (len-currentFrame2NumNeed(curr_frame)) % 3;
00259     predFrame = AnnotationItem::leadingBp2Frame(leading_bp);
00260   } else if(len == 2) {
00261     predFrame = 1;
00262   } else {
00263     predFrame = 2;
00264   }
00265   return predFrame;
00266 }
00267 
00268 #if _RUNTIME_DEBUG >= 1
00269 static bool noStops(const DnaStr& seq, int start, int stop, bool verbose) {
00270 #else
00271 static bool noStops(const DnaStr& seq, int start, int stop) {
00272 #endif 
00273   for(int i = start; i <= stop-2; i+=3) {
00274 #if _RUNTIME_DEBUG >= 1
00275     if(verbose)
00276       seq.printCodon(i);
00277 #endif
00278     if(seq.isStop(i)) {
00279       return false;
00280     }
00281   }
00282   return true;
00283 }
00284 
00285 static bool hasAdjStop(const AnnotationItem& curr, const AnnotationItem& pred) {
00286   assert(curr.getStrnd() == dsu::eNeg);
00287   if((curr.getEnd5()-pred.getEnd3()) >= 4) {
00288     const DnaStr& str = curr.getSeq();
00289     if(str.isStop(curr.getEnd5()-3)) return true;
00290   }
00291   return false;
00292 }
00293 
00294 static bool check4PredContig(const AnnotationItem& item, const UnitLoc& ul, int bp_need) {
00295   const int start = item.getEnd5(), stop = item.getEnd3();
00296   int clen = stop-start+1;
00297   bp_need -= clen;
00298   if( bp_need <= 0 ) return true;
00299   const UnitScore& unit = item.getMatrix().getUnitScore(ul);
00300   const AnnotationItem* predCell = unit.getPredCell();
00301   const UnitLoc& pul = unit.getPredUnitLoc();
00302   if(predCell) {
00303      const AnnotationItem& predItem = *predCell;
00304      bool isContig = genemodels::isContiguous(predItem.getEnd3(),pul.getStrnd(),item.getEnd5(),item.getStrnd());
00305      if( !isContig && bp_need == 1) {
00306        return false;
00307      } else
00308        return check4PredContig(predItem,pul,bp_need);
00309   }
00310   return false;
00311 }
00312 
00313 inline static bool ignoreHelp(int itemPost, int itemPre, dsu::Strand_t strnd, bool isContig) {
00314   // ignore gene model starting with "internal"
00315   if(itemPost == TERMINAL_INT_ID && itemPre == TERMINAL_INT_ID && !isContig) return true;
00316   if(itemPost == INITIAL_INT_ID && itemPre == INITIAL_INT_ID && !isContig) return true;
00317 
00318   if(strnd == dsu::ePos) {
00319     if(itemPost == SINGLE_INT_ID &&
00320        (itemPre == INITIAL_INT_ID || itemPre == INTERNAL_INT_ID)) return true;
00321     if(itemPost == TERMINAL_INT_ID &&
00322       itemPre == SINGLE_INT_ID) return true;
00323     if(itemPost == INTERNAL_INT_ID &&
00324      (itemPre != INTERNAL_INT_ID && itemPre != INITIAL_INT_ID))
00325     return true;
00326     if(itemPost == INITIAL_INT_ID && itemPre == INTERNAL_INT_ID) return true;
00327   } else if(strnd == dsu::eNeg) {
00328     if(itemPost == SINGLE_INT_ID &&
00329        (itemPre == TERMINAL_INT_ID || itemPre == INTERNAL_INT_ID)) return true;
00330     if(itemPost == INITIAL_INT_ID &&
00331       itemPre == SINGLE_INT_ID) return true;
00332     if(itemPost == INTERNAL_INT_ID &&
00333        (itemPre != INTERNAL_INT_ID && itemPre != TERMINAL_INT_ID))
00334     return true;
00335     if(itemPost == TERMINAL_INT_ID && itemPre == INTERNAL_INT_ID) return true;
00336   } else assert(0);
00337      
00338   return false;
00339 }
00340 
00341 inline static  bool ignoreConnection(int currType, dsu::Strand_t currStrnd, int predType, dsu::Strand_t predStrnd, bool isContig) {
00342   if(currStrnd == predStrnd && isContig) return false;
00343 
00344   if(currStrnd == dsu::ePos && predStrnd == dsu::ePos) {
00345     return ignoreHelp(currType,predType,currStrnd,isContig);
00346   } else if(currStrnd == dsu::ePos && predStrnd == dsu::eNeg) {
00347     if((currType==SINGLE_INT_ID||currType==INITIAL_INT_ID) && 
00348        (predType==SINGLE_INT_ID||predType==INITIAL_INT_ID)) return false;
00349     return true;
00350   } else if(currStrnd == dsu::eNeg && predStrnd == dsu::ePos) {
00351     if((currType==SINGLE_INT_ID||currType==TERMINAL_INT_ID) && 
00352        (predType==SINGLE_INT_ID||predType==TERMINAL_INT_ID)) return false;
00353     return true;
00354   } else if(currStrnd == dsu::eNeg && predStrnd == dsu::eNeg) {
00355     return ignoreHelp(currType,predType,currStrnd,isContig);
00356   }
00357   return true;
00358 }
00359 
00363 static bool isEndOfStartCodonPass(const AnnotationItem& item, int itemType, int currFrame,
00364                                   const AnnotationItem& pred, int predType, int predFrame,
00365                                   const DnaStr& theSequence) {
00366   return true;
00367 }