/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* SRE prototype.
 * to compile:
 * cc -L ~/lib/squid.linux -I ~/lib/squid.linux -o qrna main.c -lsquid -lm
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>

#include "squid.h"
#include "structs.h"


#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif
char Alphabet[] = AMINO_ALPHABET;
char DNAAlphabet[] = DNA_ALPHABET;

/* Function: KHS2ct()
 * 
 * Purpose:  Convert a secondary structure string to an array of integers
 *           representing what position each position is base-paired 
 *           to (0..len-1), or -1 if none. This is off-by-one from a
 *           Zuker .ct file representation.
 *           
 *           The .ct representation can accomodate pseudoknots but the 
 *           secondary structure string cannot easily; the string contains
 *           "Aa", "Bb", etc. pairs as a limited representation of
 *           pseudoknots. The string contains "><" for base pairs.
 *           Other symbols are ignored. If allow_pseudoknots is FALSE,
 *           the pseudoknot symbols will be ignored and these positions
 *           will be treated as single stranded.
 *           
 * Return:   ret_ct is allocated here and must be free'd by caller.
 *           Returns 1 on success, 0 if ss is somehow inconsistent.
 */
int 
KHS2ct(char *ss, int len, int allow_pseudoknots, int **ret_ct)
{
  struct intstack_s *dolist[27];
  int *ct;
  int  i;
  int  pos, pair;
  int  status = 1;              /* success or failure return status */

  for (i = 0; i < 27; i++)
    dolist[i] = InitIntStack();

  if ((ct = (int *) malloc (len * sizeof(int))) == NULL)
    Die("malloc failed");
  for (pos = 0; pos < len; pos++)
    ct[pos] = -1;

  for (pos = 0; ss[pos] != '\0'; pos++)
    {
      if (ss[pos] > 127) status = 0; /* bulletproof against SGI buggy ctype.h */

      else if (ss[pos] == '>')  /* left side of a pair: push onto stack 0 */
        PushIntStack(dolist[0], pos);
      else if (ss[pos] == '<')  /* right side of a pair; resolve pair */
        {
          if (! PopIntStack(dolist[0], &pair))
            { status = 0; }
          else
            {
              ct[pos]  = pair;
              ct[pair] = pos;
            }
        }
                                /* same stuff for pseudoknots */
      else if (allow_pseudoknots && isupper((int) ss[pos]))
        PushIntStack(dolist[ss[pos] - 'A' + 1], pos);
      else if (allow_pseudoknots && islower((int) ss[pos]))
        {
          if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair))
            { status = 0; }
          else
            {
              ct[pos]  = pair;
              ct[pair] = pos;
            }
        }
      else if (allow_pseudoknots && !isgap(ss[pos])) status = 0; /* bad character */
    }

  for (i = 0; i < 27; i++)
    if ( FreeIntStack(dolist[i]) > 0)
      status = 0;

  *ret_ct = ct;
  return status;
}

/* Function: CheckNullModel()
 * Date:     SRE, Wed Jun 10 13:36:54 1998 [St. Louis]
 *
 * Purpose:  Verify that a null model is ok
 *
 * Args:     nullmodel - model to check [4][4]
 *
 * Returns:  void. Prints some stuff.
 */
void
CheckNullModel(double nullmodel[4][4])
{
  int x;
  int y;
  double sum = 0.0;

  for (x = 0; x < 4; x++)
    for (y = 0; y < 4; y++)
      sum += nullmodel[x][y];
  printf("null sum is %f\n", sum);
}

/* Function: CheckPAMModel()
 * Date:     SRE, Wed Jun 10 13:40:01 1998 [St. Louis]
 *
 * Purpose:  Verify that a PAM model is OK
 *
 * Args:     pammodel - model to check [64][64]
 *
 * Returns:  void. prints stuff.
 */
void
CheckPAMModel(double pammodel[64][64])
{
  int x, y;
  double sum = 0.0;

  for (x = 0; x < 64; x++)
    for (y = 0; y < 64; y++)
      sum += pammodel[x][y];
  printf("pam sum is %f\n", sum);
}


/* Function: PrintCodonString()
 * Date:     SRE, Wed Jun 10 14:00:28 1998 [St. Louis]
 */
void
PrintCodonString(int x)
{
  char s[4];

  s[0] = DNAAlphabet[x/16];
  s[1] = DNAAlphabet[(x%16)/4];
  s[2] = DNAAlphabet[x%4];
  s[3] = '\0';
  printf("%3s ", s);
}


/* Function: PrintPAMModel()
 * Date:     SRE, Wed Jun 10 13:51:53 1998 [St. Louis]
 *
 * Purpose:  Print a pam model
 *
 * Args:     pammodel[64][64]
 *           nullmodel[4][4]
 *
 * Returns:  void. prints stuff.
 */
void
PrintPAMModel(double pammodel[64][64], double nullmodel[4][4])
{
  int x,y;
  double nullp;
  double info = 0.0;
  double expect = 0.0;

  for (x = 0; x < 64; x++)
    for (y = 0; y < 64; y++)
      {
	nullp  = nullmodel[(x/16)][y/16];
	nullp *= nullmodel[((x%16)/4)][((y%16)/4)];
	nullp *= nullmodel[(x%4)][(y%4)];

	PrintCodonString(x);
	PrintCodonString(y), 
	printf("%3s %3s %12f %12f %12f\n",
	       stdcode3[x], stdcode3[y], 
	       pammodel[x][y], nullp,
	       log(pammodel[x][y] / nullp));

	info += pammodel[x][y] * log(pammodel[x][y] / nullp);
	expect += nullp * log(pammodel[x][y] / nullp);
      }

  printf("info content per codon: %f bits\n", info);
  printf("expectation per codon:  %f bits\n", expect);
}

/* Function: PrintRNAModel()
 * Date:     SRE, Wed Jun 10 16:11:23 1998 [St. Louis]
 *
 * Purpose:  print base pair alignment stuff for RNA model
 *
 * Args:     rnamodel, nullmodel
 *
 * Returns:  void, prints stuff.
 */
void
PrintRNAModel(struct rna_s *rnamodel, double nullmodel[4][4])
{
  int x1,x2,y1,y2;
  double nullp;
  double info = 0.0;
  double expect = 0.0;

  for (x1 = 0; x1 < 4; x1++)
    for (y1 = 0; y1 < 4; y1++)
      for (x2 = 0; x2 < 4; x2++)
	for (y2 = 0; y2 < 4; y2++)
	  {
	    nullp = nullmodel[x1][x2] * nullmodel[y1][y2];
	    printf("%c-%c %c-%c %12f %12f %12f\n",
		   DNAAlphabet[x1], DNAAlphabet[y1],
		   DNAAlphabet[x2], DNAAlphabet[y2],
		   rnamodel->pxy[x1][y1][x2][y2], nullp,
		   log(rnamodel->pxy[x1][y1][x2][y2] / nullp));
	    info += rnamodel->pxy[x1][y1][x2][y2] * 
	      log(rnamodel->pxy[x1][y1][x2][y2] / nullp);
	    expect += nullp * 
	      log(rnamodel->pxy[x1][y1][x2][y2] / nullp);
	  }
  printf("RNA model info   = %f bits\n", info);
  printf("RNA model expect = %f bits\n", expect);
}





/* Function: NormalizePAMModel()
 * Date:     SRE, Wed Jun 10 13:41:57 1998 [St. Louis]
 *
 * Purpose:  Normalize a PAM model; because of integer->float
 *           conversion, can expect some rough edges.
 *
 * Args:     pammodel[64][64]
 *
 * Returns:  void
 */
void
NormalizePAMModel(double pammodel[64][64])
{
  int x, y;
  double sum = 0.0;

  for (x = 0; x < 64; x++)
    for (y = 0; y < 64; y++)
      sum += pammodel[x][y];
  for (x = 0; x < 64; x++)
    for (y = 0; y < 64; y++)
      pammodel[x][y] /= sum;
}



/* Function: DefaultGeneticCode()
 * 
 * Purpose:  Configure aacode, mapping triplets to amino acids.
 *           Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
 *           AA index: alphabetical: A=0,C=1... Y=19
 *           Stop codon: -1. 
 *           Uses the stdcode1[] global translation table from SQUID.
 *           
 * Args:     aacode  - preallocated 0.63 array for genetic code
 *                     
 * Return:   (void)
 */
void
DefaultGeneticCode(int aacode[64])
{
  int x;

  for (x = 0; x < 64; x++) {
    if (*(stdcode1[x]) == '*') aacode[x] = -1;
    else                       aacode[x] = SYMIDX(*(stdcode1[x]));
  }
}

/* Function: DefaultCodonBias()
 * 
 * Purpose:  Configure a codonbias table, mapping triplets to
 *           probability of using the triplet for the amino acid
 *           it represents: P(triplet | aa).
 *           The default is to assume codons are used equiprobably.
 *           
 * Args:     codebias:  0..63 array of P(triplet|aa), preallocated.
 * 
 * Return:   (void)
 */
void
DefaultCodonBias(double codebias[64])
{
  codebias[0]  = 1./2.; /* AAA Lys 2 */
  codebias[1]  = 1./2.; /* AAC Asn 2 */
  codebias[2]  = 1./2.; /* AAG Lys 2 */
  codebias[3]  = 1./2.; /* AAU Asn 2 */
  codebias[4]  = 1./4.; /* ACA Thr 4 */
  codebias[5]  = 1./4.; /* ACC Thr 4 */
  codebias[6]  = 1./4.; /* ACG Thr 4 */
  codebias[7]  = 1./4.; /* ACU Thr 4 */
  codebias[8]  = 1./6.; /* AGA Ser 6 */
  codebias[9]  = 1./6.; /* AGC Arg 6 */
  codebias[10] = 1./6.; /* AGG Ser 6 */
  codebias[11] = 1./6.; /* AGU Arg 6 */
  codebias[12] = 1./3.; /* AUA Ile 3 */
  codebias[13] = 1./3.; /* AUC Ile 3 */
  codebias[14] = 1.;    /* AUG Met 1 */
  codebias[15] = 1./3.; /* AUU Ile 3 */
  codebias[16] = 1./2.; /* CAA Gln 2 */
  codebias[17] = 1./2.; /* CAC His 2 */
  codebias[18] = 1./2.; /* CAG Gln 2 */
  codebias[19] = 1./2.; /* CAU His 2 */
  codebias[20] = 1./4.; /* CCA Pro 4 */
  codebias[21] = 1./4.; /* CCC Pro 4 */
  codebias[22] = 1./4.; /* CCG Pro 4 */
  codebias[23] = 1./4.; /* CCU Pro 4 */
  codebias[24] = 1./6.; /* CGA Arg 6 */
  codebias[25] = 1./6.; /* CGC Arg 6 */
  codebias[26] = 1./6.; /* CGG Arg 6 */
  codebias[27] = 1./6.; /* CGU Arg 6 */
  codebias[28] = 1./6.; /* CUA Leu 6 */
  codebias[29] = 1./6.; /* CUC Leu 6 */
  codebias[30] = 1./6.; /* CUG Leu 6 */
  codebias[31] = 1./6.; /* CUU Leu 6 */
  codebias[32] = 1./2.; /* GAA Glu 2 */
  codebias[33] = 1./2.; /* GAC Asp 2 */
  codebias[34] = 1./2.; /* GAG Glu 2 */
  codebias[35] = 1./2.; /* GAU Asp 2 */
  codebias[36] = 1./4.; /* GCA Ala 4 */
  codebias[37] = 1./4.; /* GCC Ala 4 */
  codebias[38] = 1./4.; /* GCG Ala 4 */
  codebias[39] = 1./4.; /* GCU Ala 4 */
  codebias[40] = 1./4.; /* GGA Gly 4 */
  codebias[41] = 1./4.; /* GGC Gly 4 */
  codebias[42] = 1./4.; /* GGG Gly 4 */
  codebias[43] = 1./4.; /* GGU Gly 4 */
  codebias[44] = 1./4.; /* GUA Val 4 */
  codebias[45] = 1./4.; /* GUC Val 4 */
  codebias[46] = 1./4.; /* GUG Val 4 */
  codebias[47] = 1./4.; /* GUU Val 4 */
  codebias[48] = 0.;    /* UAA och - */
  codebias[49] = 1./2.; /* UAC Tyr 2 */
  codebias[50] = 0.;    /* UAG amb - */
  codebias[51] = 1./2.; /* UAU Tyr 2 */
  codebias[52] = 1./6.; /* UCA Ser 6 */
  codebias[53] = 1./6.; /* UCC Ser 6 */
  codebias[54] = 1./6.; /* UCG Ser 6 */
  codebias[55] = 1./6.; /* UCU Ser 6 */
  codebias[56] = 0.;    /* UGA opa - */
  codebias[57] = 1./2.; /* UGC Cys 2 */
  codebias[58] = 1.;    /* UGG Trp 1 */
  codebias[59] = 1./2.; /* UGU Cys 2 */
  codebias[60] = 1./6.; /* UUA Leu 6 */
  codebias[61] = 1./2.; /* UUC Phe 2 */
  codebias[62] = 1./6.; /* UUG Leu 6 */
  codebias[63] = 1./2.; /* UUU Phe 2 */
}


/* Function: ConstructESTModel()
 * Date:     SRE, Wed Jun 10 08:46:06 1998 [St. Louis]
 *
 * Purpose:  Return an estmodel: a 20x64 matrix for P(xyz | a),
 *           for converting an amino acid into a distribution
 *           over possible codons. Incorporates an error
 *           model as well as a codon bias model.
 *
 * Derivation:
 *           call observed codon x
 *           call actual codon y
 *           call actual amino acid a.
 *           want: P(x | a)
 *            
 *           P(x,a) = \sum_y P(x,y,a)     
 *           P(x,a) = \sum_y P(x | y,a) P(y,a)
 *           P(x,a) = \sum_y P(x | y,a) P(y | a) P(a)
 *                           ^ assume base errors independent of a
 *           P(x,a) = 
 *           P(x | a) P(a) =  \sum_y P(x | y) P(y | a) P(a)                 
 *           P(x | a) = \sum_y P(x | y) P(y | a) 
 *                   error model ^        ^ codon bias model
 *                   
 * Args:     accuracy - per-base sequence accuracy
 *           aacode   - genetic code, 0..63 lookup for aa indices 0..19
 *           codebias - 0..63 codon bias probabilities per aa
 *           estmodel - RETURN: [20][64] A..YxAAA..UUU P(x | a)
 *
 * Returns:  (void)
 *           Fills in the estmodel.
 */
void
ConstructESTModel(int aacode[64], double codebias[64], 
		  double estmodel[20][64])
{
  double pxy;			/* P(x | y) */
  int x, y, a;			/* indices */

  
  for (a = 0; a < 20; a++)
    for (x = 0; x < 64; x++)
      {
	estmodel[a][x] = 0.0;
	for (y = 0; y < 64; y++)
	  {
	    if (aacode[y] != a) continue; /* genetic code, P(y|a) = 0.0 */

				/* count mutations from y to x brute force */
	    pxy =   (x / 16 == y / 16)     ? accuracy : 1.0 - accuracy;
	    pxy *=  ((x%16)/4 == (y%16)/4) ? accuracy : 1.0 - accuracy;
	    pxy *=  (x%4 == y%4)           ? accuracy : 1.0 - accuracy;
    
	    estmodel[a][x] += pxy * codebias[y];
	  }
      }
}

/* Function: ConstructPAMModel()
 * Date:     SRE, Wed Jun 10 08:13:35 1998 [St. Louis]
 *
 * Purpose:  Given a substitution matrix, return a
 *           PAMmodel: a 64x64 joint probability matrix
 *           for codons aligned to codons.
 *
 * Args:     pam      - 27x27 integer score matrix, alphabetic
 *           scale    - scale of pam matrix           
 *           aafq     - background amino acid frequencies 0..19
 *           estmodel - 64x20 AAA..UUUxA..Y P(xyz | a) conversion to codons
 *           pammodel - 64x64 AAA..UUUxAAA..UUU joint prob matrix
 *
 * Returns:  (void)
 *           fills in pammodel.
 */
void
ConstructPAMModel(int **pam, float scale, float *aafq,
		  double estmodel[20][64], double pammodel[64][64])
{
  int a, b;			/* 0..20 indices for amino acids */
  int idxa, idxb;		/* 0..26 indices for amino acids */
  double aajoint[20][20];	/* joint aa probs P(ab)          */
  int xyz1, xyz2;		/* indices for codons 0..63      */


  /* Convert integer pam matrix back to joint probabilities P(ab)
   */
  for (a = 0; a < 20; a++)
    for (b = 0; b < 20; b++)
      {
	idxa = Alphabet[a] - 'A';
	idxb = Alphabet[b] - 'A';
	aajoint[a][b] = aafq[a] * aafq[b] * exp((float) pam[idxa][idxb] * scale);
      }

  /* Fill in pammodel
   *   P(xyz1,xyz2) = \sum_ab P(xyz1,xyz2 | ab) P(ab)
   *     independence assumption for converting aa to codon gives:
   *   P(xyz1,xyz2) = \sum_ab P(xyz1 | a) P(xyz2 | b) P(ab)
   *                             ^ estmodel  ^estmodel  ^aajoint  
   */
  for (xyz1 = 0; xyz1 < 64; xyz1++)
    for (xyz2 = 0; xyz2 < 64; xyz2++)
      {
	pammodel[xyz1][xyz2] = 0.0;
	for (a = 0; a < 20; a++)
	  for (b = 0; b < 20; b++)
	    pammodel[xyz1][xyz2] += 
	      estmodel[a][xyz1] * estmodel[b][xyz2] * aajoint[a][b];
      }

  /* Normalize it
   */
  NormalizePAMModel(pammodel);
}



/* Function: ConstructRNAModel()
 * Date:     SRE, Wed Jun 10 15:40:08 1998 [St. Louis]
 *
 * Purpose:  Build the RNA model from the null model under
 *           certain simplifying assumptions; namely, that
 *           P(x1 y1 x2 y2) = P(x1 y1) P (x2 y2) P(x1 x2) / (P(x1) P(x2))
 *
 * Args:     nullmodel[4][4]
 *           rnamodel  - pointer to alloc'ed rnamodel
 *
 * Returns:  void
 *           fills in rnamodel
 */
void
ConstructRNAModel(double nullmodel[4][4], struct rna_s *rnamodel)
{
  int x,y;
  int x1,x2,y1,y2;
  double pairprob[4][4];
  double sum;

  rnamodel->tsl = 0.30;		/* made up transitions */
  rnamodel->tsr = 0.30;
  rnamodel->tsp = 0.30;
  rnamodel->tss = 0.04;
  rnamodel->tse = 0.06;
  
				/* copy singlets */
  for (x = 0; x < 4; x++)
    for (y = 0; y < 4; y++)
      rnamodel->px[x][y] = nullmodel[x][y];

				/* precalculate pairprob */
  for (x = 0; x < 4; x++)
    for (y = 0; y < 4; y++)
      pairprob[x][y] = 0.0;
				/* ad hoc: GU = 1/7 of pairs, split evenly */
				/* (1:5.57 ratio from Elena, SSU90 count) */
  pairprob[0][3] = pairprob[3][0] = pairprob[1][2] = pairprob[2][1] = 0.21428571;
  pairprob[2][3] = pairprob[3][2] = 0.07142857;

				/* calculate pairs */
  sum = 0.0;
  for (x1 = 0; x1 < 4; x1++)
    for (y1 = 0; y1 < 4; y1++)
      for (x2 = 0; x2 < 4; x2++)
	for (y2 = 0; y2 < 4; y2++)
	  {
	    rnamodel->pxy[x1][y1][x2][y2] =
	      pairprob[x2][y2] * pairprob[x1][y1] *
	      ( (nullmodel[x1][x2] / (0.25 * 0.25)) + 
		(nullmodel[y1][y2] / (0.25 * 0.25))) * 0.5;
	    sum +=  rnamodel->pxy[x1][y1][x2][y2];
	  }
  printf("rna sum is %f\n", sum);

				/* renormalize to clean up a bit */
  for (x1 = 0; x1 < 4; x1++)
    for (y1 = 0; y1 < 4; y1++)
      for (x2 = 0; x2 < 4; x2++)
	for (y2 = 0; y2 < 4; y2++)
	  rnamodel->pxy[x1][y1][x2][y2] /= sum;
}




/* Function: ConstructNullModel()
 * Date:     SRE, Wed Jun 10 08:00:30 1998 [St. Louis]
 *
 * Purpose:  Given a PAMModel, marginalize to calculate
 *           a random model.
 *
 * Args:     pammodel  - 64x64 AAA..UUUxAAA..UUU joint prob matrix (prealloc)
 *           nullmodel - 4x4 A..UxA..U joint prob matrix (prealloc)           
 *
 * Returns:  (void)
 *           Fills in nullmodel.
 */
void
ConstructNullModel(double pammodel[64][64], double nullmodel[4][4])
{
  int x1, y1, z1;
  int x2, y2, z2;
  
  /* Zero null model
   */
  for (x1 = 0; x1 < 4; x1++)
    for (x2 = 0; x2 < 4; x2++)
      nullmodel[x1][x2] = 0.0;

  /* Marginalize and average over three positions
   */
  for (x1 = 0; x1 < 4; x1++)
    for (y1 = 0; y1 < 4; y1++)
      for (z1 = 0; z1 < 4; z1++)
	for (x2 = 0; x2 < 4; x2++)
	  for (y2 = 0; y2 < 4; y2++)
	    for (z2 = 0; z2 < 4; z2++)
	      {
		nullmodel[x1][x2] += pammodel[CODON(x1,y1,z1)][CODON(x2,y2,z2)] / 3.0;
		nullmodel[y1][y2] += pammodel[CODON(x1,y1,z1)][CODON(x2,y2,z2)] / 3.0;
		nullmodel[z1][z2] += pammodel[CODON(x1,y1,z1)][CODON(x2,y2,z2)] / 3.0;
	      }
}


/* Function: ScoreWithNull()
 * Date:     SRE, Wed Jun 10 16:46:16 1998 [St. Louis]
 *
 * Purpose:  Score an ungapped sequence alignment with null model.
 *
 * Args:     s1, s2    -- equal length sequences, ACGT /only/
 *           L         -- lengths of s1,s2 
 *           nullmodel -- [4][4] substitution matrix
 *
 * Returns:  log likelihood, log P(s1,s2 | null)
 */
double
ScoreWithNull(char *s1, char *s2, int L, double nullmodel[4][4])
{
  int x;			/* position in s1,s2 */
  double sc;

  sc = 0.0;
  for (x = 0; x < L; x++)
    sc += log(nullmodel[DNAIDX(s1[x])][DNAIDX(s2[x])]);
  return sc;
}


/* Function: pam_score_single_frame()
 * Date:     SRE, Wed Jun 10 16:52:46 1998 [St. Louis]
 *
 * Purpose:  called by ScoreWithPam().
 *           Scores a single frame, starting with first position
 *
 * Args:     s1, s2    -- equal length sequences, ACGT /only/
 *           L         -- lengths of s1,s2 
 *           pammodel  -- [64][64] codon substitution matrix
 *
 * Returns:  log likelihood, log P(s1,s2 | pammodel,frame)
 */
double
pam_score_single_frame(char *s1, char *s2, int L, double pammodel[64][64])
{
  int x;
  int codon1, codon2;		/* lookups for two codons  */
  double sc;

  sc = 0.0;
  for (x = 0; x < L-2; x += 3)	/* skip by triplets */
    {
      codon1 = DNAIDX(s1[x]) * 16 + DNAIDX(s1[x+1]) * 4 + DNAIDX(s1[x+2]);
      codon2 = DNAIDX(s2[x]) * 16 + DNAIDX(s2[x+1]) * 4 + DNAIDX(s2[x+2]);
      sc += log(pammodel[codon1][codon2]);
    }
  return sc;
}


/* Function: ScoreWithPAM()
 * Date:     SRE, Wed Jun 10 16:50:31 1998 [St. Louis]
 *
 * Purpose:  Score an ungapped sequence alignment with PAM model.
 *           Assumes equal prior over six frames, and sums.
 *
 * Args:     s1, s2    -- equal length sequences, ACGT /only/
 *           L         -- lengths of s1,s2 
 *           pammodel  -- [64][64] codon substitution matrix
 *
 * Returns:  log likelihood, log P(s1,s2 | pammodel)
 */
double
ScoreWithPAM(char *s1, char *s2, int L, double pammodel[64][64])
{
  double sc[6];
  char *r1, *r2;                   /* reverse complements */
  double totsc;
  int    i;			/* frame counter */

				/* frames 0..2 */
  i = 0;
  if (L >= 3) { sc[i] = pam_score_single_frame(s1,   s2,   L,   pammodel); i++;}
  if (L >= 4) { sc[i] = pam_score_single_frame(s1+1, s2+1, L-1, pammodel); i++;}
  if (L >= 5) { sc[i] = pam_score_single_frame(s1+2, s2+2, L-2, pammodel); i++;}

				/* revcomp */
  r1 = MallocOrDie(sizeof(char) * (L+1));
  r2 = MallocOrDie(sizeof(char) * (L+1));
  revcomp(r1, s1);
  revcomp(r2, s2);
				/* frames 3..5 */
  if (L >= 3) {sc[i] = pam_score_single_frame(r1,   r2,   L,   pammodel); i++; }
  if (L >= 4) {sc[i] = pam_score_single_frame(r1+1, r2+1, L-1, pammodel); i++; }
  if (L >= 5) {sc[i] = pam_score_single_frame(r1+2, r2+2, L-2, pammodel); i++; }
  
  totsc = DLogSum(sc, i);
  totsc += log(1.0/6.0);	/* prior */

  free(r1);
  free(r2);
  return totsc;
}

/* Function: ScoreWithRNA()
 * Date:     SRE, Wed Jun 10 17:58:05 1998 [St. Louis]
 *
 * Purpose:  Score an ungapped sequence alignment with RNA model.
 *           Sums over all possible structures. Nussinov-like algorithm
 *
 */
double
ScoreWithRNA(char *s1, char *s2, int L, struct rna_s *rna)
{
  double **mx;
  double *sc;
  double  totsc;
  int     idx;
  int     x1,x2,y1,y2;
  int     i,j,d,k;

  /* Allocate a 0..L-1 square matrix.
   */
  mx = MallocOrDie(sizeof(double *) * L);
  for (i = 0; i < L; i++)
    mx[i] = MallocOrDie(sizeof(double) * L);
  sc = MallocOrDie(sizeof(double) * (L+3));

  /* Initialize diagonal and short subseqs (L state only)
   * Enforce a minimum loop length of 3
   */
  for (i = 0; i < L; i++)
    {
      x1 = DNAIDX(s1[i]);
      x2 = DNAIDX(s2[i]);
      mx[i][i] = log(rna->px[x1][x2]) + log(rna->tsl) + log(rna->tse);
    }
  for (i = 0; i < L-1; i++)
    {
      x1 = DNAIDX(s1[i]);
      x2 = DNAIDX(s2[i]);
      mx[i][i+1] = mx[i+1][i+1] + log(rna->tsl) + log(rna->px[x1][x2]);
    }
  for (i = 0; i < L-2; i++)
    {
      x1 = DNAIDX(s1[i]);
      x2 = DNAIDX(s2[i]);
      mx[i][i+2] = mx[i+1][i+2] + log(rna->tsl) + log(rna->px[x1][x2]);
    }
      

  /* Main recursion
   */
  for (d = 4; d <= L; d++)
    for (i = 0; i <= L-d; i++)
      {
	j = i + d - 1;
	x1 = DNAIDX(s1[i]);
	x2 = DNAIDX(s2[i]);
	y1 = DNAIDX(s1[j]);
	y2 = DNAIDX(s2[j]);
				/* i left; connect to i+1, j; emit x */
	idx = 0;
	sc[idx] = mx[i+1][j] + log(rna->tsl) + log(rna->px[x1][x2]);
	idx++;
				/* j right; connect to i, j-1; emit y */
	sc[idx] = mx[i][j-1] + log(rna->tsr) + log(rna->px[y1][y2]);
	idx++;
				/* i,j pair; connect to i+1,j-1; emit xy */
	sc[idx] = mx[i+1][j-1] + log(rna->tsp) + log(rna->pxy[x1][y1][x2][y2]);
	idx++;
				/* bifurcations */
	for (k = i; k < j; k++)
	  {
	    sc[idx] = mx[i][k] + mx[k+1][j] + log(rna->tss);
	    idx++;
	  }
				/* summation */
	mx[i][j] = DLogSum(sc, idx);
      }

  totsc = mx[0][L-1];
  Free2DArray(mx, L);
  free(sc);
  return totsc;
}

/* Function: SimulateNullSequences()
 * Date:     SRE, Thu Jun 11 09:09:49 1998 [St. Louis]
 *
 * Purpose:  Given a sequence, simulate a second sequence
 *           related to it by the nullmodel.
 *
 * Args:     s1        - the starting sequence 
 *           L         - length of s1
 *           nullmodel - the null model
 *
 * Returns:  s2, an alloc'ed simulated second sequence
 */
char *
SimulateNullSequence(char *s1, int L, double nullmodel[4][4])
{
  double pxy[4][4];		/* P(x | y) conditionals */
  char *s2;
  int   x,y;
  int   pos;

  s2 = MallocOrDie(sizeof(char) * (L+1));
  
  /* Calculate conditionals (symmetric)
   */
  for (x = 0; x < 4; x++)
    {
      for (y = 0; y < 4; y++)
	pxy[x][y] = nullmodel[x][y] / 0.25;
      DNorm(pxy[x], 4);
    }
  
  for (pos = 0; pos < L; pos++)
    {
      x = DNAIDX(s1[pos]);
      y = DChoose(pxy[x], 4);
      s2[pos] = DNAAlphabet[y];
    }
  s2[L] = '\0';
  return s2;
}

/* Function: SimulateCodingSequences()
 * Date:     SRE, Thu Jun 11 10:06:28 1998 [St. Louis]
 *
 * Purpose:  Simulate an ungapped alignment of two coding
 *           sequences, given a pammodel
 *
 * Args:     L        - length of the alignment; will be made next lowest multiple of three
 *           pammodel - model, [64][64] joint probabilities
 *           ret_s1   - RETURN: first seq
 *           ret_s2   - RETURN: second seq
 *
 * Returns:  void
 */
void
SimulateCodingSequences(int L, double pammodel[64][64], char **ret_s1, char **ret_s2)
{
  char  *s1, *s2;
  int    pos;
  double p[4096];
  int    x,y, i;

  L = (L / 3) * 3;  /* make a multiple of three */

  s1 = MallocOrDie(sizeof(char) * (L+1));
  s2 = MallocOrDie(sizeof(char) * (L+1));
  
  for (i = 0; i < 4096; i++) 
    {
      x = i / 64;
      y = i % 64;
      p[i] = pammodel[x][y];
    }

  for (pos = 0; pos < L; pos += 3)
    {
      i = DChoose(p, 4096);
      x = i / 64;
      y = i % 64;

      s1[pos]   = DNAAlphabet[x/16];
      s1[pos+1] = DNAAlphabet[(x%16)/4];
      s1[pos+2] = DNAAlphabet[x%4];

      s2[pos]   = DNAAlphabet[y/16];
      s2[pos+1] = DNAAlphabet[(y%16)/4];
      s2[pos+2] = DNAAlphabet[y%4];
    }
  s1[L] = '\0';
  s2[L] = '\0';
  *ret_s1 = s1;
  *ret_s2 = s2;
  return;
}

/* Function: SimulateRNASequences()
 * Date:     SRE, Thu Jun 11 10:33:28 1998 [St. Louis]
 *
 * Purpose:  Simulated an ungapped alignment of two sequences
 *           generated by the RNA model. Not a complete generation:
 *           rather, is given a structure (implying a given parse tree)
 *           and just generates residues.
 *           
 * Args:     ct     : Zuker .ct notation for an RNA structure 
 *                      e.g. 0..L-1 values if paired -1 if not          
 *           L      : length of structure
 *           ret_s1 : RETURN: string 1
 *           ret_s2 : RETURN: string 2
 *
 * Returns:  void
 */
void
SimulateRNASequences(struct rna_s *rna, int *ct, int L, char **ret_s1, char **ret_s2)
{
  char *s1, *s2;
  double pair[256];
  double single[16];
  int z;
  int x1,x2,y1,y2;
  int i,j;
  double totsc;

  s1 = MallocOrDie(sizeof(char) * (L+1));
  s2 = MallocOrDie(sizeof(char) * (L+1));

  for (z = 0; z < 16; z++) 
    {
      x1 = z / 4;
      x2 = z % 4;
      single[z] = rna->px[x1][x2];
    }
  for (z = 0; z < 256; z++)
    {
      x1 = z / 64;
      y1 = (z % 64) / 16;
      x2 = (z % 16) / 4;
      y2 = z % 4;
      pair[z] = rna->pxy[x1][y1][x2][y2];
    }

  totsc = 0.0;
  for (i = 0; i < L; i++)
    {
      if (ct[i] == -1)		/* singlet */
	{
	  z  = DChoose(single, 16);
	  x1 = z / 4;
	  x2 = z % 4;

	  s1[i] = DNAAlphabet[x1];
	  s2[i] = DNAAlphabet[x2];
	  
	  totsc += log(rna->px[x1][x2]);

	}
      else if (ct[i] > i)	/* doublet that we haven't done yet */
	{
	  j = ct[i];
	  z = DChoose(pair, 256);
	  x1 = z / 64;
	  y1 = (z % 64) / 16;
	  x2 = (z % 16) / 4;
	  y2 = z % 4;
	  
	  s1[i] = DNAAlphabet[x1];
	  s2[i] = DNAAlphabet[x2];
	  s1[j] = DNAAlphabet[y1];
	  s2[j] = DNAAlphabet[y2];

	  totsc += log(rna->pxy[x1][y1][x2][y2]);
	}
    }    


  /* specific correction for tRNAPhe parse tree
   * (-82.23)
   */
  totsc += 34.0 * log(rna->tsl);
  totsc += 21.0 * log(rna->tsp);
  totsc += 2.0  * log(rna->tss);
  totsc += 3.0  * log(rna->tse);

  /* printf("simulated RNAs should score at least %f\n", totsc); */
  s1[L] = '\0';
  s2[L] = '\0';
  *ret_s1 = s1;
  *ret_s2 = s2;
  return;
}



/* Function: Posterior()
 * Date:     SRE, Thu Jun 11 09:26:43 1998 [St. Louis]
 *
 * Purpose:  Calculate the posterior probability of a model.
 *           Done in log space for numerical stability reasons.
 *           (do the algebra)
 *
 * Args:     sc1     - log P(D|M) of the model under consideration
 *           sc2,sc3 - log P(D|M) of the other two models
 *
 * Returns:  Posterior P(M|D)
 */
double
Posterior(double sc1, double sc2, double sc3)
{
  double d[3];

  d[0] = 0.0;
  d[1] = sc2 - sc1;
  d[2] = sc3 - sc1;
  return 1.0 / exp(DLogSum(d, 3));
}



int
main(int argc, char **argv)
{
  char *seq1;
  char *seq2;
  char *khs;
  int   L;
  char *pamfile;
  char *blastpamfile;
  FILE *pamfp;
  int **pam;
  float scale;
  int    aacode[64];
  double codebias[64];
  double pammodel[64][64];
  double estmodel[20][64];
  double nullmodel[4][4];
  struct rna_s rnamodel;
  double nullsc, pamsc, rnasc;

  pamfile  = "BLOSUM62";

  /* Secondary structure of DF6280, yeast tRNA-phe
   */
  khs = ">>>>>>>..>>>>........<<<<.>>>>>.......<<<<<.....>>>>>.......<<<<<<<<<<<<...."; 
  khs = ">>>>>>>..>>>>>>....<<<<<<.>>>>>>>...<<<<<<<.....>>>>>>>...<<<<<<<<<<<<<<...."; 



  /* A pair of U1A hits
   */
  seq1 = "GTTCCCTGGCTTCAAGGAGGTCCGTCTGGTACCCGGGCGGCATGACATCGCCTTCGTGGA\
GTTTGACAATGAGGTACAGGCAGGGGCAGCTCGCGATGCCCTGCAGGGCTTTAAGATCAC\
GCAGAACAACGCCATGAAGATCTCCTTTGCCAAGAAGTAG"; 
  seq2 = "GTTCCCTGGCTTCAAGGAGGTGCGTCTGGTCCCTGGGCGCCATGACATCGCCTTCGTGGA\
GTTTGACAATGAAGTGCAGGCTGGGGCAGCACGAGATGCCCTGCAAGGCTTTAAGATCAC\
ACAAAACAATGCTATGAAGATCTCTTTTGCCAAGAAGTAG";
  L = strlen(seq1);

  /* Synthetic random sequence
   */
  /*
  seq1 = "GCTCGTGACACTGTCCCAGGCCTTATGTCCATGACCTTCCATAGGCTAGGTATGATGGTTTAGGAATATCCTAGGTACTGTGTTCTTTGGGCGCATCAAC";
  seq2 = "GCTCGTGACACTGTCCCAGGCCTTATGTCCATGACCTTCCATAGGCTAGGTATGATGGTTTAGGAATATCCTAGGTACTGTGTTCTTTGGGCGCATCAAC";
  L = strlen(seq1);
  */

  /* yeast tRNA-phe vs. Barley tRNA-Phe emb|X02683|HVTRNPHE 
  
  seq1 = "GCGGATTTAGCTCAGTTGGGAGAGCGCCAGACTGAAGATCTGGAGGTCCTGTGTTCGATCCACAGAATTCGCACCA";
  seq2 = "GCGGGGATAGCTCAGTTGGGAGAGCGTCAGACTGAAGATCTGAAGGTCGCGTGTTCGATCCACGCTCACCGCACCA"; 
  L = strlen(seq1);

  seq1 = "GGGGGGGGGGTTCGCCCCCCCCCC";
  seq2 = "AAAAAAAAAATTCGTTTTTTTTTT";
  L = strlen(seq1);
 */

  /* C. elegans R144.rna1 SRP-RNA vs. F44A2
  seq1= "AAATGATAAGTGGTCTGAAGGTATCGAGAAGGCTGTGTATGTGTGTCAACTTACAAGCGC\
GGTCCGGCCACTGCACCACGCGGTCGCTTTTGTCTGCTACCTTTCGGTCCTGAACCATCC\
ACGACTCCTTATGCACCCGGGACTAACCGCCTTCCGACAGTTATCCCCTGGTGACGTAGC\
GCTTGGCACGGACGCTCGGTCGACGTAGACGACGACATACACAGGACTCCTCCTCTCACT\
CCATCCACACTTATCTACCGTCAAGAAGCTGACTCACAAGCGCCCGCCACGACGCTCGGT\
GG";
  seq2 = "AATTGATAAGTGGTCTAAAGGTATCAGAGAGGCAGTGGGTGAGCGACAACTCGCAAGCGC\
TATCCGGCCACTGCACCGCGCGGAGGCTTTTGCCTGCTGCCTTTCGGCCCTGAGCCGTAT\
ACCGCTCCTTATGCGCCCGGGACCGCCCGCCTTCCGCCAGATGACCCCTAGCGACGCAGC\
GCTTGGCACGGACGCTCGGTCGCCGTTGCCAGCAACACCCAAAGGGCTCCTCCCTTCACT\
CCATCCACACTTATCTACCGCCAAGAAGCTGGCTTACAGACGCCCGCCACGACGCCCGGT\
GG";
  L = strlen(seq1);
   */



  /* Load a PAM substitution matrix
   */
  blastpamfile = FileConcat("aa", pamfile);
  if ((pamfp = fopen(pamfile, "r")) == NULL &&
      (pamfp = EnvFileOpen(pamfile, "BLASTMAT")) == NULL &&
      (pamfp = EnvFileOpen(blastpamfile, "BLASTMAT")) == NULL)
    Die("Failed to open PAM scoring matrix file %s", pamfile);
  if (! ParsePAMFile(pamfp, &pam, &scale))
    Die("Failed to parse PAM file");
  fclose(pamfp);
  free(blastpamfile);

  /* Calculate the models
   */
  DefaultGeneticCode(aacode);
  DefaultCodonBias(codebias);
  ConstructESTModel(aacode, codebias, estmodel);
  ConstructPAMModel(pam, scale, aafq, estmodel, pammodel);
  ConstructNullModel(pammodel, nullmodel);
  ConstructRNAModel(nullmodel, &rnamodel);

  /* CheckNullModel(nullmodel); */
  /* CheckPAMModel(pammodel); */
  /* PrintPAMModel(pammodel, nullmodel);   */
  /* PrintRNAModel(&rnamodel, nullmodel);   */
  
   /*
  L = strlen(khs);
  KHS2ct(khs, L, FALSE, &ct);
  */

  /* L = 75; */

  nullsc = ScoreWithNull(seq1, seq2, L, nullmodel);
  pamsc  = ScoreWithPAM(seq1, seq2, L, pammodel);
  rnasc  = ScoreWithRNA(seq1, seq2, L, &rnamodel);

  printf("null= %12f  pam= %12f  rna= %12f ", nullsc, pamsc, rnasc);
  if (nullsc >= pamsc && nullsc >= rnasc)
    printf("posterior= %12f   winner= NULL\n", Posterior(nullsc, pamsc, rnasc));
  if (pamsc >= nullsc && pamsc >= rnasc)
    printf("posterior= %12f   winner= CODING\n", Posterior(pamsc, nullsc, rnasc));
  if (rnasc >= nullsc && rnasc >= pamsc)
    printf("posterior= %12f   winner= RNA\n", Posterior(rnasc, nullsc, pamsc));

#ifdef SRE_REMOVED
  for (i = 0; i < 100; i++)
    {
      SimulateRNASequences(&rnamodel, ct, L, &seq1, &seq2); 
      /* SimulateCodingSequences(L, pammodel, &seq1, &seq2);  */

      /* seq2 = SimulateNullSequence(seq1, L, nullmodel); */
      nullsc = ScoreWithNull(seq1, seq2, L, nullmodel);
      pamsc  = ScoreWithPAM(seq1, seq2, L, pammodel);
      rnasc  = ScoreWithRNA(seq1, seq2, L, &rnamodel);
      
      /* 
      printf("%s\n", seq1);
      printf("%s\n", seq2);
      printf("%s\n", khs);
	*/

      printf("null= %12f  pam= %12f  rna= %12f ", nullsc, pamsc, rnasc);

      if (nullsc >= pamsc && nullsc >= rnasc)
	printf("posterior= %12f   winner= NULL\n", Posterior(nullsc, pamsc, rnasc));
      if (pamsc >= nullsc && pamsc >= rnasc)
	printf("posterior= %12f   winner= CODING\n", Posterior(pamsc, nullsc, rnasc));
      if (rnasc >= nullsc && rnasc >= pamsc)
	printf("posterior= %12f   winner= RNA\n", Posterior(rnasc, nullsc, pamsc));
	
      free(seq1);
      free(seq2);
    }
#endif

  /* Cleanup
   */
  Free2DArray(pam,27);
  return EXIT_SUCCESS;
}

