Edinburgh Speech Tools  2.4-release
EST_Ngrammar.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* An EST_Ngram class for building and manipulating bigrams, trigrams, etc. */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <fstream>
42 #include <cstring>
43 #include <cmath>
44 #include <climits>
45 #include <cfloat>
46 
47 using namespace std;
48 
49 #include "EST_Ngrammar.h"
50 #include "EST_Pathname.h"
51 #include "EST_Token.h"
52 #include "EST_io_aux.h"
53 
54 
55 const EST_DiscreteProbDistribution PSTnullProbDistribution;
56 static EST_String NOVOCAB("NOVOCAB");
57 
58 // **********************************************************************
59 
60 EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState &s)
61 {
62  clear();
63  init(s.id(),s.pdf_const());
64 }
65 
66 EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState *const s)
67 {
68  clear();
69  init(s->id(),s->pdf_const()); // copy
70 }
71 
72 EST_NgrammarState::~EST_NgrammarState()
73 {
74  clear();
75 }
76 
77 void EST_NgrammarState::clear()
78 {
79  p_id = -1;
80  p_pdf.clear();
81 }
82 
83 void EST_NgrammarState::init()
84 {
85  p_id=-1;
86  p_pdf.init();
87 }
88 
89 void EST_NgrammarState::init(int id,EST_Discrete *d)
90 {
91  p_id = id;
92  p_pdf.init(d);
93 }
94 
95 
96 void EST_NgrammarState::init(int id, const EST_DiscreteProbDistribution &pdf)
97 {
98  p_id = id;
99  p_pdf = pdf; // copy it
100 }
101 
102 
103 ostream& operator<<(ostream& s, const EST_NgrammarState &a)
104 {
105  s << "(" << a.id() << ": " << a.pdf_const() << " )";
106  return s;
107 }
108 
109 // **********************************************************************
110 
111 EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
112 {
113  p_pdf.clear();
114  children.clear();
115 }
116 
117 
118 void EST_BackoffNgrammarState::clear()
119 {
120  backoff_weight=0;
121  p_pdf.clear();
122 }
123 
124 void EST_BackoffNgrammarState::init()
125 {
126  backoff_weight=0;
127  p_pdf.init();
128 }
129 
130 void EST_BackoffNgrammarState::init(const EST_Discrete *d,int level)
131 {
132  backoff_weight=0;
133  p_level = level;
134  p_pdf.init(d);
135 }
136 
137 void EST_BackoffNgrammarState::init(const EST_DiscreteProbDistribution &pdf, int level)
138 {
139  backoff_weight=0;
140  p_level = level;
141  p_pdf = pdf; // copy it
142 }
143 
144 bool EST_BackoffNgrammarState::accumulate(const EST_StrVector &words,
145  const double count)
146 {
147 
148 // int i;
149 // cerr << "accumulate ";
150 // for(i=0;i<words.n();i++)
151 // {
152 // if(words.n()-1-p_level == i)
153 // cerr <<"*";
154 // cerr << words(i);
155 // if(words.n()-1-p_level == i)
156 // cerr <<"*";
157 // }
158 // cerr << endl;
159 
160  // update pdf at this node
161  p_pdf.cumulate(words(words.n()-1-p_level),count);
162 
163  // then go down
164  if (words.n()-1-p_level > 0) // not at bottom of tree
165  {
166 
167  EST_BackoffNgrammarState *s;
168 
169  // have we got the child
170  s = get_child(words(words.n()-1-p_level));
171  if (s==NULL)
172  // have to extend the tree
173  s = add_child(p_pdf.get_discrete(),words);
174  return s->accumulate(words,count);
175 
176  }
177  else
178  {
179  return true;
180  }
181 }
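// Editorial note (not in the original source): a worked example of the
// recursion above.  Accumulating the trigram ("a","b","c") with count 1
// bumps "c" in the root pdf (level 0, unigram counts), recurses into the
// child keyed "c" (level 1) where it bumps "b", then into that node's
// child keyed "b" (level 2) where it bumps "a", creating missing nodes
// via add_child() on the way down.  A minimal sketch, assuming an
// initialised level-0 state 'root':
//
//     EST_StrVector w;
//     w.resize(3);
//     w[0] = "a"; w[1] = "b"; w[2] = "c";
//     root.accumulate(w, 1.0);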
182 
183 bool EST_BackoffNgrammarState::accumulate(const EST_IVector &words,
184  const double count)
185 {
186 
187 // int i;
188 // cerr << "accumulate level " << p_level << " : ";
189 // for(i=0;i<words.n();i++)
190 // {
191 // if(words.n()-1-p_level == i)
192 // cerr <<"*";
193 // cerr << p_pdf.item_name(words(i));
194 // if(words.n()-1-p_level == i)
195 // cerr <<"*";
196 // cerr << " ";
197 // }
198 // cerr << endl;
199 
200  // update pdf at this node
201  p_pdf.cumulate(words(words.n()-1-p_level),count);
202 
203  //cerr << *this << endl;
204  //cerr << endl;
205 
206  // then go down
207  if (words.n()-1-p_level > 0) // not at bottom of tree
208  {
209 
210  EST_BackoffNgrammarState *s;
211 
212  // have we got the child
213  s = get_child(words(words.n()-1-p_level));
214  if (s==NULL)
215  // have to extend the tree
216  s = add_child(p_pdf.get_discrete(),words);
217 
218  // get pointer again in case we built more than one level
219  s = get_child(words(words.n()-1-p_level));
220  if (s==NULL)
221  {
222  cerr << "Failed to extend tree - unknown reason !" << endl;
223  return false;
224  }
225  return s->accumulate(words,count);
226  }
227  else
228  {
229  return true;
230  }
231 }
232 
233 
234 EST_BackoffNgrammarState*
235 EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
236  const EST_StrVector &words)
237 {
238  EST_BackoffNgrammarState *s;
239 
240  if (words.n()-1-p_level > 0) // more history still to go
241  {
242  // see if we can go down the tree
243  s = get_child(words(words.n()-1-p_level));
244  if (s != NULL)
245  return s->add_child(d,words);
246  else
247  {
248  // construct tree as we go
249  EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
250  new_child->init(d,p_level+1);
251  children.add(words(words.n()-1-p_level), (void*)new_child);
252  return new_child->add_child(d,words);
253  }
254  }
255  else
256  {
257  // this is the node we are trying to add !
258  return this;
259  }
260 }
261 
262 
263 
264 EST_BackoffNgrammarState*
265 EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
266  const EST_IVector &words)
267 {
268  EST_BackoffNgrammarState *s;
269 
270  if (words.n()-1-p_level > 0) // more history still to go
271  {
272  // see if we can go down the tree
273  s = get_child(words(words.n()-1-p_level));
274  if (s != NULL)
275  return s->add_child(d,words);
276  else
277  {
278  // construct tree as we go
279  EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
280  new_child->init(d,p_level+1);
281  children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
282  return new_child->add_child(d,words);
283  }
284  }
285  else
286  {
287  // this is the node we are trying to add !
288  return this;
289  }
290 }
291 
292 void EST_BackoffNgrammarState::remove_child(EST_BackoffNgrammarState *child,
293  const EST_String &name)
294 {
295 
296  child->zap();
297  // can't remove from StringTrie, but can set pointer to NULL
298  children.add(name,NULL);
299  delete child;
300 }
301 
302 void EST_BackoffNgrammarState::print_freqs(ostream &os,
303  const int order,
304  EST_String followers)
305 {
306  // not right - just print out, then recurse through children
307  // change to use 'backoff_traverse'
308 
309  EST_Litem *k;
310  double freq;
311  EST_String name;
312  for (k=p_pdf.item_start();
313  !p_pdf.item_end(k);
314  k = p_pdf.item_next(k))
315  {
316  p_pdf.item_freq(k,name,freq);
317  EST_BackoffNgrammarState *s = ((EST_BackoffNgrammarState*)(children.lookup(name)));
318  if (p_level==order-1)
319  {
320  if(freq>0)
321  os << name << " " << followers
322  << ": " << freq << endl;
323  }
324  else if (s!=NULL)
325  s->print_freqs(os,order,name+" "+followers);
326 
327  }
328 }
329 
330 bool EST_BackoffNgrammarState::ngram_exists(const EST_StrVector &words,
331  const double threshold) const
332 {
333  const EST_BackoffNgrammarState *s;
334  s = get_state(words);
335  if (s != NULL)
336  {
337  // return true for unigrams (state level 0)
338  // even if freq < threshold
339  return (bool)((s->level()==0) ||
340  ( s->frequency(words(0)) > threshold ));
341  }
342  else
343  return false;
344 }
345 
346 const EST_BackoffNgrammarState *const
347 EST_BackoffNgrammarState::get_state(const EST_StrVector &words) const
348 {
349  const EST_BackoffNgrammarState *s;
350  if (words.n()-1-p_level > 0) // more history still to go
351  {
352  s = get_child(words(words.n()-1-p_level));
353  if (s != NULL)
354  {
355  //cerr << "traversing from " << *this << endl;
356  //cerr << " to " << *s << endl << endl;
357  return s->get_state(words);
358  }
359  else
360  {
361  //cerr << "got NULL" << endl;
362  return NULL;
363  }
364  }
365  else
366  {
367  //cerr << "got " << *this << endl;
368  return this;
369  }
370 }
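// Editorial note (not in the original source): get_state() walks the
// same reversed path that accumulate() builds, so for ("a","b","c") it
// descends root -> child "c" -> child "b" and returns that level-2
// node; ngram_exists() above then reads the trigram's count there as
// the frequency of words(0), i.e. of "a".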
371 
372 void EST_BackoffNgrammarState::zap()
373 {
374 
375  // recursively delete this state and all its children
376  EST_Litem *k;
377  double freq;
378  EST_String name;
379  for (k=p_pdf.item_start();
380  !p_pdf.item_end(k);
381  k = p_pdf.item_next(k))
382  {
383  p_pdf.item_freq(k,name,freq);
384  EST_BackoffNgrammarState *child = get_child(name);
385  if (child!=NULL)
386  remove_child(child,name);
387  }
388 
389  children.clear();
390  p_pdf.clear();
391 
392 }
393 
394 
395 const double EST_BackoffNgrammarState::get_backoff_weight(const EST_StrVector &words) const
396 {
397  const EST_BackoffNgrammarState *s;
398  if (words.n()-1-p_level >= 0)
399  {
400  s = get_child(words(words.n()-1-p_level));
401  if (s != NULL)
402  return s->get_backoff_weight(words);
403  else
404  {
405  // if there is no node, the weight would have been 1 anyway
406  //cerr << "Couldn't get weight for " << words << endl;
407  return 1; // default
408  }
409  }
410 
411  else
412  {
413  // reached node
414 
415 /*
416  cerr << "got bw for ";
417  for(int i=0;i<words.n();i++)
418  cerr << words(i) << " ";
419  cerr << " at level " << p_level << " = " << backoff_weight << endl;
420 */
421  return backoff_weight;
422  }
423 }
424 
425 
426 bool EST_BackoffNgrammarState::set_backoff_weight(const EST_StrVector &words, const double w)
427 {
428  EST_BackoffNgrammarState *s;
429  if (words.n()-1-p_level >= 0)
430  {
431  s = get_child(words(words.n()-1-p_level));
432  if (s != NULL)
433  return s->set_backoff_weight(words,w);
434  else
435  {
436  // we can't set the weight because the node
437  // doesn't exist - this is an error if the weight
438  // is not 1
439  if (w != 1)
440  {
441  cerr << "Couldn't set weight for " << words
442  << " to " << w << endl;
443  return false;
444  }
445  else
446  return true; // a weight of 1 does not need storing
447  }
448  }
449  else
450  {
451  // reached node
452  backoff_weight = w;
453  return true;
454  }
455 }
456 
457 void EST_BackoffNgrammarState::frequency_of_frequencies(EST_DVector &ff)
458 {
459  int max=ff.n();
460  EST_Litem *k;
461  double freq;
462  EST_String name;
463  for (k=p_pdf.item_start();
464  !p_pdf.item_end(k);
465  k = p_pdf.item_next(k))
466  {
467  p_pdf.item_freq(k,name,freq);
468  if(freq < max)
469  ff[(int)(freq+0.5)] += 1;
470  }
471 }
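// Editorial note (not in the original source): after this call, ff[r]
// holds the number of distinct predicted words at this node whose
// (rounded) count is r, for r < ff.n().  This "frequency of
// frequencies" table is what Good_Turing_discount() consumes in
// compute_backoff_weights() below.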
472 
473 ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a)
474 {
475  s << "(backoff level:" << a.p_level
476  << " weight:" << a.backoff_weight << " " << a.pdf_const() << " )";
477 
478  return s;
479 }
480 
481 // **********************************************************************
482 
483 void EST_Ngrammar::default_values()
484 {
485  p_representation = EST_Ngrammar::sparse;
486  p_entry_type = EST_Ngrammar::frequencies; // by default
487  sparse_representation.clear();
488  allow_oov=false;
489  p_num_samples = 0;
490  p_num_states = 0;
491  p_states = 0;
492  vocab = 0;
493  pred_vocab = 0;
494  backoff_threshold = 1.0;
495  backoff_unigram_floor_freq = 0.0;
496 }
497 
498 EST_Ngrammar::~EST_Ngrammar()
499 {
500  delete [] p_states;
501 }
502 
503 void EST_Ngrammar::clear()
504 {
505  // to do
506 }
507 
508 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
509  const EST_StrList &wordlist)
510 {
511  return (bool)(init_vocab(wordlist) && p_init(o,r));
512 }
513 
514 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
515  const EST_StrList &wordlist,
516  const EST_StrList &predlist)
517 {
518  return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
519 }
520 
521 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
522  EST_Discrete &v)
523 {
524  vocab = &v;
525  pred_vocab = vocab;
526  vocab_pdf.init(pred_vocab);
527  return p_init(o,r);
528 }
529 
530 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
531  EST_Discrete &v, EST_Discrete &pv)
532 {
533  vocab = &v;
534  pred_vocab = &pv;
535  vocab_pdf.init(pred_vocab);
536  return p_init(o,r);
537 }
538 
539 bool EST_Ngrammar::p_init(int o, EST_Ngrammar::representation_t r)
540 {
541  if (o <= 0)
542  {
543  cerr << "EST_Ngrammar order must be > 0" << endl;
544  return false;
545  }
546 
547  p_order = o;
548  p_representation = r;
549  p_number_of_sentences = 0;
550 
551  switch(p_representation)
552  {
553 
554  case EST_Ngrammar::sparse:
555  sparse_representation.init(p_order);
556  return true;
557  break;
558 
559  case EST_Ngrammar::dense:
560  return init_dense_representation();
561  break;
562 
563  case EST_Ngrammar::backoff:
564  return init_backoff_representation();
565  break;
566 
567  default:
568  cerr << "Unknown internal representation requested for EST_Ngrammar"
569  << endl;
570  return false;
571  break;
572  }
573 }
574 
575 bool EST_Ngrammar::init_dense_representation()
576 {
577  // allocate a flattened N-dimensional matrix of states
578  int i;
579 
580  if (vocab->length() <= 0)
581  {
582  cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
583  << endl;
584  return false;
585  }
586 
587  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
588  p_states = new EST_NgrammarState[p_num_states];
589  for (i=0; i < p_num_states; i++)
590  p_states[i].init(i,pred_vocab);
591 
592  return true;
593 }
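// Editorial note (not in the original source): the dense representation
// allocates vocab^(order-1) states up front, e.g. a trigram over a
// 1000-word vocabulary needs 1000^2 = 1,000,000 states, so it is only
// practical for small vocabularies or low orders; larger models are
// better served by the backoff representation.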
594 
595 bool EST_Ngrammar::init_sparse_representation()
596 {
597 
598  if (vocab->length() <= 0)
599  {
600  cerr << "EST_Ngrammar: sparse_representation requires explicit vocab"
601  << endl;
602  return false;
603  }
604 
605  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
606  p_states = new EST_NgrammarState[p_num_states];
607 
608  return (bool)(p_states != NULL);
609 }
610 
611 bool EST_Ngrammar::init_backoff_representation()
612 {
613 
614  // nothing much to do
615  backoff_representation = new EST_BackoffNgrammarState;
616  backoff_representation->init(vocab,0);
617  return true;
618 }
619 
620 
621 const EST_StrVector &EST_Ngrammar::make_ngram_from_index(const int index) const
622 {
623  int i,ind=index;
624  EST_StrVector *ngram = new EST_StrVector;
625  ngram->resize(p_order-1); // exclude last word
626 
627  // matches 'find_dense_state_index' so first word is most significant
628  // also, cannot fill in last word
629 
630  for(i=p_order-2;i>=0;i--)
631  {
632 #if defined(sun) && ! defined(__svr4__)
633 /* SunOS */
634  int rem = ind%vocab->length();
635  int quot = ind/vocab->length();
636  (*ngram)[i] = wordlist_index(rem);
637  ind = quot;
638 #else
639  div_t d = div(ind,vocab->length());
640  (*ngram)[i] = wordlist_index(d.rem);
641  ind = d.quot;
642 #endif
643  }
644 
645  return *ngram;
646 }
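// Editorial note (not in the original source): this is the inverse of
// find_dense_state_index() below; e.g. with vocab length 4 and p_order 3,
// index 11 decodes to the history word indices (11/4, 11%4) = (2,3).
// Note also that the vector is allocated with new and never freed by
// this class, so a caller that drops the reference leaks it.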
647 
648 
649 
650 
651 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list)
652 {
653  vocab = new EST_Discrete();
654  if(!vocab->init(word_list))
655  return false;
656 
657  pred_vocab = vocab; // same thing in this case
658  vocab_pdf.init(pred_vocab);
659 
660  return (bool)(vocab != NULL);
661 }
662 
663 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list,
664  const EST_StrList &pred_list)
665 {
666  vocab = new EST_Discrete();
667  if(!vocab->init(word_list))
668  return false;
669  pred_vocab = new EST_Discrete();
670  if(!pred_vocab->init(pred_list))
671  return false;
672  vocab_pdf.init(pred_vocab);
673 
674  return (bool)(vocab != NULL);
675 }
676 
677 bool EST_Ngrammar::check_vocab(const EST_StrList &word_list)
678 {
679  EST_Discrete *comp_vocab = new EST_Discrete();
680 
681  if(!comp_vocab->init(word_list))
682  {
683  delete comp_vocab;
684  return false;
685  }
686 
687  if(*vocab != *comp_vocab)
688  {
689  delete comp_vocab;
690  return false;
691  }
692 
693  delete comp_vocab;
694  return true;
695 }
696 
697 const EST_String & EST_Ngrammar::wordlist_index(int i) const
698 {
699  return vocab->name(i);
700 }
701 
702 int EST_Ngrammar::predlist_index(const EST_String &word) const
703 {
704 
705  if (word=="") // can't lookup
706  return -1;
707 
708  int i;
709  i = pred_vocab->index(word);
710  if(i >= 0 )
711  return i;
712 
713  cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;
714 
715  if (allow_oov)
716  {
717  i = pred_vocab->index(OOV_MARKER);
718  if(i >= 0)
719  return i;
720 
721  cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
722  }
723 
724  return -1;
725 }
726 
727 
728 const EST_String & EST_Ngrammar::predlist_index(int i) const
729 {
730  return pred_vocab->name(i);
731 }
732 
733 int EST_Ngrammar::wordlist_index(const EST_String &word, const bool report) const
734 {
735 
736  if (word=="") // can't lookup
737  return -1;
738 
739  int i;
740  i = vocab->index(word);
741  if(i >= 0 )
742  return i;
743 
744  if(report)
745  cerr << "Word \"" << word << "\" is not in the word list" << endl;
746 
747  if (allow_oov)
748  {
749  i = vocab->index(OOV_MARKER);
750  if(i >= 0)
751  return i;
752  if(report)
753  cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
754  }
755 
756  return -1;
757 }
758 
759 bool EST_Ngrammar::build(const EST_StrList &filenames,
760  const EST_String &prev,
761  const EST_String &prev_prev,
762  const EST_String &last,
763  const EST_String &input_format,
764  const EST_String &oov_mode,
765  const int mincount,
766  const int maxcount)
767 {
768 
769  p_sentence_start_marker = prev;
770  p_sentence_end_marker = last;
771 
772 
773  // when backing off, safest modes ...
774  if( (p_representation == EST_Ngrammar::backoff) &&
775  (oov_mode != "skip_file") &&
776  (oov_mode != "skip_sentence"))
777  cerr << "Warning : building a backoff grammar" << endl
778  << " with oov_mode '" << oov_mode
779  << "' is not recommended !" << endl;
780 
781  if( (oov_mode != "skip_ngram") &&
782  (oov_mode != "skip_sentence") &&
783  (oov_mode != "skip_file") &&
784  (oov_mode != "use_oov_marker") )
785  {
786  cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
787  return false;
788  }
789 
790  if( (oov_mode == "skip_sentence") &&
791  (input_format == "ngram_per_line"))
792  {
793  cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
794  << " select oov_mode 'skip_sentence'" << endl;
795  return false;
796  }
797 
798  if(oov_mode == "use_oov_marker")
799  allow_oov = true;
800  else
801  allow_oov = false;
802 
803  bool skip_this;
804  EST_String new_filename;
805  EST_Litem *p;
806  for (p = filenames.head(); p; p = p->next())
807  {
808  cerr << "Building from " << filenames(p) << endl;
809 
810  skip_this=false;
811  if( ((oov_mode == "skip_sentence") &&
812  (input_format == "sentence_per_file")) ||
813  (oov_mode == "skip_file") )
814  skip_this = oov_preprocess(filenames(p),new_filename,
815  "skip if found");
816 
817  else if( ((oov_mode == "skip_sentence") &&
818  (input_format == "sentence_per_line")) ||
819  ((oov_mode == "skip_ngram") &&
820  (input_format == "ngram_per_line")) )
821  oov_preprocess(filenames(p),new_filename,"eliminate lines");
822 
823  else
824  new_filename = filenames(p);
825 
826 
827  if(skip_this)
828  {
829  cerr << "Skipping " << filenames(p)
830  << " (out of vocabulary words found)" << endl;
831  }
832  else
833  {
834 
835  switch(p_representation)
836  {
837  case EST_Ngrammar::sparse:
838  {
839  if (input_format != "")
840  {
841  cerr << "Can't build sparse ngram from '" << input_format;
842  cerr << "' data" << endl;
843  return false;
844  }
845  else if (!build_sparse(new_filename,prev,prev_prev,last))
846  return false;
847  }
848  break;
849 
850  case EST_Ngrammar::dense:
851  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
852  return false;
853  break;
854 
855  case EST_Ngrammar::backoff:
856  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
857  return false;
858  break;
859 
860  default:
861  cerr << "Unknown internal representation set for EST_Ngrammar"
862  << endl;
863  return false;
864  break;
865  }
866  }
867 
868  if((new_filename != filenames(p)) &&
869  (new_filename != "") &&
870  !delete_file(new_filename) )
871  {
872  cerr << "Warning : couldn't remove temporary file : "
873  << new_filename << endl;
874  }
875 
876  } // loop round files
877 
878  if (p_representation == EST_Ngrammar::backoff)
879  return compute_backoff_weights(mincount,maxcount);
880 
881  return true;
882 }
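// Editorial sketch (not in the original source): typical use of build()
// for a dense trigram.  "corpus.txt" and the marker strings are
// stand-ins, and the vocabulary must include the markers:
//
//     EST_StrList words;
//     words.append("<s>");  words.append("</s>");
//     words.append("a");    words.append("b");
//     EST_Ngrammar ng;
//     ng.init(3, EST_Ngrammar::dense, words);
//     EST_StrList files;
//     files.append("corpus.txt");
//     ng.build(files, "<s>", "<s>", "</s>",
//              "sentence_per_line", "skip_ngram",
//              1, 10);    // mincount/maxcount only matter for backoff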
883 
884 void EST_Ngrammar::accumulate(const EST_StrVector &words,
885  const double count)
886 {
887  if (words.n() < p_order)
888  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
889  else
890  {
891  p_num_samples++;
892  const EST_String &w = lastword(words);
893  vocab_pdf.cumulate(w,count);
894 
895  switch(p_representation)
896  {
897  case EST_Ngrammar::dense:
898  case EST_Ngrammar::sparse:
899  find_state(words).cumulate(w,count);
900  break;
901 
902  case EST_Ngrammar::backoff:
903  backoff_representation->accumulate(words,count);
904  break;
905 
906  default:
907  cerr << "EST_Ngrammar::accumulate : invalid representation !"
908  << endl;
909  break;
910  } // switch
911  }
912 }
913 
914 void EST_Ngrammar::accumulate(const EST_IVector &words,
915  const double count)
916 {
917 
918  /*
919  int i;
920  for(i=0;i<words.n();i++)
921  {
922  cerr << vocab_pdf.item_name(words(i));
923  cerr << " ";
924  }
925  cerr << endl;
926  */
927 
928  if (words.n() < p_order)
929  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
930  else
931  {
932  p_num_samples++;
933  vocab_pdf.cumulate(words(p_order-1),count);
934 
935  switch(p_representation)
936  {
937 
938  case EST_Ngrammar::dense:
939  case EST_Ngrammar::sparse:
940  find_state(words).cumulate(words(p_order-1),count);
941  break;
942 
943  case EST_Ngrammar::backoff:
944  backoff_representation->accumulate(words,count);
945  break;
946 
947  default:
948  cerr << "EST_Ngrammar::accumulate : invalid representation !"
949  << endl;
950  break;
951  } // switch
952  }
953 }
954 
955 
956 bool EST_Ngrammar::ngram_exists(const EST_StrVector &words) const
957 {
958  switch(p_representation)
959  {
960  case EST_Ngrammar::sparse:
961  return false;
962  break;
963 
964  case EST_Ngrammar::dense:
965  return true; // always exists !
966  break;
967 
968  case EST_Ngrammar::backoff:
969  {
970  if(words.n()==1)
971  return backoff_representation->ngram_exists(words,0);
972  else
973  return backoff_representation->ngram_exists(words,backoff_threshold);
974  }
975  break;
976 
977  default:
978  cerr << "ngram_exists: unknown ngrammar representation" << endl;
979  break;
980  }
981  return false;
982 }
983 
984 bool EST_Ngrammar::ngram_exists(const EST_StrVector &words, const double threshold) const
985 {
986  if (p_representation != EST_Ngrammar::backoff)
987  {
988  cerr << "Not a backoff grammar !" << endl;
989  return false;
990  }
991 
992  return backoff_representation->ngram_exists(words,threshold);
993 
994 }
995 
996 
997 const double EST_Ngrammar::get_backoff_weight(const EST_StrVector &words) const
998 {
999  if(p_representation == EST_Ngrammar::backoff)
1000  return backoff_representation->get_backoff_weight(words);
1001  else
1002  {
1003  cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
1004  return 0;
1005  }
1006 }
1007 
1008 bool EST_Ngrammar::set_backoff_weight(const EST_StrVector &words, const double w)
1009 {
1010  if(p_representation == EST_Ngrammar::backoff)
1011  return backoff_representation->set_backoff_weight(words,w);
1012  else
1013  {
1014  cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
1015  return false;
1016  }
1017 }
1018 
1019 
1020 bool EST_Ngrammar::build_sparse(const EST_String &filename,
1021  const EST_String &prev,
1022  const EST_String &prev_prev,
1023  const EST_String &last)
1024 {
1025  sparse_representation.build(filename,prev,prev_prev,last);
1026  return true;
1027 }
1028 
1029 
1030 void EST_Ngrammar::fill_window_start(EST_IVector &window,
1031  const EST_String &prev,
1032  const EST_String &prev_prev) const
1033 {
1034  int i;
1035 
1036  for (i=0; i<window.n()-1; i++)
1037  window[i] = wordlist_index(prev_prev);
1038  window[i++] = wordlist_index(prev);
1039 }
1040 
1041 void EST_Ngrammar::fill_window_start(EST_StrVector &window,
1042  const EST_String &prev,
1043  const EST_String &prev_prev) const
1044 {
1045  int i;
1046 
1047  for (i=0; i<window.n()-1; i++)
1048  window[i] = prev_prev;
1049  window[i++] = prev;
1050 }
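// Editorial note (not in the original source): for p_order 3, prev "<s>"
// and prev_prev "<ss>", the window is primed as [<ss>, <ss>, <s>]; after
// the first slide in build_ngram() the first real word w1 gives
// [<ss>, <s>, w1], the first ngram that is actually accumulated.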
1051 
1052 bool EST_Ngrammar::oov_preprocess(const EST_String &filename,
1053  EST_String &new_filename,
1054  const EST_String &what)
1055 {
1056  ostream *ost=0;
1057  EST_TokenStream ts;
1058  new_filename="";
1059  int bad_line_count=0;
1060  int good_line_count=0;
1061 
1062  // if filename is stdin we have to make a temporary file
1063  // if we are eliminating lines we also need a temporary file
1064 
1065  // what = "skip if found" | "eliminate lines"
1066 
1067  bool write_out = false;
1068  if( (what == "eliminate lines") || (filename == "-") )
1069  write_out = true;
1070 
1071  // open the original files for reading
1072  if (filename == "-")
1073  {
1074  if( ts.open(stdin, FALSE) == -1)
1075  {
1076  cerr << "EST_Ngrammar:: failed to open stdin";
1077  cerr << " for reading" << endl;
1078  return false;
1079  }
1080  }
1081  else if (ts.open(filename) == -1){
1082  cerr << "EST_Ngrammar: failed to open file \"" << filename
1083  << "\" for reading" << endl;
1084  return false; // hmmm ?
1085  }
1086 
1087  // open the output file if necessary
1088  if(write_out)
1089  {
1090  new_filename = make_tmp_filename();
1091  ost = new ofstream(new_filename);
1092 
1093  if(!(*ost))
1094  {
1095  cerr << "Ngrammar: couldn't create temporary file \""
1096  << new_filename << "\"" << endl;
1097  new_filename="";
1098  return false;
1099  }
1100  }
1101  else
1102  new_filename = filename;
1103 
1104  EST_String s,this_line;
1105  bool bad_line=false;
1106  while (!ts.eof())
1107  {
1108  s=ts.get().string();
1109 
1110  if (!bad_line && (s != ""))
1111  {
1112  if(wordlist_index(s,false) < 0)
1113  {
1114 
1115  if(what == "eliminate lines")
1116  {
1117  bad_line=true;
1118  }
1119  else // skip file
1120  {
1121  if(write_out)
1122  {
1123  // input was stdin, but we are now discarding it
1124  delete ost;
1125  if(!delete_file(new_filename))
1126  cerr << "Warning : couldn't delete temporary file '"
1127  << new_filename << "'" << endl;
1128  }
1129  new_filename="";
1130  return false;
1131  }
1132 
1133  }
1134  else
1135  this_line += s + " ";
1136  }
1137 
1138  // write out
1139  if(ts.eoln())
1140  {
1141  if(bad_line)
1142  {
1143  bad_line_count++;
1144  }
1145  else
1146  {
1147  if(write_out)
1148  {
1149  // this_line
1150  *ost << this_line << endl;
1151  good_line_count++;
1152  }
1153  }
1154  bad_line=false;
1155  this_line = "";
1156  }
1157 
1158  }
1159  cerr << "skipped " << bad_line_count << " and kept "
1160  << good_line_count << " lines from file " << filename << endl;
1161  return true;
1162 }
1163 
1164 bool EST_Ngrammar::build_ngram(const EST_String &filename,
1165  const EST_String &prev,
1166  const EST_String &prev_prev,
1167  const EST_String &last,
1168  const EST_String &input_format)
1169 {
1170  p_entry_type = EST_Ngrammar::frequencies;
1171  int bad_word=0;
1172  EST_String s;
1173  EST_TokenStream ts;
1174  int eoln_is_eos = FALSE;
1175  int sliding_window = TRUE;
1176  int count=0;
1177  clear();
1178 
1179  if ( (input_format == "") || (input_format == "sentence_per_line") )
1180  {
1181  // may do something here later
1182  eoln_is_eos = TRUE;
1183  sliding_window = TRUE;
1184  }
1185  else if (input_format == "sentence_per_file")
1186  {
1187  eoln_is_eos = FALSE;
1188  sliding_window = TRUE;
1189  p_number_of_sentences = 1;
1190  }
1191  else if(input_format == "ngram_per_line")
1192  {
1193  eoln_is_eos = FALSE;
1194  sliding_window = FALSE;
1195  p_number_of_sentences = 1;
1196  }
1197  else
1198  {
1199  cerr << "Can't build from '" << input_format << "' data" << endl;
1200  return false;
1201  }
1202 
1203  EST_IVector window(p_order);
1204 
1205  if (filename == "-")
1206  {
1207  if( ts.open(stdin, FALSE) == -1)
1208  {
1209  cerr << "EST_Ngrammar:: failed to open stdin";
1210  cerr << " for reading" << endl;
1211  return false;
1212  }
1213  }
1214  else if (ts.open(filename) == -1){
1215  cerr << "EST_Ngrammar: failed to open \"" << filename
1216  << "\" for reading" << endl;
1217  return false;
1218  }
1219 
1220  // default input format is one sentence per line
1221  // full stops and commas etc. are taken literally
1222  // i.e. the user must do the preprocessing
1223 
1224  // we assume that all of prev,prev_prev,last are either
1225  // null or set, not a mixture of the two
1226 
1227  // at start window contains
1228  // [prev_prev, prev_prev, ....., prev_prev, prev]
1229  // This is not added to the ngram model though
1230  if(sliding_window)
1231  {
1232  fill_window_start(window,prev,prev_prev);
1233  if (window(p_order-1) == -1)
1234  bad_word = p_order;
1235  else if( (p_order>1) && (window(p_order-2) == -1))
1236  bad_word = p_order-1;
1237  else
1238  bad_word=0;
1239 
1240  if(bad_word > 0)
1241  cerr << "at start : bad word at " << bad_word << endl;
1242 
1243  }
1244  while (!ts.eof())
1245  {
1246  s=ts.get().string();
1247 
1248  if (s != "")
1249  {
1250  if(sliding_window)
1251  {
1252  slide(window,-1);
1253  if (bad_word > 0)
1254  bad_word--;
1255 
1256  window[p_order-1] = wordlist_index(s);
1257  if (window(p_order-1) < 0)
1258  {
1259  cerr << "EST_Ngrammar::build_ngram " <<
1260  " word \"" << s << "\" is not in vocabulary, skipping"
1261  << endl;
1262  bad_word = p_order;
1263  }
1264  if (bad_word == 0)
1265  accumulate(window);
1266  else
1267  {
1268  cerr << "not accumulating : bad word at " << bad_word;
1269  cerr << " window=" << window << endl;
1270  bad_word--;
1271  }
1272  }
1273  else
1274  {
1275  // not a sliding window - wait for end of line and accumulate
1276  if(count < p_order)
1277  {
1278  if(count == p_order-1) // last thing in window (predictee)
1279  window[count++] = predlist_index(s);
1280  else // not last thing (predictor)
1281  window[count++] = wordlist_index(s);
1282 
1283  if (window(count-1) < 0)
1284  {
1285  cerr << "EST_Ngrammar::build_ngram " <<
1286  " word \"" << s << "\" is not in vocabulary, skipping"
1287  << endl;
1288  bad_word = 1;
1289  }
1290  }
1291  else
1292  cerr << "Too many items on line - ignoring trailing ones !" << endl;
1293  }
1294  }
1295 
1296  // end of sentence ?
1297  if(ts.eoln())
1298  {
1299 
1300  if(!sliding_window)
1301  {
1302  if((count == p_order) && bad_word == 0)
1303  accumulate(window);
1304  count = 0;
1305  bad_word = 0;
1306  }
1307  else if (eoln_is_eos)
1308  {
1309  // have there been any accumulates since the last increment
1310  if (window(p_order-1) != wordlist_index(last))
1311  p_number_of_sentences += 1;
1312 
1313  slide(window,-1);
1314  window[p_order-1] = wordlist_index(last);
1315 
1316  if(window(p_order-1) == -1)
1317  {
1318  //cerr << "WARNING : skipping over bad word '"
1319  //<< last << "'" << endl;
1320  bad_word = p_order;
1321  }
1322 
1323  if (bad_word == 0)
1324  accumulate(window);
1325 
1326  fill_window_start(window,prev,prev_prev);
1327 
1328  // check for bad tags
1329  if (window(p_order-1) == -1)
1330  bad_word = p_order;
1331  else if( (p_order>1) && (window(p_order-2) == -1) )
1332  bad_word = p_order-1;
1333  else
1334  bad_word=0;
1335  }
1336  }
1337  }
1338 
1339  // if last accumulate was at end of sentence
1340  // we don't need to do the extra accumulate
1341  if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
1342  {
1343 
1344  // and finish with the ngram [.....last few words,last]
1345  slide(window,-1);
1346  window[p_order-1] = wordlist_index(last);
1347 
1348  if (window(p_order-1) == -1)
1349  bad_word=p_order;
1350 
1351  if (bad_word == 0)
1352  {
1353  accumulate(window);
1354  p_number_of_sentences += 1;
1355  }
1356  }
1357 
1358  ts.close();
1359 
1360  cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
1361  return true;
1362 }
1363 
1364 
1365 void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void*)
1366 {
1367  int i,j;
1368  double sum1=0,sum2=0;
1369 
1370 
1371  EST_StrVector new_ngram;
1372  new_ngram.resize(ngram.n()-1);
1373  for(i=0;i<new_ngram.n();i++)
1374  new_ngram[i] = ngram(i);
1375 
1376 
1377  cerr << "computing backoff w for ";
1378  for(i=0;i<new_ngram.n();i++)
1379  cerr << new_ngram(i) << " ";
1380  cerr << " \r";
1381 
1382  cerr << endl;
1383 
1384  // only have backoff weights if the associated n-1 gram exists
1385  if (!n->ngram_exists(new_ngram))
1386  {
1387  cerr << " NONE";
1388 
1389  // if ngram really exists, but was below threshold
1390  // set backoff weight to 1 (always back off)
1391  if (n->ngram_exists(new_ngram,0))
1392  {
1393  if(!n->set_backoff_weight(new_ngram,1))
1394  cerr << "WARNING : couldn't set weight !" << endl;
1395  cerr << " = 1";
1396  }
1397  cerr << endl;
1398  return;
1399  }
1400 
1401  // save
1402  EST_String tmp = ngram(ngram.n()-1);
1403 
1404  // for each possible word in the last position
1405  for(j=0;j<n->get_pred_vocab_length();j++)
1406  {
1407  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1408 
1409  for(i=0;i<ngram.n();i++)
1410  cerr << ngram(i) << " ";
1411 
1412  if (n->ngram_exists(ngram))
1413  {
1414  cerr << n->probability(ngram) << " exists " << endl;
1415  // if we have the complete ngram, add it to sum1
1416  sum1 += n->probability(ngram);
1417  }
1418  else
1419  {
1420 
1421  // make this faster - take out of loop
1422 
1423  // or add the n-1 gram, excluding the first word to sum2
1424  EST_StrVector tmp_ngram;
1425  tmp_ngram.resize(ngram.n()-1);
1426  for(i=0;i<tmp_ngram.n();i++)
1427  tmp_ngram[i] = ngram(i+1);
1428 
1429  cerr << " unseen, P(";
1430  for(i=0;i<tmp_ngram.n();i++)
1431  cerr << tmp_ngram(i) << " ";
1432 
1433  cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
1434  sum2 += n->probability(tmp_ngram);
1435  }
1436  }
1437 
1438  // and fill in the backoff weight
1439 
1440  if (sum2 == 0) // no unseen ngrams, no backing off
1441  {
1442  if(!n->set_backoff_weight(new_ngram,1))
1443  cerr << "WARNING : couldn't set weight !" << endl;
1444  cerr << 0 << endl;
1445  }
1446  else
1447  {
1448  if (sum1 > 1)
1449  {
1450  cerr << "NEGATIVE WEIGHT for ";
1451  for(i=0;i<new_ngram.n();i++)
1452  cerr << new_ngram(i) << " ";
1453  cerr << endl;
1454 
1455  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1456  cerr << " " << (1 - sum1) / sum2 << endl;
1457 
1458  for(j=0;j<n->get_pred_vocab_length();j++)
1459  {
1460  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1461 
1462 
1463  if (n->ngram_exists(ngram))
1464  {
1465 
1466  for(i=0;i<ngram.n();i++)
1467  cerr << ngram(i) << " ";
1468  cerr << " exists, prob = ";
1469  cerr << n->probability(ngram,false,true) << endl;
1470  }
1471 
1472  }
1473  sum1 = 1;
1474  sum2 = 1; // to get a weight of 0
1475  }
1476 
1477  if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
1478  cerr << "WARNING : couldn't set weight !" << endl;
1479 
1480  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1481  cerr << " weight=" << (1 - sum1) / sum2 << endl;
1482  }
1483 
1484  // restore
1485  ngram[ngram.n()-1] = tmp;
1486 
1487 }
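// Editorial summary (not in the original source) of the computation
// above, in Katz backoff terms: for a history h = w1..w_{n-1} and its
// shortened form h' = w2..w_{n-1},
//
//     bw(h) = (1 - sum of P(w|h) over words w whose ngram (h,w) is seen)
//             / (sum of P(w|h') over the remaining, unseen words)
//
// which is (1 - sum1) / sum2 above; it makes the total over all w of
// [ P(w|h) if (h,w) is seen, else bw(h)*P(w|h') ] come to exactly 1.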
1488 
1489 bool EST_Ngrammar::compute_backoff_weights(const int mincount,
1490  const int maxcount)
1491 {
1492 
1493 
1494  backoff_threshold = mincount;
1495  backoff_discount = new EST_DVector[p_order];
1496 
1497  int i,o;
1498 
1499  // but we must have children from the root node
1500  // for every unigram, since these can not be backed off
1501  // (must therefore be present, even if zero)
1502  // smoothing will fill in a floor for any zero frequencies
1503 
1504  backoff_restore_unigram_states();
1505 
1506  Good_Turing_discount(*this,maxcount,0.5);
1507 
1508  // and since some frequencies will have been set to zero
1509  // we have to prune away those branches of the tree
1510  //prune_backoff_representation();
1511 
1512 
1513  // now compute backoff weights, to satisfy the
1514  // 'probs sum to 1' condition
1515 
1516  // condition is (trigram case):
1517  // sum( p(w3|w1,w2) ) = 1, over all w1,w2
1518 
1519  // p(wd3|wd1,wd2)=
1520  // if(trigram exists) p_3(wd1,wd2,wd3)
1521  // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
1522  // else p(wd3|w2)
1523  // and for a given wd3 they all sum to 1
1524 
1525  // so compute three sums :
1526  // sum(p_3(wd1,wd2,wd3)), for all w1,w2 where we have the trigram
1527  // sum(p(wd3|wd2)), for all w1,w2 where we have the bigram(w1,w2)
1528  // but not the trigram
1529  // sum(p(wd3|wd2)), for all w1,w2 where we don't have either
1530 
1531  // could probably do this recursively and more elegantly
1532  // but it makes my head hurt
1533 
1534  for (o=2;o<=order();o++) // for (o=1 .. .????
1535  {
1536 
1537  cerr << "Backing off order " << o << endl;
1538  //cerr << "=======================" << endl;
1539 
1540  EST_StrVector words;
1541  words.resize(o);
1542 
1543  // for each possible history (ngram, excluding last word)
1544  // compute the backoff weight
1545  for(i=0;i<o-1;i++)
1546  words[i]="";
1547  words[o-1] = "!FILLED!";
1548  iterate(words,&compute_backoff_weight,NULL);
1549 
1550  //cerr << "=========done==========" << endl;
1551 
1552  }
1553 
1554  return true;
1555 }
1556 
1557 
1558 void EST_Ngrammar::backoff_restore_unigram_states()
1559 {
1560  // for every item in the root pdf, look for a child
1561  // and make it if not present
1562 
1563  // short cut is to cumulate some zero freq bigrams
1564  // to force construction of states where necessary
1565  EST_StrVector words;
1566  int j;
1567  words.resize(2);
1568  words[0] = "wibble"; // doesn't matter what this is, count is 0
1569  for(j=0;j<get_pred_vocab_length();j++)
1570  {
1571  words[1] = get_pred_vocab_word(j);
1572  backoff_representation->accumulate(words,0);
1573  }
1574 
1575 }
1576 
1577 
1578 void EST_Ngrammar::prune_backoff_representation(EST_BackoffNgrammarState *start_state)
1579 {
1580 
1581  if (start_state == NULL)
1582  start_state = backoff_representation;
1583 
1584  //cerr << "pruning state " << start_state << endl << *start_state << endl;
1585 
1586  // remove any branches with zero frequency count
1587 
1588  // find children of this state with zero freq and zap them
1589  EST_Litem *k;
1590  double freq;
1591  EST_String name;
1592  for (k=start_state->pdf_const().item_start();
1593  !start_state->pdf_const().item_end(k);
1594  k = start_state->pdf_const().item_next(k))
1595  {
1596  start_state->pdf_const().item_freq(k,name,freq);
1597  if (freq < TINY_FREQ)
1598  {
1599  EST_BackoffNgrammarState *child = start_state->get_child(name);
1600 
1601  if (child!=NULL)
1602  {
1603  //cerr << "Zapping " << name << " : " << child->level()
1604  //<< " " << child<< endl;
1605  start_state->remove_child(child,name);
1606  }
1607  }
1608 
1609  }
1610 
1611  // then recurse through remaining children
1612  for (k=start_state->pdf_const().item_start();
1613  !start_state->pdf_const().item_end(k);
1614  k = start_state->pdf_const().item_next(k))
1615  {
1616  start_state->pdf_const().item_freq(k,name,freq);
1617  EST_BackoffNgrammarState *child = start_state->get_child(name);
1618  if (child!=NULL)
1619  {
1620  //cerr << "recursing to " << name << " : " << child->level() << endl;
1621  //if((child!=NULL) && (child->level() == 3))
1622  //cerr << *child << endl;
1623  prune_backoff_representation(child);
1624  }
1625  }
1626 }
1627 
1628 
1629 ostream& operator<<(ostream& s, EST_Ngrammar &n)
1630 {
1631  switch(n.p_representation)
1632  {
1633  case EST_Ngrammar::sparse:
1634  n.sparse_representation.print_freqs(s);
1635  break;
1636 
1637  case EST_Ngrammar::dense:
1638  s << "Dense" << endl;
1639  // s << n.dense_representation << endl;
1640  break;
1641 
1642  case EST_Ngrammar::backoff:
1643  s << "Backoff" << endl;
1644  s << *(n.backoff_representation) << endl;
1645  break;
1646 
1647  default:
1648  cerr << "Unknown internal representation of EST_Ngrammar : can't print"
1649  << endl;
1650  break;
1651  }
1652 
1653  return s;
1654 }
1655 
1656 bool
1657 EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t new_type)
1658 {
1659  if (new_type == p_entry_type)
1660  return true;
1661 
1662  // entry type conversion --- hmmmmm
1663 
1664  cerr << "Couldn't do entry type conversion !" << endl;
1665  return false;
1666 }
1667 
1668 bool EST_Ngrammar::sparse_to_dense()
1669 {
1670  cerr << "EST_Ngrammar::sparse_to_dense() "
1671  <<" not implemented" << endl;
1672  return false;
1673 }
1674 
1675 bool EST_Ngrammar::dense_to_sparse()
1676 {
1677  cerr << "EST_Ngrammar::dense_to_sparse()"
1678  <<" not implemented" << endl;
1679  return false;
1680 }
1681 
1682 int EST_Ngrammar::find_dense_state_index(const EST_IVector &words,
1683  int index) const
1684 {
1685  int i,ind=0;
1686  int vl,wa;
1687  for(i=0;i<p_order-1;i++)
1688  {
1689  vl = vocab->length();
1690  wa = words.a_no_check(i+index);
1691  if (wa < 0) wa = 0;
1692  // printf("awb_debug ngrammar i %d ind %d v.length() %d words.a_no_check() %d\n",i,ind,vl,wa);
1693  ind = ind*vl + wa;
1694  }
1695 
1696  return ind;
1697 }
1698 
1699 int EST_Ngrammar::find_next_state_id(int state, int word) const
1700 {
1701  // Find a new state index from the current after moving with word
1702  int i,f;
1703 
1704  if (p_order == 1)
1705  return 0;
1706  for (f=1,i=0; i<p_order-2; i++)
1707  f*=vocab->length();
1708  return ((state%f)*vocab->length())+word;
1709 }
1710 
1711 EST_NgrammarState &EST_Ngrammar::find_state(const EST_StrVector &words)
1712 {
1713  switch(p_representation)
1714  {
1715  case EST_Ngrammar::sparse:
1716  // return p_states[sparse_representation.find_state(words)];
1717  return p_states[0];
1718  break;
1719 
1720  case EST_Ngrammar::dense:
1721  {
1722  EST_IVector tmp(words.n());
1723  int i;
1724  for(i=0;i<p_order-1;i++)
1725  {
1726  tmp[i] = wordlist_index(words(i));
1727  if (tmp(i) == -1) break;
1728  }
1729  tmp[i] = pred_vocab->index(words(i));
1730  if (tmp(i) == -1) break;
1731  return p_states[find_dense_state_index(tmp)];
1732  }
1733  break;
1734 
1735  case EST_Ngrammar::backoff:
1736  cerr << "find_state: not valid in backoff mode !" << endl;
1737  break;
1738 
1739  default:
1740  cerr << "find_state: unknown ngrammar representation" << endl;
1741  break;
1742  }
1743 
1744  return p_states[0];
1745 }
1746 
1747 
1748 const EST_NgrammarState &
1749 EST_Ngrammar::find_state_const(const EST_StrVector &words) const
1750 {
1751  switch(p_representation)
1752  {
1753  case EST_Ngrammar::sparse:
1754  // return p_states[sparse_representation.find_state(words)];
1755  return p_states[0];
1756  break;
1757 
1758  case EST_Ngrammar::dense:
1759  {
1760  EST_IVector tmp(words.n());
1761  int i;
1762  for(i=0;i<p_order-1;i++)
1763  {
1764  tmp[i] = wordlist_index(words(i));
1765  if (tmp(i) == -1) break;
1766  }
1767  tmp[i] = pred_vocab->index(words(i));
1768  if (tmp(i) == -1) break;
1769  return p_states[find_dense_state_index(tmp)];
1770  }
1771  break;
1772 
1773  case EST_Ngrammar::backoff:
1774  cerr << "find_state_const: not valid in backoff mode !" << endl;
1775  break;
1776 
1777  default:
1778  cerr << "find_state: unknown ngrammar representation" << endl;
1779  break;
1780  }
1781  return p_states[0];
1782 }
1783 
1784 
1785 EST_NgrammarState &EST_Ngrammar::find_state(const EST_IVector &words)
1786 {
1787  switch(p_representation)
1788  {
1789  case EST_Ngrammar::sparse:
1790  // return p_states[sparse_representation.find_state(words)];
1791  return p_states[0];
1792  break;
1793 
1794  case EST_Ngrammar::dense:
1795  return p_states[find_dense_state_index(words)];
1796  break;
1797 
1798  case EST_Ngrammar::backoff:
1799  cerr << "find_state: not valid in backoff mode !" << endl;
1800  break;
1801 
1802  default:
1803  cerr << "find_state: unknown ngrammar representation" << endl;
1804  break;
1805  }
1806  return p_states[0];
1807 }
1808 
1809 
1810 const EST_NgrammarState &
1811 EST_Ngrammar::find_state_const(const EST_IVector &words) const
1812 {
1813  switch(p_representation)
1814  {
1815  case EST_Ngrammar::sparse:
1816  // return p_states[sparse_representation.find_state(words)];
1817  return p_states[0];
1818  break;
1819  case EST_Ngrammar::dense:
1820  return p_states[find_dense_state_index(words)];
1821  break;
1822 
1823  case EST_Ngrammar::backoff:
1824  cerr << "find_state_const: not valid in backoff mode !" << endl;
1825  break;
1826 
1827  default:
1828  cerr << "find_state: unknown ngrammar representation" << endl;
1829  break;
1830  }
1831 
1832  return p_states[0];
1833 }
1834 
1835 
1836 bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t new_representation)
1837 {
1838 
1839  if (new_representation == p_representation)
1840  return true;
1841 
1842  if (new_representation == EST_Ngrammar::sparse)
1843  return dense_to_sparse();
1844  else if (new_representation == EST_Ngrammar::dense)
1845  return sparse_to_dense();
1846  else
1847  {
1848  cerr << "set_representation: unknown ngrammar representation" << endl;
1849  return FALSE;
1850  }
1851 }
1852 
1853 double EST_Ngrammar::probability(const EST_StrVector &words, bool force, const bool trace) const
1854 {
1855  // Probability of this ngram
1856  (void)force;
1857 
1858  switch(p_representation)
1859  {
1860  case EST_Ngrammar::sparse:
1861  case EST_Ngrammar::dense:
1862  return find_state_const(words).probability(lastword(words));
1863  break;
1864 
1865  case EST_Ngrammar::backoff:
1866  return backoff_probability(words,trace);
1867  break;
1868 
1869  default:
1870  cerr << "probability: unknown ngrammar representation" << endl;
1871  return -1;
1872  break;
1873  }
1874 }
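// Editorial sketch (not in the original source): querying a model such
// as the one built earlier.  The vector holds p_order words, predictee
// last; the one-argument call assumes the default force/trace arguments
// declared in EST_Ngrammar.h:
//
//     EST_StrVector ngram;
//     ngram.resize(3);
//     ngram[0] = "<s>"; ngram[1] = "a"; ngram[2] = "b";
//     double p = ng.probability(ngram);   // P(b | <s>, a)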
1875 
1876 double EST_Ngrammar::frequency(const EST_StrVector &words, bool force, const bool trace) const
1877 {
1878  // Frequency of this ngram
1879  (void)force;
1880 
1881  switch(p_representation)
1882  {
1883  case EST_Ngrammar::sparse:
1884  case EST_Ngrammar::dense:
1885  return find_state_const(words).frequency(lastword(words));
1886  break;
1887 
1888  case EST_Ngrammar::backoff:
1889  return backoff_probability(words,trace); // can't do freqs
1890  break;
1891 
1892  default:
1893  cerr << "probability: unknown ngrammar representation" << endl;
1894  return -1;
1895  break;
1896  }
1897 }
1898 
1899 const EST_String &EST_Ngrammar::predict(const EST_StrVector &words,
1900  double *prob,int *state) const
1901 {
1902  // What's the most probable word given this list of words
1903 
1904  switch(p_representation)
1905  {
1906  case EST_Ngrammar::sparse:
1907  case EST_Ngrammar::dense:
1908  {
1909  const EST_NgrammarState &s = find_state_const(words);
1910  *state = s.id();
1911  return s.most_probable(prob);
1912  }
1913  break;
1914 
1915  case EST_Ngrammar::backoff:
1916  state=NULL; // there are no states per se
1917  return backoff_most_probable(words,prob);
1918  break;
1919 
1920  default:
1921  cerr << "probability: unknown ngrammar representation" << endl;
1922  return EST_String::Empty;
1923  break;
1924  }
1925 }
1926 
1927 const EST_String &EST_Ngrammar::predict(const EST_IVector &words,
1928  double *prob,int *state) const
1929 {
1930  // What's the most probable word given this list of words
1931 
1932  switch(p_representation)
1933  {
1934  case EST_Ngrammar::sparse:
1935  case EST_Ngrammar::dense:
1936  {
1937  const EST_NgrammarState &s = find_state_const(words);
1938  *state = s.id();
1939  return s.most_probable(prob);
1940  }
1941  break;
1942 
1943  case EST_Ngrammar::backoff:
1944  cerr << "probability: IVector access to backoff not supported" << endl;
1945  return EST_String::Empty;
1946  break;
1947 
1948  default:
1949  cerr << "probability: unknown ngrammar representation" << endl;
1950  return EST_String::Empty;
1951  break;
1952  }
1953 }
1954 
1955 int EST_Ngrammar::find_state_id(const EST_StrVector &words) const
1956 {
1957  switch(p_representation)
1958  {
1959  case EST_Ngrammar::sparse:
1960  case EST_Ngrammar::dense:
1961  {
1962  const EST_NgrammarState &s = find_state_const(words);
1963  return s.id();
1964  }
1965  default:
1966  cerr << "Ngrammar: representation doesn't support states" << endl;
1967  return 0;
1968  break;
1969  }
1970 }
1971 
1972 int EST_Ngrammar::find_state_id(const EST_IVector &words) const
1973 {
1974  switch(p_representation)
1975  {
1976  case EST_Ngrammar::sparse:
1977  case EST_Ngrammar::dense:
1978  {
1979  const EST_NgrammarState &s = find_state_const(words);
1980  return s.id();
1981  }
1982  default:
1983  cerr << "Ngrammar: representation doesn't support states" << endl;
1984  return 0;
1985  break;
1986  }
1987 }
1988 
1989 EST_String EST_Ngrammar::get_vocab_word(int i) const
1990 {
1991  if (vocab)
1992  return vocab->name(i);
1993  else
1994  return NOVOCAB;
1995 }
1996 
1997 int EST_Ngrammar::get_vocab_word(const EST_String &s) const
1998 {
1999  int index = vocab->name(s);
2000  return index;
2001 }
2002 
2003 double EST_Ngrammar::reverse_probability(const EST_StrVector &words,
2004  bool force) const
2005 {
2006  // What's the probability of the first n-1 words given the last word in the ngram
2007  (void)force;
2008 
2009  switch(p_representation)
2010  {
2011  case EST_Ngrammar::sparse:
2012  case EST_Ngrammar::dense:
2013  {
2014  const EST_NgrammarState &s = find_state_const(words);
2015  // need number of occurrences of words[p_order-1]
2016  return s.frequency(lastword(words))/
2017  vocab_pdf.frequency(lastword(words));
2018  }
2019  break;
2020 
2021  case EST_Ngrammar::backoff:
2022  return backoff_reverse_probability(words);
2023  break;
2024 
2025  default:
2026  cerr << "probability: unknown ngrammar representation" << endl;
2027  return -1;
2028  break;
2029  }
2030 }
2031 
2032 double EST_Ngrammar::reverse_probability(const EST_IVector &words,
2033  bool force) const
2034 {
2035  // What's the probability of the first n-1 words given the last word in the ngram
2036  (void)force;
2037 
2038  switch(p_representation)
2039  {
2040  case EST_Ngrammar::sparse:
2041  case EST_Ngrammar::dense:
2042  {
2043  const EST_NgrammarState &s = find_state_const(words);
2044  // need number of occurrences of words[p_order-1]
2045  return s.frequency(lastword(words))/
2046  vocab_pdf.frequency(lastword(words));
2047  }
2048  break;
2049 
2050  case EST_Ngrammar::backoff:
2051  cerr << "probability: reverse prob unavailable for backoff ngram"
2052  << endl;
2053  return -1;
2054  break;
2055 
2056  default:
2057  cerr << "probability: unknown ngrammar representation" << endl;
2058  return -1;
2059  break;
2060  }
2061 }
2062 
2063 const EST_DiscreteProbDistribution &
2064 EST_Ngrammar::prob_dist(int state) const
2065 {
2066  return p_states[state].pdf_const();
2067 }
2068 
2069 const EST_DiscreteProbDistribution &
2070 EST_Ngrammar::prob_dist(const EST_StrVector &words) const
2071 {
2072 
2073  switch(p_representation)
2074  {
2075  case EST_Ngrammar::sparse:
2076  case EST_Ngrammar::dense:
2077  {
2078  const EST_NgrammarState &s = find_state_const(words);
2079  return s.pdf_const();
2080  }
2081  break;
2082 
2083  case EST_Ngrammar::backoff:
2084  return backoff_prob_dist(words);
2085  break;
2086 
2087  default:
2088  cerr << "probability: unknown ngrammar representation" << endl;
2089  return PSTnullProbDistribution;
2090  break;
2091  }
2092 }
2093 
2094 const EST_DiscreteProbDistribution &
2095 EST_Ngrammar::prob_dist(const EST_IVector &words) const
2096 {
2097 
2098  switch(p_representation)
2099  {
2100  case EST_Ngrammar::sparse:
2101  case EST_Ngrammar::dense:
2102  {
2103  const EST_NgrammarState &s = find_state_const(words);
2104  return s.pdf_const();
2105  }
2106  break;
2107 
2108  case EST_Ngrammar::backoff:
2109  cerr << "probability: unsupported IVector access of backoff ngram" << endl;
2110  return PSTnullProbDistribution;
2111  break;
2112 
2113  default:
2114  cerr << "probability: unknown ngrammar representation" << endl;
2115  return PSTnullProbDistribution;
2116  break;
2117  }
2118 }
2119 
2120 EST_read_status
2121 EST_Ngrammar::load(const EST_String &filename)
2122 {
2123 
2124  EST_read_status r_val;
2125 
2126  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2127  //return r_val;
2128  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2129  //return r_val;
2130  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2131  return r_val;
2132  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2133  return r_val;
2134 
2135  // maybe the file is compressed ?
2136  EST_Pathname fname(filename);
2137  EST_String tmp_fname("");
2138 
2139  // crude but effective
2140  if (fname.extension() == GZIP_FILENAME_EXTENSION)
2141  tmp_fname = uncompress_file_to_temporary(filename,
2142  "gzip --decompress --stdout");
2143  else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
2144  tmp_fname = uncompress_file_to_temporary(filename,"uncompress -c");
2145 
2146  if(tmp_fname != "")
2147  {
2148  r_val = load(tmp_fname);
2149  delete_file(tmp_fname);
2150  return r_val;
2151  }
2152 
2153  // fall through: file was not compressed and no loader recognised it
2154 
2155  cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2156  return wrong_format;
2157 }
2158 
2159 EST_read_status
2160 EST_Ngrammar::load(const EST_String &filename, const EST_StrList &wordlist)
2161 {
2162 
2163  // for backed off grammars
2164  // need a wordlist to load ARPA format
2165 
2166  EST_read_status r_val;
2167 
2168  if ((r_val = load_ngram_arpa(filename, *this, wordlist)) != wrong_format)
2169  return r_val;
2170 
2171  // try other types, then check wordlist fits
2172  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2173  //return r_val;
2174  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2175  //return r_val;
2176  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2177  {
2178  if ((r_val == format_ok) && check_vocab(wordlist))
2179  return r_val;
2180  else
2181  {
2182  cerr << "Wordlist file does not match grammar wordlist !" << endl;
2183  return misc_read_error;
2184  }
2185  }
2186 
2187  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2188  {
2189  if ((r_val == format_ok) && check_vocab(wordlist))
2190  return r_val;
2191  else
2192  {
2193  cerr << "Wordlist does not match grammar !" << endl;
2194  return misc_read_error;
2195  }
2196  }
2197 
2198 
2199  cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2200  return wrong_format;
2201 }
2202 
2203 
2204 void
2205 EST_Ngrammar::make_htk_compatible()
2206 {
2207 
2208  cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2209  return;
2210 }
2211 
2212 EST_write_status
2213 EST_Ngrammar::save(const EST_String &filename, const EST_String type,
2214  const bool trace,double floor)
2215 {
2216 
2217  if (type == "")
2218  return save(filename,"cstr_ascii",false,floor); // choose default type
2219  if (type == "htk_ascii")
2220  return save_ngram_htk_ascii(filename, *this, floor);
2221  //if (type == "htk_binary")
2222  //return save_htk_binary(filename, *this);
2223  if (type == "arpa")
2224  return save_ngram_arpa(filename, *this);
2225  if (type == "cstr_ascii")
2226  return save_ngram_cstr_ascii(filename, *this,trace,floor);
2227  if (type == "cstr_bin")
2228  return save_ngram_cstr_bin(filename, *this, trace,floor);
2229  if (type == "wfst")
2230  return save_ngram_wfst(filename, *this);
2231 
2232  cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
2233  return write_fail;
2234 }
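// Usage sketch (editorial): the type strings save() accepts, assuming
// the default arguments declared in EST_Ngrammar.h.
//
//   gram.save("out.ngram");              // default type, cstr_ascii
//   gram.save("out.arpa", "arpa");
//   gram.save("out.htk", "htk_ascii");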
2235 
2236 void EST_Ngrammar::iterate(EST_StrVector &words,
2237  void (*function)(EST_Ngrammar *n,
2238  EST_StrVector &words,
2239  void *params),
2240  void *params)
2241 {
2242  int i,j=-1;
2243  EST_String tmp;
2244 
2245  // find next item in ngram to fill in
2246  for(i=0;i<words.n();i++)
2247  if (words[i] == "")
2248  {
2249  j=i;
2250  break;
2251  }
2252 
2253  if (j==-1)
2254  {
2255  // all filled in, so do the function
2256  (*function)(this,words,params);
2257 
2258  }
2259  else
2260  {
2261 
2262  tmp = words(j);
2263  if (j==p_order-1) // final position - use pred vocab
2264  {
2265  for(i=0;i<pred_vocab->length();i++){
2266  words[j] = pred_vocab->name(i);
2267  iterate(words,function,params);
2268  }
2269 
2270  }
2271  else
2272  {
2273  for(i=0;i<vocab->length();i++){
2274  words[j] = vocab->name(i);
2275  iterate(words,function,params);
2276  }
2277  }
2278  words[j] = tmp;
2279  }
2280 }
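// Usage sketch (editorial): iterate() enumerates every ngram that can
// be built by filling the empty ("") slots from the vocabulary, calling
// the supplied function on each.  count_ngram is hypothetical; order()
// is assumed to return p_order as declared in EST_Ngrammar.h.
//
//   static void count_ngram(EST_Ngrammar *n, EST_StrVector &words,
//                           void *params)
//   {
//       (*(int*)params)++;
//   }
//
//   int total = 0;
//   EST_StrVector words;
//   words.resize(gram.order());   // all slots "" => enumerate everything
//   gram.iterate(words, &count_ngram, (void*)&total);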
2281 
2282 void EST_Ngrammar::const_iterate(EST_StrVector &words,
2283  void (*function)(const EST_Ngrammar *const n,
2284  EST_StrVector &words,
2285  void *params),
2286  void *params) const
2287 {
2288  int i,j=-1;
2289  EST_String tmp;
2290 
2291  // find next item in ngram to fill in
2292  for(i=0;i<words.n();i++)
2293  if (words[i] == "")
2294  {
2295  j=i;
2296  break;
2297  }
2298 
2299  if (j==-1)
2300  {
2301  // all filled in, so do the function
2302  (*function)(this,words,params);
2303 
2304  }
2305  else
2306  {
2307 
2308  tmp = words(j);
2309  if (j==p_order-1) // final position - use pred vocab
2310  {
2311  for(i=0;i<pred_vocab->length();i++){
2312  words[j] = pred_vocab->name(i);
2313  const_iterate(words,function,params);
2314  }
2315 
2316  }
2317  else
2318  {
2319  for(i=0;i<vocab->length();i++){
2320  words[j] = vocab->name(i);
2321  const_iterate(words,function,params);
2322  }
2323  }
2324  words[j] = tmp;
2325  }
2326 }
2327 
2328 void EST_Ngrammar::print_freqs(ostream &os,double floor)
2329 {
2330 
2331  if (p_representation == EST_Ngrammar::backoff)
2332  backoff_representation->print_freqs(os,p_order);
2333  else
2334  {
2335  int i,j;
2336  EST_Litem *k;
2337  EST_IVector window(p_order-1);
2338 
2339  for (i=0; i < p_num_states; i++)
2340  {
2341  // print out each ngram : freq
2342  for (k=p_states[i].pdf().item_start();
2343  !p_states[i].pdf().item_end(k);
2344  k = p_states[i].pdf().item_next(k))
2345  {
2346  double freq;
2347  EST_String name;
2348  int ind = i;
2349  p_states[i].pdf().item_freq(k,name,freq);
2350  if (freq == 0)
2351  freq = floor;
2352  if (freq > 0)
2353  {
2354  for (j = p_order-2; j >= 0; j--) // decode state index: base-|vocab| digits are the context words
2355  {
2356  window[j] = ind%vocab->length();
2357  ind /= vocab->length();
2358  }
2359  for (j = 0; j < p_order-1; j++)
2360  os << wordlist_index(window(j)) << " ";
2361  os << name << " : " << freq << endl;
2362  }
2363  }
2364  }
2365  }
2366 }
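// Editorial note: for the dense (non-backoff) representation the loop
// above writes one "<context words> <predictee> : <freq>" line per
// stored ngram, e.g. (hypothetical words)
//
//   the quick brown : 12
//   the quick fox : 3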
2367 
2368 const EST_DiscreteProbDistribution &
2369 EST_Ngrammar::backoff_prob_dist(const EST_StrVector &words) const
2370 {
2371  // need to make this on the fly
2372  // by looking at all possible words in the final
2373  // position
2374 
2375  int i,j;
2376  EST_StrVector ngram;
2377  ngram.resize(words.n()+1);
2378  for(i=0;i<words.n();i++)
2379  ngram[i] = words(i);
2380 
2381  EST_DiscreteProbDistribution *p = new EST_DiscreteProbDistribution(pred_vocab); // note: never freed
2382 
2383  for(j=0;j<get_pred_vocab_length();j++)
2384  {
2385  ngram[ngram.n()-1] = get_pred_vocab_word(j);
2386  double tmp = backoff_probability(ngram,false);
2387  p->set_frequency(j,tmp); // actually probability
2388  }
2389  p->set_num_samples(1.0); // we can't do it in frequencies
2390 
2391  return *p;
2392 }
2393 
2394 const double EST_Ngrammar::get_backoff_discount(const int order, const double freq) const
2395 {
2396  if ((order < 1) || (order > p_order))
2397  {
2398  cerr << "order out of range in EST_Ngrammar::get_backoff_discount" << endl;
2399  return 0;
2400  }
2401 
2402  else if( (int)freq < backoff_discount[order-1].n())
2403  return backoff_discount[order-1]((int)freq);
2404 
2405  else
2406  return 0;
2407 }
2408 
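// Editorial note (not in the original source): backoff_probability()
// below computes a Katz-style backed-off estimate.  For an ngram
// w1 ... wn (predictee last) it returns
//
//     P(wn | w1 ... wn-1) = (f - D) / f2                  if f > backoff_threshold
//                         = bo_wt * P(wn | w2 ... wn-1)   otherwise
//
// where f is the training count of w1 ... wn, f2 the count of the
// context w1 ... wn-1, D the discount from get_backoff_discount(),
// and bo_wt the backoff weight stored for the context.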
2409 const double EST_Ngrammar::backoff_probability(const EST_StrVector &words,
2410  const bool trace) const
2411 {
2412  const EST_BackoffNgrammarState *state;
2413  int i;
2414  EST_StrVector new_ngram;
2415  double f=0,f2=0;
2416 
2417 
2418  if (trace)
2419  {
2420  cerr << "backoff_probability( ";
2421  for(i=0;i<words.n();i++)
2422  cerr << words(i) << " ";
2423  cerr << ") ";
2424  }
2425 
2426  // are we down to the unigram ?
2427  if (words.n() == 1)
2428  {
2429  if (trace)
2430  cerr << "unigram " << backoff_representation->probability(words(0))
2431  << endl;
2432 
2433  f=backoff_representation->frequency(words(0));
2434 
2435  // it seems outrageous, but use a floor because unigram
2436  // probs of zero mess up backing off
2437  if(f>0)
2438  return f / backoff_representation->pdf_const().samples();
2439  else
2440  return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2441  }
2442 
2443  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2444  // should pass separately
2445  new_ngram.resize(words.n()-1);
2446  for(i=0;i<new_ngram.n();i++)
2447  new_ngram[i] = words(i);
2448 
2449  // see if we have the ngram
2450  state=backoff_representation->get_state(words);
2451 
2452  if( (state != NULL) &&
2453  ((f=state->frequency(words(0))) > backoff_threshold) )
2454  {
2455 
2456  //if (trace)
2457  //cerr << "in state " << state << " = " << *state << endl << endl;
2458 
2459 
2460  // because f>0, the freq of new_ngram must be non-zero too
2461 
2462  // special case - we don't have a freq for !ENTER (or whatever it's called)
2463  // so use the no. of sentences used to build this grammar
2464  if((new_ngram(0) == p_sentence_start_marker) ||
2465  (new_ngram(0) == p_sentence_end_marker) )
2466  {
2467  f2 = p_number_of_sentences;
2468  if (trace)
2469  cerr << "special freq used : " << f2 << endl;
2470  }
2471  else
2472  {
2473  state=backoff_representation->get_state(new_ngram);
2474  if (state == NULL)
2475  {
2476  cerr << "Something went horribly wrong !" << endl;
2477  return -1;
2478  }
2479  // state->frequency(new_ngram(0)) is the number of times
2480  // we saw new_ngram
2481 
2482  f2=state->frequency(new_ngram(0));
2483 
2484  if (trace)
2485  {
2486  //cerr << "n-1 state is " << *state << endl;
2487  cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
2488  }
2489  }
2490 
2491  if (trace)
2492  {
2493 
2494  cerr << " ..... got (" << f << " - "
2495  << get_backoff_discount(state->level()+1,f)
2496  << ")/" << f2 << " = "
2497  << (f - get_backoff_discount(state->level()+1,f) ) / f2
2498  << endl;
2499  }
2500  return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2501  }
2502 
2503  else // back off
2504  {
2505 
2506  double bo_wt = get_backoff_weight(new_ngram);
2507 
2508  for(i=0;i<new_ngram.n();i++)
2509  new_ngram[i] = words(i+1);
2510 
2511  if (trace)
2512  {
2513  cerr << "backed off(" << bo_wt << ") to (";
2514  for(i=0;i<new_ngram.n();i++)
2515  cerr << new_ngram(i) << " ";
2516  cerr << ") ";
2517  }
2518 
2519  return bo_wt * backoff_probability(new_ngram,trace);
2520  }
2521 
2522  // should never reach here !
2523  // but gcc thinks it does
2524  cerr << "oops !";
2525  return -1;
2526 
2527 }
2528 
2529 
2530 const double
2531 EST_Ngrammar::backoff_reverse_probability_sub(const EST_StrVector &words,
2532  const EST_BackoffNgrammarState *root) const
2533 {
2534 
2535  // so similar to backoff prob - should combine
2536  // to do ......
2537 
2538  const EST_BackoffNgrammarState *state;
2539  int i;
2540  EST_StrVector new_ngram;
2541  double f=0;
2542 
2543  /*
2544  cerr << "backoff_rev_probability_sub( ";
2545  for(i=0;i<words.n();i++)
2546  cerr << words(i) << " ";
2547  cerr << ") ";
2548  */
2549  // are we down to the unigram ?
2550  if (words.n() == 1)
2551  {
2552  //cerr << "unigram " << root->probability(words(0))
2553  //<< endl;
2554  return root->probability(words(0));
2555  }
2556 
2557  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2558  // should pass separately
2559  new_ngram.resize(words.n()-1);
2560  for(i=0;i<new_ngram.n();i++)
2561  new_ngram[i] = words(i);
2562 
2563  // see if we have the ngram
2564  state=root->get_state(words);
2565 
2566 
2567  if( (state != NULL) &&
2568  ((f=state->frequency(words(0))) > 0) )
2569  {
2570  // because f>0, the freq of new_ngram must be non-zero too
2571  state=root->get_state(new_ngram);
2572  if (state == NULL)
2573  {
2574  cerr << "Something went horribly wrong !" << endl;
2575  return -1;
2576  }
2577  // state->frequency(new_ngram(0)) is the number of times
2578  // we saw new_ngram
2579  //cerr << "got " << f << "/" << state->frequency(new_ngram(0)) << endl;
2580  return f / state->frequency(new_ngram(0));
2581  }
2582 
2583  else // back off
2584  {
2585 
2586  double bo_wt = root->get_backoff_weight(new_ngram);
2587  //double bo_wt = root->get_backoff_weight(words);
2588 
2589  for(i=0;i<new_ngram.n();i++)
2590  new_ngram[i] = words(i+1);
2591 
2592  //cerr << "backed off(" << bo_wt << ") ";
2593  return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
2594  }
2595 
2596  // should never reach here !
2597  // but gcc thinks it does
2598  cerr << "oops !";
2599  return -1;
2600 
2601 }
2602 
2603 const double
2604 EST_Ngrammar::backoff_reverse_probability(const EST_StrVector &words) const
2605 {
2606 
2607  // probability of words 1...n-1 , given the last word
2608  // easier to compute from the backoff tree structure than
2609  // 'normal' probability
2610 
2611  // find level 1 child corresponding to history before last word
2612  EST_BackoffNgrammarState *state;
2613  state = backoff_representation->get_child(words(words.n()-1));
2614 
2615  if(state == NULL)
2616  {
2617  // predictee never seen in training: no subtree, so probability is zero
2618  return 0;
2619  }
2620 
2621  // now find backoff probability of words 0...n-2
2622  // starting from this state
2623  return backoff_reverse_probability_sub(words,state);
2624 
2625 }
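// Usage sketch (editorial): probability of the history given the final
// word, for a hypothetical trigram.
//
//   EST_StrVector words;
//   words.resize(3);
//   words[0] = "the"; words[1] = "quick"; words[2] = "brown";
//   double rp = gram.backoff_reverse_probability(words);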
2626 
2627 const EST_String &
2628 EST_Ngrammar::backoff_most_probable(const EST_StrVector &words,
2629  double *prob) const
2630 {
2631  return backoff_prob_dist(words).most_probable(prob);
2632 }
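// Usage sketch (editorial): backoff_most_probable() materialises the
// whole distribution via backoff_prob_dist(), so each call costs one
// backoff_probability() per prediction-vocabulary word.  Hypothetical
// two-word history for a trigram model:
//
//   EST_StrVector context;
//   context.resize(2);                        // the n-1 history words
//   context[0] = "the"; context[1] = "quick";
//   double p;
//   EST_String best = gram.backoff_most_probable(context, &p);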
2633 
2634 void slide(EST_IVector &v, const int l)
2635 {
2636  int i;
2637 
2638  // slide elements by 'l' without wraparound
2639 
2640  if (l==0)
2641  return;
2642 
2643  else if (l<0)
2644  {
2645  // slide left
2646 
2647  for(i=0;i<v.n()+l;i++)
2648  v[i] = v(i-l);
2649 
2650  for(;i<v.n();i++)
2651  v[i] = 0;
2652 
2653  }
2654  else
2655  {
2656  // slide right
2657 
2658  for(i=v.n()-1;i>=l;i--)
2659  v[i] = v(i-l);
2660 
2661  for(;i>=0;i--)
2662  v[i] = 0;
2663  }
2664 }
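// Worked example (editorial): with v = [1 2 3 4],
//   slide(v, -1)  ->  [2 3 4 0]   (slide left, zero-fill on the right)
//   slide(v, +1)  ->  [0 1 2 3]   (slide right, zero-fill on the left)
// There is no wraparound; vacated slots are zeroed.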
2665 
2666 void
2667 EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2668  void (*function)(EST_BackoffNgrammarState *s,
2669  void *params),
2670  void *params)
2671 {
2672 
2673  // apply the function to this node
2674  function(start_state,params);
2675 
2676  // and recurse down the tree
2677  EST_Litem *k;
2678  double freq;
2679  EST_String name;
2680  for (k=start_state->pdf_const().item_start();
2681  !start_state->pdf_const().item_end(k);
2682  k = start_state->pdf_const().item_next(k))
2683  {
2684  start_state->pdf_const().item_freq(k,name,freq);
2685  EST_BackoffNgrammarState *child = start_state->get_child(name);
2686  if (child!=NULL)
2687  backoff_traverse(child,function,params);
2688 
2689  }
2690 }
2691 
2692 void
2693 EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2694  void (*function)(EST_BackoffNgrammarState *s,
2695  void *params),
2696  void *params,
2697  const int level)
2698 {
2699  // apply the function to this node, if level is correct
2700  if (start_state->level() == level)
2701  {
2702  function(start_state,params);
2703  }
2704  else if (start_state->level() < level)
2705  {
2706  // and recurse down the tree if we haven't
2707  // reached the level yet
2708  EST_Litem *k;
2709  double freq;
2710  EST_String name;
2711 
2712  for (k=start_state->pdf_const().item_start();
2713  !start_state->pdf_const().item_end(k);
2714  k = start_state->pdf_const().item_next(k))
2715  {
2716  start_state->pdf_const().item_freq(k,name,freq);
2717  EST_BackoffNgrammarState *child = start_state->get_child(name);
2718  if (child!=NULL)
2719  backoff_traverse(child,function,params,level);
2720 
2721  }
2722 
2723 
2724  }
2725 }
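// Usage sketch (editorial): a traversal callback that reports every
// state at one level of the backoff tree.  print_level is hypothetical,
// and the call assumes access to the root state (backoff_representation),
// as code inside this class has.
//
//   static void print_level(EST_BackoffNgrammarState *s, void *params)
//   {
//       cerr << "state at level " << s->level() << endl;
//   }
//
//   backoff_traverse(backoff_representation, &print_level, NULL, 2);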
2726 void
2727 merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
2728 {
2729 
2730  EST_Ngrammar *other_n = (EST_Ngrammar*)((void**)params)[0];
2731  float *weight = (float*)((void**)params)[1];
2732 
2733  if(other_n->ngram_exists(ngram))
2734  n->accumulate(ngram,*weight * other_n->frequency(ngram));
2735 
2736 }
2737 
2738 bool
2739 EST_Ngrammar::merge(EST_Ngrammar &n,float weight)
2740 {
2741  EST_StrVector words;
2742  words.resize(p_order);
2743 
2744  void **params = new void*[2];
2745  params[0] = (void*)&n;
2746  params[1] = (void*)&weight;
2747 
2748  iterate(words,&merge_other_grammar,(void*)params);
2749 
2750  delete [] params;
2751  return true;
2752 }
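// Usage sketch (editorial): fold another grammar's counts into this one,
// scaled by a weight.  The grammars should share order and vocabulary;
// "other.ngram" is a hypothetical file.
//
//   EST_Ngrammar other;
//   other.load("other.ngram");
//   gram.merge(other, 0.5);   // accumulate other's counts, scaled by 0.5
//
// Note that merge() reuses iterate(): merge_other_grammar() above is the
// callback applied to every possible ngram.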
2753 
2754 
2755 void slide(EST_StrVector &v, const int l)
2756 {
2757  // slide elements by 'l' without wraparound
2758  int i;
2759 
2760  if (l==0)
2761  return;
2762 
2763  else if (l<0)
2764  {
2765  // slide left
2766 
2767  for(i=0;i<v.n()+l;i++)
2768  v[i] = v(i-l);
2769 
2770  for(;i<v.n();i++)
2771  v[i] = "";
2772 
2773  }
2774  else
2775  {
2776  // slide right
2777 
2778  for(i=v.n()-1;i>=l;i--)
2779  v[i] = v(i-l);
2780 
2781  for(;i>=0;i--)
2782  v[i] = "";
2783 
2784  }
2785 }
2786 