From 7ac16e8aea67913b521744f78686e75ced1ca87d Mon Sep 17 00:00:00 2001
From: Brian West <brian@freeswitch.org>
Date: Sat, 28 Feb 2009 20:34:23 +0000
Subject: [PATCH] When you're down in Transylvania

And the night begins to fall
You might get to feeling lonely
If you do give me a call
Just dial Transylvania 1, 2, 3, 4, 5
Transylvania 1, 2, 3, 4, 5
Transylvania 1, 2, 3, 4, 5

(oh and remove some stuff too)

/b

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@12345 d0543943-73ff-0310-b7d9-9358b9ac24b2
---
 scripts/mklm        |  96 ------------------
 scripts/quick_lm.pl | 257 --------------------------------------------
 2 files changed, 353 deletions(-)
 delete mode 100755 scripts/mklm
 delete mode 100755 scripts/quick_lm.pl

diff --git a/scripts/mklm b/scripts/mklm
deleted file mode 100755
index c2ead8979b..0000000000
--- a/scripts/mklm
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/perl
-# mklm - builds LMs from all available sentence files.
-#
-# Processes all .sent files in the current directory.
-#
-
-use Data::Dumper;
-
-@sent = <*.sent>;
-
-if ($ARGV[0] eq "clean") {
-    foreach $file (@sent) {
-        my($base,$ext) = split(/\./, $file);
-        system("rm -rf $base");
-    }
-    exit;
-}
-
-open(DIC,"<../conf/cmudict.0.6d");
-@dic = <DIC>;
-close(DIC);
-
-foreach $file (@sent) {
-    my($base,$ext) = split(/\./, $file);
-    system("rm -rf $base");
-    system("mkdir -p $base");
-    system("../bin/quick_lm.pl -s $file -o $base/$base.lm 2>/dev/null");
-}
-
-foreach $file (@sent) {
-    my($word_file,$ext) = split(/\./, $file);
-    open(SENT,"<$file");
-    @lines = <SENT>;
-    close(SENT);
-    undef @in;
-    undef @out;
-    open(WORDS,">$word_file.words");
-    foreach $line (@lines) {
-        chomp($line);
-        $line =~ s/<(.*?)>//gi;
-        $line =~ s/^\s*//;
-        $line =~ s/\s*$//;
-        $line =~ s/_/ /g;
-        @tmp = split(' ', $line);
-        foreach $tmp (@tmp) {
-            push(@in, uc($tmp));
-        }
-    }
-    print Dumper \@in;
-    undef %saw;
-    @saw{@in} = ();
-    @out = sort keys %saw;
-    foreach $line (@out) {
-        print WORDS "$line\n";
-    }
-    close(WORDS);
-}
-
-@word_files = <*.words>;
-
-foreach $file (@word_files) {
-    my($dic,$ext) = split(/\./, $file);
-
-    open(WORDS,"<$file");
-    @words = <WORDS>;
-    close(WORDS);
-    unlink($file);
-    unlink("$dic.words");
-    open(DIC, ">$dic/$dic.dic");
-    foreach $line (@dic) {
-        chomp $line;
-        if ($line =~ m/(.*)\s\s(.*)/) {
-            local $word = $1;
-            local $pron = $2;
-            $word =~ s/^\s*//;
-            $word =~ s/\s*$//;
-            $pron =~ s/^\s*//;
-            $pron =~ s/\s*$//;
-            foreach $myword (@words) {
-                chomp $myword;
-                $string = $word;
-                $string =~ s/\(\d\)//g;
-                if ($myword eq $string) {
-                    print DIC "$word\t$pron\n";
-                }
-            }
-        }
-    }
-    close(DIC);
-}
diff --git a/scripts/quick_lm.pl b/scripts/quick_lm.pl
deleted file mode 100755
index 9172acbd7e..0000000000
--- a/scripts/quick_lm.pl
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/usr/bin/perl
-
-# NOTE: this is by no means an efficient implementation, and performance will
-# deteriorate rapidly as a function of the corpus size. Larger corpora should be
-# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
-
-# [2feb96] (air)
-# cobbles together a language model from a set of exemplar sentences.
-# features: 1) uniform discounting, 2) no cutoffs
-# the "+" version allows insertion of extra words into the 1gram vector
-
-# [27nov97] (air)
-# bulletproofed a bit for use in conjunction with a cgi script
-
-# [20000711] (air)
-# made the discount parameter visible
-
-# [20011123] (air)
-# cleaned-up version for distribution
-
-use Getopt::Std;
-
-$VERBOSE = 1;
-
-sub handler {
-    local($sig) = @_;
-    print STDERR "quick_lm caught a SIG$sig -- dying\n";
-    exit(0);
-}
-foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
-
-if ($#ARGV < 0) { die("usage: quick_lm -s <sentfile> -o <outputfile> [-w <wordfile>] [-d discount]\n"); }
-Getopt::Std::getopts("s:w:d:o:x");
-$sentfile = $opt_s;
-$wordfile = $opt_w;
-$discount = $opt_d;
-$output   = $opt_o;
-
-$output   or die("No output file\n");
-$sentfile or die("No sentence file\n");
-
-$| = 1;  # always flush buffers
-
-if ($VERBOSE>0) { print STDERR "Language model started at ",scalar localtime(),"\n"; }
-
-open(IN,"<$sentfile") or die("can't open $sentfile!\n");
-if ($wordfile ne "") { open(WORDS,"<$wordfile"); $wflag = 1; } else { $wflag = 0; }
-
-$log10 = log(10.0);
-
-if ($discount ne "") {
-    if (($discount<=0.0) or ($discount>=1.0)) {
-        print STDERR "discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
-        $discount_mass = 0.5;  # just use the default
-    } else {
-        $discount_mass = $discount;
-    }
-} else {
-    # Ben and Greg's experiments show that 0.5 is a way better default choice.
-    $discount_mass = 0.5;  # set a nominal discount...
-}
-$deflator = 1.0 - $discount_mass;
-
-# create count tables
-$sent_cnt = 0;
-while (<IN>) {
-    s/^\s*//; s/\s*$//;
-    if ( $_ eq "" ) { next; } else { $sent_cnt++; }  # skip empty lines
-    @word = split(/\s/);
-    for ($j=0;$j<($#word-1);$j++) {
-        $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
-        $bigram{ join(" ",$word[$j],$word[$j+1])}++;
-        $unigram{$word[$j]}++;
-    }
-    # finish up the bi's and uni's at the end of the sentence...
-    $bigram{join(" ",$word[$j],$word[$j+1])}++;
-    $unigram{$word[$j]}++;
-    $unigram{$word[$j+1]}++;
-}
-close(IN);
-if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
-
-# add in any extra words
-if ($wflag) {
-    $new = 0; $read_in = 0;
-    while (<WORDS>) {
-        s/^\s*//; s/\s*$//;
-        if ( $_ eq "" ) { next; } else { $read_in++; }  # skip empty lines
-        if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
-    }
-    if ($VERBOSE) { print STDERR "tried to add $read_in words; $new were new words\n"; }
-    close (WORDS);
-}
-if ( ($sent_cnt==0) && ($new==0) ) {
-    print STDERR "no input?\n";
-    exit;
-}
-
-open(LM,">$output") or die("can't open $output for output!\n");
-
-$preface = "";
-$preface .= "Language model created by QuickLM on ".`date`;
-$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
-$preface .= "This model is based on a corpus of $sent_cnt sentences and ".scalar(keys %unigram).
" words\n"; -$preface .= "The (fixed) discount mass is $discount_mass\n\n"; - - -# compute counts -$unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0; -foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; } -foreach $x (keys(%bigram)) { $bi_count++; } -foreach $x (keys(%trigram)) { $tri_count++; } - -print LM $preface; -print LM "\\data\\\n"; -print LM "ngram 1=$uni_count\n"; -if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; } -if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; } -print LM "\n"; - -# compute uni probs -foreach $x (keys(%unigram)) { - $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator; -} - -# compute alphas -foreach $y (keys(%unigram)) { - $w1 = $y; - $sum_denom = 0.0; - foreach $x (keys(%bigram)) { - if ( substr($x,0,rindex($x," ")) eq $w1 ) { - $w2 = substr($x,index($x," ")+1); - $sum_denom += $uniprob{$w2}; - } - } - $alpha{$w1} = $discount_mass / (1.0 - $sum_denom); -} - -print LM "\\1-grams:\n"; -foreach $x (sort keys(%unigram)) { - printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10; -} -print LM "\n"; - -#compute bi probs -foreach $x (keys(%bigram)) { - $w1 = substr($x,0,rindex($x," ")); - $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1}; -} - -#compute bialphas -foreach $x (keys(%bigram)) { - $w1w2 = $x; - $sum_denom = 0.0; - foreach $y (keys(%trigram)) { - if (substr($y,0,rindex($y," ")) eq $w1w2 ) { - $w2w3 = substr($y,index($y," ")+1); - $sum_denom += $biprob{$w2w3}; - } - } - $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom); -} - -# output the bigrams and trigrams (now that we have the alphas computed). -if ( $bi_count > 0 ) { - print LM "\\2-grams:\n"; - foreach $x (sort keys(%bigram)) { - printf LM "%6.4f %s %6.4f\n", - log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10; - } - print LM "\n"; -} - -if ($tri_count > 0 ) { - print LM "\\3-grams:\n"; - foreach $x (sort keys(%trigram)) { - $w1w2 = substr($x,0,rindex($x," ")); - printf LM "%6.4f %s\n", - log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x; - } - print LM "\n"; -} - -print LM "\\end\\\n"; -close(LM); - -if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; } - -# -__END__ -=pod - -/* ==================================================================== - * Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All copies, used or distributed, must preserve the original wording of - * the copyright notice included in the output file. - * - * This work was supported in part by funding from the Defense Advanced - * Research Projects Agency and the CMU Sphinx Speech Consortium. - * - * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND - * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
- * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ====================================================================
- *
- */
-
-Pretty Good Language Modeler, now with unigram vector augmentation!
-
-The Pretty Good Language Modeler is intended for the quick construction of
-small language models, typically as might be needed in application
-development. Depending on the version of Perl you are running, a practical
-limitation is a maximum vocabulary size on the order of 1000-2000 words.
-The limiting factor is the number of n-grams observed, since each n-gram is
-stored as a hash key. (So smaller vocabularies may turn out to be a problem
-as well.)
-
-This package computes a standard back-off language model. It differs in one
-significant respect: the computation of the discount. We adopt a
-"proportional" (or ratio) discount, in which a fixed percentage of
-probability mass (typically 50%) is removed from observed n-grams and
-redistributed over unobserved n-grams.
-
-Conventionally, an absolute discount would be used; however, we have found
-that the proportional discount appears to be robust for the extremely small
-languages a developer might prototype, as opposed to models based on a
-collected corpus. We have found that absolute and proportional discounts
-produce comparable recognition results, with perhaps a slight advantage for
-proportional discounting. A more systematic investigation of this technique
-would be desirable. In any case, it also has the virtue of a very simple
-computation.
-
-=cut
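
For readers who want the discount arithmetic without wading through the full
script being removed, the following is a minimal, self-contained Perl sketch
of the proportional discount described above. It is illustrative only and is
not part of the deleted files: the toy corpus, the <s>/</s> sentence markers,
and all names in it are assumptions made for the example. The formulas mirror
the ones in quick_lm.pl: a seen unigram gets P(w) = (c(w)/N) * (1 - D), and
each history w1 gets the back-off weight alpha(w1) = D / (1 - sum of P(w2)
over all seen bigrams "w1 w2"), with D = 0.5 by default.

#!/usr/bin/perl
# Illustrative sketch only -- not part of the deleted scripts. It reproduces
# the proportional-discount arithmetic of quick_lm.pl on a toy corpus.
# The corpus and the <s>/</s> markers are assumptions made for this example.
use strict;
use warnings;

my $discount_mass = 0.5;                  # mass D moved to unseen n-grams
my $deflator      = 1.0 - $discount_mass;

my @corpus = ("<s> HELLO WORLD </s>",
              "<s> HELLO THERE </s>",
              "<s> GOODBYE WORLD </s>");

# count unigrams and bigrams
my (%unigram, %bigram);
my $unisum = 0;
for my $sent (@corpus) {
    my @w = split ' ', $sent;
    for my $j (0 .. $#w) {
        $unigram{$w[$j]}++;
        $unisum++;
        $bigram{"$w[$j] $w[$j+1]"}++ if $j < $#w;
    }
}

# discounted unigram probabilities: P(w) = (c(w)/N) * (1 - D)
my %uniprob = map { $_ => ($unigram{$_} / $unisum) * $deflator } keys %unigram;

# back-off weights: alpha(w1) = D / (1 - sum of P(w2) over seen "w1 w2")
my %alpha;
for my $w1 (keys %unigram) {
    my $seen = 0.0;
    for my $bg (keys %bigram) {
        my ($h, $w2) = split ' ', $bg;
        $seen += $uniprob{$w2} if $h eq $w1;
    }
    $alpha{$w1} = $discount_mass / (1.0 - $seen);
}

# print base-10 logs, matching the ARPA-format lines quick_lm.pl emits
my $log10 = log(10.0);
for my $w (sort keys %unigram) {
    printf "%6.4f %s %6.4f\n",
        log($uniprob{$w}) / $log10, $w, log($alpha{$w}) / $log10;
}

Under the same assumptions, the full script would be invoked the way mklm
invokes it, roughly: quick_lm.pl -s toy.sent -o toy.lm, with an optional
-d 0.5 to set the discount mass explicitly.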