From 7ac16e8aea67913b521744f78686e75ced1ca87d Mon Sep 17 00:00:00 2001
From: Brian West <brian@freeswitch.org>
Date: Sat, 28 Feb 2009 20:34:23 +0000
Subject: [PATCH] When you're down in Transylvania

And the night begins to fall
You might get to feeling lonely
If you do give me a call
Just dial Transylvania 1, 2, 3, 4, 5
Transylvania 1, 2, 3, 4, 5
Transylvania 1, 2, 3, 4, 5

(oh and remove some stuff too)

/b

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@12345 d0543943-73ff-0310-b7d9-9358b9ac24b2
---
 scripts/mklm        |  96 ------------------
 scripts/quick_lm.pl | 257 --------------------------------------------
 2 files changed, 353 deletions(-)
 delete mode 100755 scripts/mklm
 delete mode 100755 scripts/quick_lm.pl

diff --git a/scripts/mklm b/scripts/mklm
deleted file mode 100755
index c2ead8979b..0000000000
--- a/scripts/mklm
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/perl
-# mklm - builds LMs from all available sentence files.
-#
-# Processes all .sent files in the current directory.
-#
-
-use Data::Dumper;
-
-@sent = <*.sent>;
-
-if ($ARGV[0] eq "clean") {
-    foreach $file (@sent) {
-        my($base,$ext) = split(/\./, $file);
-        system("rm -rf $base");
-    }
-    exit;
-}
-
-open(DIC,"<../conf/cmudict.0.6d");
-@dic = <DIC>;
-close(DIC);
-
-foreach $file (@sent) {
-    my($base,$ext) = split(/\./, $file);
-    system("rm -rf $base");
-    system("mkdir -p $base");
-    system("../bin/quick_lm.pl -s $file -o $base/$base.lm 2>/dev/null");
-}
-
-foreach $file (@sent) {
-    my($word_file,$ext) = split(/\./, $file);
-    open(SENT,"<$file");
-    @lines = <SENT>;
-    close(SENT);
-    undef @in;
-    undef @out;
-    open(WORDS,">$word_file.words");
-    foreach $line (@lines) {
-        chomp($line);
-        $line =~ s/<(.*?)>//gi;
-        $line =~ s/^\s*//;
-        $line =~ s/\s*$//;
-        $line =~ s/_/ /g;
-        @tmp = split(' ', $line);
-        foreach $tmp (@tmp) {
-            push(@in, uc($tmp));
-        }
-    }
-    print Dumper \@in;
-    undef %saw;
-    @saw{@in} = ();
-    @out = sort keys %saw;
-    foreach $line (@out) {
-        print WORDS "$line\n";
-    }
-    close(WORDS);
-}
-
-@word_files = <*.words>;
-
-foreach $file (@word_files) {
-    my($dic,$ext) = split(/\./, $file);
-
-    open(WORDS,"<$file");
-    @words = <WORDS>;
-    close(WORDS);
-    unlink($file);
-    unlink("$dic.words");
-    open(DIC, ">$dic/$dic.dic");
-    foreach $line (@dic) {
-        chomp $line;
-        if ($line =~ m/(.*)\s\s(.*)/) {
-            local $word = $1;
-            local $pron = $2;
-            $word =~ s/^\s*//;
-            $word =~ s/\s*$//;
-            $pron =~ s/^\s*//;
-            $pron =~ s/\s*$//;
-            foreach $myword (@words) {
-                chomp $myword;
-                $string = $word;
-                $string =~ s/\(\d\)//g;
-                if ($myword eq $string) {
-                    print DIC "$word\t$pron\n";
-                }
-            }
-        }
-    }
-    close(DIC);
-}
diff --git a/scripts/quick_lm.pl b/scripts/quick_lm.pl
deleted file mode 100755
index 9172acbd7e..0000000000
--- a/scripts/quick_lm.pl
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/usr/bin/perl
-
-# NOTE: this is by no means an efficient implementation, and performance will
-# deteriorate rapidly as a function of the corpus size. Larger corpora should be
-# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
-
-# [2feb96] (air)
-# cobbles together a language model from a set of exemplar sentences.
-# features: 1) uniform discounting, 2) no cutoffs
-# the "+" version allows insertion of extra words into the 1gram vector
-
-# [27nov97] (air)
-# bulletproofed a bit for use in conjunction with a cgi script
-
-# [20000711] (air)
-# made the discount parameter visible
-
-# [20011123] (air)
-# cleaned-up version for distribution
-
-use Getopt::Std;
-
-$VERBOSE = 1;
-
-sub handler {
-    local($sig) = @_;
-    print STDERR "quick_lm caught a SIG$sig -- dying\n";
-    exit(0);
-}
-foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
-
-if ($#ARGV < 0) { die("usage: quick_lm -s <sentfile> -o <outputfile> [-w <wordfile>] [-d discount]\n"); }
-Getopt::Std::getopts("s:w:d:o:x");
-$sentfile = $opt_s;
-$wordfile = $opt_w;
-$discount = $opt_d;
-$output   = $opt_o;
-
-$output   or die("No output file\n");
-$sentfile or die("No sentence file\n");
-
-$| = 1;  # always flush buffers
-
-if ($VERBOSE>0) { print STDERR "Language model started at ",scalar localtime(),"\n"; }
-
-open(IN,"<$sentfile") or die("can't open $sentfile!\n");
-if ($wordfile ne "") { open(WORDS,"<$wordfile"); $wflag = 1; } else { $wflag = 0; }
-
-$log10 = log(10.0);
-
-if ($discount ne "") {
-    if (($discount<=0.0) or ($discount>=1.0)) {
-        print STDERR "discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
-        $discount_mass = 0.5;  # just use the default
-    } else {
-        $discount_mass = $discount;
-    }
-} else {
-    # Ben and Greg's experiments show that 0.5 is a way better default choice.
-    $discount_mass = 0.5;  # set a nominal discount...
-}
-$deflator = 1.0 - $discount_mass;
-
-# create count tables
-$sent_cnt = 0;
-while (<IN>) {
-    s/^\s*//; s/\s*$//;
-    if ( $_ eq "" ) { next; } else { $sent_cnt++; }  # skip empty lines
-    @word = split(/\s/);
-    for ($j=0;$j<($#word-1);$j++) {
-        $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
-        $bigram{ join(" ",$word[$j],$word[$j+1])}++;
-        $unigram{$word[$j]}++;
-    }
-    # finish up the bi's and uni's at the end of the sentence...
-    $bigram{join(" ",$word[$j],$word[$j+1])}++;
-    $unigram{$word[$j]}++;
-    $unigram{$word[$j+1]}++;
-}
-close(IN);
-if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
-
-# add in any extra words
-if ($wflag) {
-    $new = 0; $read_in = 0;
-    while (<WORDS>) {
-        s/^\s*//; s/\s*$//;
-        if ( $_ eq "" ) { next; } else { $read_in++; }  # skip empty lines
-        if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
-    }
-    if ($VERBOSE) { print STDERR "tried to add $read_in words; $new were new words\n"; }
-    close (WORDS);
-}
-if ( ($sent_cnt==0) && ($new==0) ) {
-    print STDERR "no input?\n";
-    exit;
-}
-
-open(LM,">$output") or die("can't open $output for output!\n");
-
-$preface = "";
-$preface .= "Language model created by QuickLM on ".`date`;
-$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
-$preface .= "This model is based on a corpus of $sent_cnt sentences and ".scalar(keys %unigram).
" words\n"; -$preface .= "The (fixed) discount mass is $discount_mass\n\n"; - - -# compute counts -$unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0; -foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; } -foreach $x (keys(%bigram)) { $bi_count++; } -foreach $x (keys(%trigram)) { $tri_count++; } - -print LM $preface; -print LM "\\data\\\n"; -print LM "ngram 1=$uni_count\n"; -if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; } -if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; } -print LM "\n"; - -# compute uni probs -foreach $x (keys(%unigram)) { - $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator; -} - -# compute alphas -foreach $y (keys(%unigram)) { - $w1 = $y; - $sum_denom = 0.0; - foreach $x (keys(%bigram)) { - if ( substr($x,0,rindex($x," ")) eq $w1 ) { - $w2 = substr($x,index($x," ")+1); - $sum_denom += $uniprob{$w2}; - } - } - $alpha{$w1} = $discount_mass / (1.0 - $sum_denom); -} - -print LM "\\1-grams:\n"; -foreach $x (sort keys(%unigram)) { - printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10; -} -print LM "\n"; - -#compute bi probs -foreach $x (keys(%bigram)) { - $w1 = substr($x,0,rindex($x," ")); - $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1}; -} - -#compute bialphas -foreach $x (keys(%bigram)) { - $w1w2 = $x; - $sum_denom = 0.0; - foreach $y (keys(%trigram)) { - if (substr($y,0,rindex($y," ")) eq $w1w2 ) { - $w2w3 = substr($y,index($y," ")+1); - $sum_denom += $biprob{$w2w3}; - } - } - $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom); -} - -# output the bigrams and trigrams (now that we have the alphas computed). -if ( $bi_count > 0 ) { - print LM "\\2-grams:\n"; - foreach $x (sort keys(%bigram)) { - printf LM "%6.4f %s %6.4f\n", - log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10; - } - print LM "\n"; -} - -if ($tri_count > 0 ) { - print LM "\\3-grams:\n"; - foreach $x (sort keys(%trigram)) { - $w1w2 = substr($x,0,rindex($x," ")); - printf LM "%6.4f %s\n", - log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x; - } - print LM "\n"; -} - -print LM "\\end\\\n"; -close(LM); - -if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; } - -# -__END__ -=pod - -/* ==================================================================== - * Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All copies, used or distributed, must preserve the original wording of - * the copyright notice included in the output file. - * - * This work was supported in part by funding from the Defense Advanced - * Research Projects Agency and the CMU Sphinx Speech Consortium. - * - * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND - * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
- * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ====================================================================
- *
- */
-
-Pretty Good Language Modeler, now with unigram vector augmentation!
-
-The Pretty Good Language Modeler is intended for the quick construction of
-small language models, typically as might be needed in application
-development. Depending on the version of Perl you are running, a practical
-limitation is a maximum vocabulary size on the order of 1000-2000 words.
-The limiting factor is the number of n-grams observed, since each n-gram is
-stored as a hash key. (So smaller vocabularies may turn out to be a problem
-as well.)
-
-This package computes a standard back-off language model. It differs in one
-significant respect: the computation of the discount. We adopt a
-"proportional" (or ratio) discount, in which a fixed percentage of
-probability mass (typically 50%) is removed from observed n-grams and
-redistributed over unobserved n-grams.
-
-Conventionally, an absolute discount would be used; however, we have found
-that the proportional discount appears to be robust for the extremely small
-languages a developer might prototype, as opposed to models based on a
-collected corpus. We have found that absolute and proportional discounts
-produce comparable recognition results, with perhaps a slight advantage for
-proportional discounting. A more systematic investigation of this technique
-would be desirable. In any case, it also has the virtue of a very simple
-computation.
-
-=cut
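
For readers who want the discount arithmetic without wading through the full
script being removed, the following is a minimal, self-contained Perl sketch
of the proportional discount described above. It is illustrative only and is
not part of the deleted files: the toy corpus, the <s>/</s> sentence markers,
and all names in it are assumptions made for the example. The formulas mirror
the ones in quick_lm.pl: a seen unigram gets P(w) = (c(w)/N) * (1 - D), and
each history w1 gets the back-off weight alpha(w1) = D / (1 - sum of P(w2)
over all seen bigrams "w1 w2"), with D = 0.5 by default.

#!/usr/bin/perl
# Illustrative sketch only -- not part of the deleted scripts. It reproduces
# the proportional-discount arithmetic of quick_lm.pl on a toy corpus.
# The corpus and the <s>/</s> markers are assumptions made for this example.
use strict;
use warnings;

my $discount_mass = 0.5;                  # mass D moved to unseen n-grams
my $deflator      = 1.0 - $discount_mass;

my @corpus = ("<s> HELLO WORLD </s>",
              "<s> HELLO THERE </s>",
              "<s> GOODBYE WORLD </s>");

# count unigrams and bigrams
my (%unigram, %bigram);
my $unisum = 0;
for my $sent (@corpus) {
    my @w = split ' ', $sent;
    for my $j (0 .. $#w) {
        $unigram{$w[$j]}++;
        $unisum++;
        $bigram{"$w[$j] $w[$j+1]"}++ if $j < $#w;
    }
}

# discounted unigram probabilities: P(w) = (c(w)/N) * (1 - D)
my %uniprob = map { $_ => ($unigram{$_} / $unisum) * $deflator } keys %unigram;

# back-off weights: alpha(w1) = D / (1 - sum of P(w2) over seen "w1 w2")
my %alpha;
for my $w1 (keys %unigram) {
    my $seen = 0.0;
    for my $bg (keys %bigram) {
        my ($h, $w2) = split ' ', $bg;
        $seen += $uniprob{$w2} if $h eq $w1;
    }
    $alpha{$w1} = $discount_mass / (1.0 - $seen);
}

# print base-10 logs, matching the ARPA-format lines quick_lm.pl emits
my $log10 = log(10.0);
for my $w (sort keys %unigram) {
    printf "%6.4f %s %6.4f\n",
        log($uniprob{$w}) / $log10, $w, log($alpha{$w}) / $log10;
}

Under the same assumptions, the full script would be invoked the way mklm
invokes it, roughly: quick_lm.pl -s toy.sent -o toy.lm, with an optional
-d 0.5 to set the discount mass explicitly.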