speech recognition demo in python

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@5659 d0543943-73ff-0310-b7d9-9358b9ac24b2
2007-08-22 14:23:28 +00:00 · 2007-08-22 14:23:28 +00:00 · b3454792b1
parent 88bbc489d1
commit b3454792b1
3 changed files with 286 additions and 0 deletions
--- a/scripts/py_modules/__init__.py
+++ b/scripts/py_modules/__init__.py
@@ -0,0 +1 @@
+# empty
--- a/scripts/py_modules/speechtools.py
+++ b/scripts/py_modules/speechtools.py
@@ -0,0 +1,198 @@
+from freeswitch import *
+from xml.dom import minidom
+
+# Text-to-speech engine and voice names.  Nothing in this module reads
+# them as committed; recipewizard.py defines its own copies and passes
+# those to session.set_tts_parms().
+VOICE_ENGINE = "cepstral"
+VOICE = "William"
+
+"""
+A few classes that make it easier to write speech applications
+using Python.  It is roughly modelled after the equivalent that
+is written in JavaScript.
+
+Status: should work, but not yet complete.  some pending items
+are mentioned in comments
+"""
+
+class Grammar:
+    """
+    Plain record describing one speech grammar and its score thresholds.
+    """
+
+    def __init__(self, name, path, obj_path,
+                 min_score=1, confirm_score=400, halt=False):
+        """
+        Store the given settings on the instance; nothing is validated.
+
+        @param name - name of the grammar, used to reference it later
+        @param path - path to the xml grammar file
+        @param obj_path - where to find the interpretation in the result
+                          xml, eg, 'interpretation'
+        @param min_score - score threshold to accept a result
+        @param confirm_score - if the score is below this threshold, ask
+                               the user whether the result is correct
+        @param halt - purpose in the js version is unclear; currently
+                      unused
+        """
+        # identity and location of the grammar
+        (self.name, self.path, self.obj_path) = (name, path, obj_path)
+        # score thresholds
+        (self.min_score, self.confirm_score) = (min_score, confirm_score)
+        self.halt = halt
+
+    
+class SpeechDetect:
+    """
+    Holds a session plus a table of named grammars, and runs the
+    "detect_speech" application on the session for the selected grammar.
+    """
+
+    def __init__(self, session, module_name, ip_addr):
+        """
+        @param session - session object; this class only calls its
+                         execute() method
+        @param module_name - first word of the detect_speech argument
+                             string
+        @param ip_addr - last word of the detect_speech argument string
+        """
+        self.session = session
+        self.module_name = module_name
+        self.ip_addr = ip_addr
+        self.grammars = {}
+        # Defined up front so the attribute always exists; it stays None
+        # until setGrammar() selects a grammar.
+        self.grammar = None
+
+    def addGrammar(self, grammar):
+        """
+        Register a grammar under grammar.name, replacing any grammar
+        already stored under that name.
+        """
+        self.grammars[grammar.name] = grammar
+
+    def setGrammar(self, name):
+        """
+        Make the grammar registered as `name` the current one.
+
+        Raises KeyError if no grammar was added under that name.
+        """
+        self.grammar = self.grammars[name]
+
+    def detectSpeech(self):
+        """
+        Execute "detect_speech" on the session for the current grammar.
+
+        Raises AttributeError if no grammar has been selected yet.
+        """
+        if self.grammar is None:
+            # Same exception type as the missing-attribute failure this
+            # guard replaces, but with a message that names the fix.
+            raise AttributeError("no grammar selected; call setGrammar() "
+                                 "before detectSpeech()")
+
+        # TODO: we might not always want to call detect_speech
+        # with this cmd, see js version for other options
+        # also see detect_speech_function() in mod_dptools.c
+        cmd = "%s %s %s %s" % (self.module_name,
+                               self.grammar.name,
+                               self.grammar.path,
+                               self.ip_addr)
+        console_log("debug", "calling detect_speech with: %s\n" % cmd)
+        self.session.execute("detect_speech", cmd)
+        console_log("debug", "finished calling detect_speech\n")
+        
+class SpeechObtainer:
+    """
+    Collects recognized phrases for one grammar by polling the session
+    and handling the speech events delivered to its input callback.
+    """
+
+    def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
+        """
+        @param speech_detect - the speech detect object, which holds a
+                               reference to underlying session and can
+                               be re-used by many SpeechObtainers
+        @param required_phrases - the number of required phrases from the
+                                  grammar.  for example if it is prompting
+                                  for the toppings on a sandwich and min
+                                  toppings is 3, use 3.  normally will be 1.
+        @param wait_time - the time, in milliseconds, to wait for
+                           input during each loop iteration
+        @param max_tries - this number multiplied by wait time gives the
+                           'total wait time' before we give up and return
+                           partial or no result
+        """
+        self.speech_detect = speech_detect
+        self.required_phrases = required_phrases
+        self.wait_time = wait_time
+        self.max_tries = max_tries
+
+        # Phrases recognized so far.  handle_event() appends to this list
+        # and nothing in this class ever clears it.
+        self.detected_phrases = []
+
+    def setGrammar(self, grammar):
+        """
+        Remember the grammar, register it with the speech detect object
+        and select it there as the current grammar.
+
+        @param grammar - instance of grammar class
+        """
+        self.grammar = grammar
+        self.speech_detect.addGrammar(grammar)
+        self.speech_detect.setGrammar(self.grammar.name)
+
+    def detectSpeech(self):
+        """
+        Delegate to the speech detect object's detectSpeech().
+        """
+        self.speech_detect.detectSpeech()
+
+    def run(self):
+        """
+        listen for results from the asr engine and return the phrases
+        collected so far.
+
+        Installs an input callback on the session, then calls
+        collectDigits(wait_time) repeatedly until the session is no
+        longer ready, max_tries iterations have run, or required_phrases
+        phrases have been detected.  This method does not start
+        detection itself; call detectSpeech() first.
+
+        detected_phrases is not cleared here, so phrases left over from
+        an earlier run() still count towards required_phrases.
+
+        @return the detected_phrases list (may be empty or partial)
+        """
+
+        def dtmf_handler(input, itype, funcargs):
+            # itype 1 is routed to handle_event(); itype 0 is only logged
+            # as DTMF input; anything else is logged as unknown.
+            console_log("INFO","\n\nDTMF itype: %s\n" % itype)
+            if itype == 1: # TODO!! use names for comparison instead of number
+                return self.handle_event(input, funcargs)
+            elif itype== 0:
+                console_log("INFO","\n\nDTMF input: %s\n" % input)
+            else:
+                console_log("INFO","\n\nUnknown input type: %s\n" % itype)
+            return None
+
+        num_tries = 0
+
+        session = self.speech_detect.session
+
+        console_log("debug", "setting dtmf callback\n")
+        session.setDTMFCallback(dtmf_handler, "")
+        console_log("debug", "calling getDigits\n")
+
+        console_log("debug", "starting run() while loop\n")
+        while (session.ready() and
+               num_tries < self.max_tries and
+               len(self.detected_phrases) < self.required_phrases):
+            console_log("debug", "top of run() while loop\n")
+            session.collectDigits(self.wait_time)
+            num_tries += 1
+
+        console_log("debug", "while loop finished\n")
+        return self.detected_phrases
+
+    def handle_event(self, event, funcargs):
+        """
+        when the dtmf handler receives an event, it calls back
+        this method.  event is a dictionary with subdictionaries ..
+
+        Example 1
+        =========
+
+        {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}
+
+        Example 2
+        =========
+        {'body': '<result xmlns='http://www.ietf.org/xml/ns/mrcpv2'
+        xmlns:ex='http://www.example.com/example' score='100'
+        grammar='session:request1@form-level.store'><interpretation>
+        <input mode='speech'>waffles</input></interpretation></result>',
+        'headers': {'Speech-Type': 'detected-speech'}}
+
+        This dictionary is constructed in run_dtmf_callback() in
+        freeswitch_python.cpp
+
+        @param event - dictionary shaped like the examples above
+        @param funcargs - accepted but not used by this method
+        @return "stop" for a begin-speaking event and for a
+                detected-speech event that yielded a phrase; None when a
+                detected-speech event has no body, no element named
+                grammar.obj_path, or only empty text
+        @raise Exception for any other Speech-Type value
+        """
+
+        # what kind of event?
+        headers = event['headers']
+        speech_type = headers['Speech-Type']
+        if speech_type == "begin-speaking":
+            # not sure what to do with this, try returning "stop"
+            # so that it might stop playing a sound file once
+            # speech has been detected
+            return "stop"
+        if speech_type != "detected-speech":
+            raise Exception("Unknown speech event: %s" % speech_type)
+
+        # extract the detected phrase from the result
+        # BUG: this assumes only ONE interpretation in the xml
+        # result.  rest will get ignored
+        # NOTE: have to wrap everything with str() (at least
+        # calls to console_log) because otherwise it chokes on
+        # unicode strings.
+        # TODO: check the score
+        body = event['body']
+        if not body:
+            # nothing to parse; treat as "no phrase in this event"
+            return None
+
+        dom = minidom.parseString(body)
+        try:
+            matches = dom.getElementsByTagName(self.grammar.obj_path)
+            if not matches:
+                # result xml has no element with the expected tag name
+                return None
+            phrase_text = self.getText(matches[0])
+        finally:
+            # break the document's internal references once the text
+            # has been copied out
+            dom.unlink()
+
+        if phrase_text:
+            self.detected_phrases.append(str(phrase_text))
+            # do we want to return stop?  what should we return?
+            return "stop"
+        return None
+
+    def getText(self, elt):
+        """ given an element, get its text.  if there is more than
+        one text node child, just append all the text together.
+        only direct children are considered; text inside nested
+        elements is not included.
+        """
+        return "".join([str(child.nodeValue)
+                        for child in elt.childNodes
+                        if child.nodeType == child.TEXT_NODE])
+
--- a/scripts/recipewizard.py
+++ b/scripts/recipewizard.py
@@ -0,0 +1,87 @@
+from freeswitch import *
+from py_modules.speechtools import Grammar, SpeechDetect
+from py_modules.speechtools import SpeechObtainer
+
+import time, os
+
+# Passed together to session.set_tts_parms() in RecipeWizard.__init__.
+VOICE_ENGINE = "cepstral"
+VOICE = "William"
+# Directory joined with "mainmenu.xml" in RecipeWizard.main() to build
+# the path of the grammar file.
+GRAMMAR_ROOT = "/usr/src/freeswitch_trunk/scripts"
+
+"""
+Example speech recognition application in python.  
+
+How to make this work:
+
+* Get mod_openmrcp working along with an MRCP asr server
+* Add /usr/src/freeswitch/scripts or equivalent to your PYTHONPATH
+* Restart freeswitch
+* Create $GRAMMAR_ROOT/mainmenu.xml from contents in mainmenu() comments
+
+"""
+
+class RecipeWizard:
+    """
+    Demo dialog: asks "Drinks or food?" and speaks back the first
+    phrase reported by the recognizer.
+    """
+
+    def __init__(self, session):
+        """
+        Set the tts engine and voice on the session, then run main();
+        the dialog has finished by the time the constructor returns.
+
+        @param session - session object used for set_tts_parms() and
+                         speak(), and handed on to SpeechDetect
+        """
+        self.session = session
+        session.set_tts_parms(VOICE_ENGINE, VOICE)
+        self.main()
+
+    def main(self):
+        """
+        Build the speech objects for the "mainmenu" grammar, start
+        detection, speak the prompt, wait for a result and speak either
+        the first detected phrase or an apology.
+        """
+        session = self.session
+        console_log("debug", "recipe wizard main()\n")
+
+        detector = SpeechDetect(session, "openmrcp", "127.0.0.1")
+        self.speechdetect = detector
+        obtainer = SpeechObtainer(speech_detect=detector,
+                                  required_phrases=1,
+                                  wait_time=5000,
+                                  max_tries=3)
+        self.speechobtainer = obtainer
+
+        grammar_file = os.path.join(GRAMMAR_ROOT, "mainmenu.xml")
+        self.grammar = Grammar("mainmenu", grammar_file, "input", 80, 90)
+        obtainer.setGrammar(self.grammar)
+
+        console_log("debug", "calling speechobtainer.run()\n")
+        # detection is started before the prompt is spoken
+        obtainer.detectSpeech()
+        session.speak("Hello. Welcome to the recipe wizard. Drinks or food?")
+
+        result = obtainer.run()
+        console_log("debug", "speechobtainer.run() result: %s\n" % result)
+        if not result:
+            reply = "Sorry, I did not hear you"
+        else:
+            reply = "Received result.  Result is: %s" % result[0]
+        session.speak(reply)
+
+        console_log("debug", "speechobtainer.run() finished\n")
+
+# Holder for the contents of mainmenu.xml: the function body does
+# nothing, and its docstring is the grammar text to save as
+# $GRAMMAR_ROOT/mainmenu.xml.
+# NOTE(review): the grammar nests <rule id="main"> inside
+# <rule id="root">; confirm against the SRGS spec / your asr server
+# whether a <ruleref> is needed instead.
+def mainmenu():
+    """
+    <!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
+             "http://www.w3.org/TR/speech-grammar/grammar.dtd">
+
+    <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="en"
+      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://www.w3.org/2001/06/grammar
+                      http://www.w3.org/TR/speech-grammar/grammar.xsd"
+      version="1.0" mode="voice" root="root">
+
+
+    <rule id="root" scope="public">
+
+        <rule id="main">
+          <one-of>
+         <item weight="10">drinks</item>
+         <item weight="2">food</item>
+          </one-of>
+        </rule>
+
+    </rule>
+
+    </grammar>
+
+    """
+    pass
+
+def handler(uuid):
+    """
+    Answer the session for `uuid`, run the recipe wizard on it, then
+    hang up.
+
+    @param uuid - value handed to PySession() to obtain the session
+    """
+    call = PySession(uuid)
+    call.answer()
+    # Constructing RecipeWizard runs the whole dialog; the instance is
+    # kept in a local so it stays alive until this function returns.
+    wizard = RecipeWizard(call)
+    # NOTE(review): the meaning of the "1" argument is not visible here
+    call.hangup("1")
+
+