Source code for chat.qa

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# PEP 8 check with Pylint
"""qa

NLU based on Natural Language Processing and Graph Database.
基于自然语言处理与图形数据库的自然语言理解。

Available functions:
- All classes and functions: 所有类和函数
"""
import sqlite3
from collections import deque
from concurrent.futures import ProcessPoolExecutor
from py2neo import Graph, Node, Relationship
from .api import nlu_tuling, get_location_by_ip
from .semantic import synonym_cut, get_tag, similarity, check_swords, get_location
from .mytools import time_me, get_current_time, random_item
from .word2pinyin import pinyin_cut, jaccard_pinyin # Add in 2017-6-23

# 获取导航地点——Development requirements from Mr Tang in 2017-5-11.
def get_navigation_location():
    db = sqlite3.connect("C:/docu/db/contentDB.db")
    cursor = db.execute("SELECT name from goalvoice")
    # 过滤0记录
    names = [row[0] for row in cursor if row[0]]
    return names

[docs]class Robot(): """NLU Robot. 自然语言理解机器人。 Public attributes: - graph: The connection of graph database. 图形数据库连接。 - pattern: The pattern for NLU tool: 'semantic' or 'vec'. 语义标签或词向量模式。 - memory: The context memory of robot. 机器人对话上下文记忆。 """ def __init__(self, password="train"): # 连接图知识库 self.graph = Graph("http://localhost:7474/db/data/", password=password) # 语义模式:'semantic' or 'vec' self.pattern = 'semantic' # 获取导航地点数据库 self.locations = get_navigation_location() # 在线场景标志,默认为False self.is_scene = False # 在线调用百度地图IP定位api,网络异常时返回默认地址:上海市 self.address = get_location_by_ip() # 机器人配置信息 self.gconfig = None # 可用话题列表 self.usertopics = [] # 当前QA话题 self.topic = "" # 当前QA id self.qa_id = get_current_time() # 短期记忆:最近问过的10个问题与10个答案 self.qmemory = deque(maxlen=10) self.amemory = deque(maxlen=10) # 匹配不到时随机回答 TODO:记录回答不上的所有问题, self.do_not_know = [ "这个问题太难了,{robotname}还在学习中", "这个问题{robotname}不会,要么我去问下", "您刚才说的是什么,可以再重复一遍吗", "{robotname}刚才走神了,一不小心没听清", "{robotname}理解的不是很清楚啦,你就换种方式表达呗", "不如我们换个话题吧", "咱们聊点别的吧", "{robotname}正在学习中", "{robotname}正在学习哦", "不好意思请问您可以再说一次吗", "额,这个问题嘛。。。", "{robotname}得好好想一想呢", "请问您说什么", "您问的问题好有深度呀", "{robotname}没有听明白,您能再说一遍吗" ] def __str__(self): return "Hello! I'm {robotname} and I'm {robotage} years old.".format(**self.gconfig) @time_me() def configure(self, info="", userid="userid"): """Configure knowledge base. 配置知识库。 """ assert userid is not "", "The userid can not be empty!" # TO UPGRADE 对传入的userid参数分析,若不合适则报相应消息 2017-6-7 if userid != "A0001": userid = "A0001" print("userid 不是标准A0001,已经更改为A0001") match_string = "MATCH (config:Config) RETURN config.name as name" subgraphs = [item[0] for item in self.graph.run(match_string)] print("所有知识库:", subgraphs) if not info: config = {"databases": []} match_string = "MATCH (user:User)-[r:has]->(config:Config)" + \ "where user.userid='" + userid + \ "' RETURN config.name as name, r.bselected as bselected, r.available as available" for item in self.graph.run(match_string): config["databases"].append(dict(name=item[0], bselected=item[1], available=item[2])) print("可配置信息:", config) return config else: selected_names = info.split() forbidden_names = list(set(subgraphs).difference(set(selected_names))) print("选中知识库:", selected_names) print("禁用知识库:", forbidden_names) # TODO:待合并精简 for name in selected_names: match_string = "MATCH (user:User)-[r:has]->(config:Config) where user.userid='" \ + userid + "' AND config.name='" + name + "' SET r.bselected=1" # print(match_string) self.graph.run(match_string) for name in forbidden_names: match_string = "MATCH (user:User)-[r:has]->(config:Config) where user.userid='" \ + userid + "' AND config.name='" + name + "' SET r.bselected=0" # print(match_string) self.graph.run(match_string) return self.get_usertopics(userid=userid) # @time_me() def get_usertopics(self, userid="userid"): """Get usertopics list. """ usertopics = [] if not userid: userid = "userid" # 从知识库获取用户拥有权限的子知识库列表 match_string = "MATCH (user:User)-[r:has {bselected:1, available:1}]->(config:Config)" + \ "where user.userid='" + userid + "' RETURN config" data = self.graph.run(match_string).data() for item in data: usertopics.extend(item["config"]["topic"].split(",")) print("用户:", userid, "\n已有知识库列表:", usertopics) return usertopics def iformat(self, sentence): """Individualization of robot answer. 个性化机器人回答。 """ return sentence.format(**self.gconfig) # @time_me() def add_to_memory(self, question="question", userid="userid"): """Add user question to memory. 将用户当前对话加入信息记忆。 Args: question: 用户问题。 Defaults to "question". userid: 用户唯一标识。 Defaults to "userid". """ previous_node = self.graph.find_one("Memory", "qa_id", self.qa_id) self.qa_id = get_current_time() node = Node("Memory", question=question, userid=userid, qa_id=self.qa_id) if previous_node: relation = Relationship(previous_node, "next", node) self.graph.create(relation) else: self.graph.create(node) # Development requirements from Mr Tang in 2017-5-11. # 由模糊匹配->全匹配 from Mr Tang in 2017-6-1. def extract_navigation(self, question): """Extract navigation。抽取导航地点。 QA匹配模式:从导航地点列表选取匹配度最高的地点。 Args: question: User question. 用户问题。 """ result = dict(question=question, content=self.iformat(random_item(self.do_not_know)), \ context="", url="", behavior=0, parameter=0) # temp_sim = 0 # sv1 = synonym_cut(question, 'wf') # if not sv1: # return result for location in self.locations: if "去" in question and location in question: print("Original navigation") result["content"] = location result["context"] = "user_navigation" result["behavior"] = int("0x001B", 16) return result # sv2 = synonym_cut(location, 'wf') # if sv2: # temp_sim = similarity(sv1, sv2, 'j') # 匹配加速,不必选取最高相似度,只要达到阈值就终止匹配 # if temp_sim > 0.92: # print("Navigation location: " + location + " Similarity Score: " + str(temp_sim)) # result["content"] = location # result["context"] = "user_navigation" # result["behavior"] = int("0x001B", 16) # return result return result def extract_pinyin(self, question, subgraph): """Extract synonymous QA in NLU database。 QA匹配模式:从图形数据库选取匹配度最高的问答对。 Args: question: User question. 用户问题。 subgraph: Sub graphs corresponding to the current dialogue. 当前对话领域对应的子图。 """ temp_sim = 0 result = dict(question=question, content=self.iformat(random_item(self.do_not_know)), \ context="", url="", behavior=0, parameter=0) sv1 = pinyin_cut(question) print(sv1) for node in subgraph: iquestion = self.iformat(node["name"]) sv2 = pinyin_cut(iquestion) print(" ", sv2) temp_sim = jaccard_pinyin(sv1, sv2) print(temp_sim) # 匹配加速,不必选取最高相似度,只要达到阈值就终止匹配 if temp_sim > 0.75: print("Q: " + iquestion + " Similarity Score: " + str(temp_sim)) result["content"] = self.iformat(random_item(node["content"].split("|"))) result["context"] = node["topic"] if node["url"]: # result["url"] = json.loads(random_item(node["url"].split("|"))) result["url"] = random_item(node["url"].split("|")) if node["behavior"]: result["behavior"] = int(node["behavior"], 16) if node["parameter"]: result["parameter"] = int(node["parameter"]) func = node["api"] if func: exec("result['content'] = " + func + "('" + result["content"] + \ "', " + "question)") return result return result def extract_synonym(self, question, subgraph): """Extract synonymous QA in NLU database。 QA匹配模式:从图形数据库选取匹配度最高的问答对。 Args: question: User question. 用户问题。 subgraph: Sub graphs corresponding to the current dialogue. 当前对话领域对应的子图。 """ temp_sim = 0 result = dict(question=question, content=self.iformat(random_item(self.do_not_know)), \ context="", url="", behavior=0, parameter=0) # semantic: 切分为同义词标签向量,根据标签相似性计算相似度矩阵,由相似性矩阵计算句子相似度 # vec: 切分为词向量,根据word2vec计算相似度矩阵,由相似性矩阵计算句子相似度 if self.pattern == 'semantic': # elif self.pattern == 'vec': sv1 = synonym_cut(question, 'wf') if not sv1: return result for node in subgraph: iquestion = self.iformat(node["name"]) if question == iquestion: print("Similarity Score: Original sentence") result["content"] = self.iformat(random_item(node["content"].split("|"))) result["context"] = node["topic"] if node["url"]: # result["url"] = json.loads(random_item(node["url"].split("|"))) result["url"] = random_item(node["url"].split("|")) if node["behavior"]: result["behavior"] = int(node["behavior"], 16) if node["parameter"]: result["parameter"] = int(node["parameter"]) # 知识实体节点api抽取原始问题中的关键信息,据此本地查询/在线调用第三方api/在线爬取 func = node["api"] if func: exec("result['content'] = " + func + "('" + result["content"] + \ "', " + "question)") return result sv2 = synonym_cut(iquestion, 'wf') if sv2: temp_sim = similarity(sv1, sv2, 'j') # 匹配加速,不必选取最高相似度,只要达到阈值就终止匹配 if temp_sim > 0.92: print("Q: " + iquestion + " Similarity Score: " + str(temp_sim)) result["content"] = self.iformat(random_item(node["content"].split("|"))) result["context"] = node["topic"] if node["url"]: # result["url"] = json.loads(random_item(node["url"].split("|"))) result["url"] = random_item(node["url"].split("|")) if node["behavior"]: result["behavior"] = int(node["behavior"], 16) if node["parameter"]: result["parameter"] = int(node["parameter"]) func = node["api"] if func: exec("result['content'] = " + func + "('" + result["content"] + \ "', " + "question)") return result return result def extract_keysentence(self, question): """Extract keysentence QA in NLU database。 QA匹配模式:从图形数据库选取包含关键句的问答对。 Args: question: User question. 用户问题。 """ result = dict(question=question, content=self.iformat(random_item(self.do_not_know)), \ context="", url="", behavior=0, parameter=0) match_string = "MATCH (n:NluCell) WHERE '" + question + "' CONTAINS n.name RETURN n LIMIT 1" subgraph = self.graph.run(match_string).data() if subgraph: node = list(subgraph)[0]['n'] print("Similarity Score: Key sentence") result["content"] = self.iformat(random_item(node["content"].split("|"))) result["context"] = node["topic"] if node["url"]: # result["url"] = json.loads(random_item(node["url"].split("|"))) result["url"] = random_item(node["url"].split("|")) if node["behavior"]: result["behavior"] = int(node["behavior"], 16) if node["parameter"]: result["parameter"] = int(node["parameter"]) # 知识实体节点api抽取原始问题中的关键信息,据此本地查询/在线调用第三方api/在线爬取 func = node["api"] if func: exec("result['content'] = " + func + "('" + result["content"] + \ "', " + "question)") return result return result @time_me() def search(self, question="question", userid="userid"): """Nlu search. 语义搜索。 Args: question: 用户问题。 Defaults to "question". userid: 用户唯一标识。 Defaults to "userid" Returns: Dict contains answer, current topic, url, behavior and parameter. 返回包含答案,当前话题,资源包,行为指令及对应参数的字典。 """ # 添加到问题记忆 # self.qmemory.append(question) # self.add_to_memory(question, userid) # 本地语义:全图模式 #tag = get_tag(question) #subgraph = self.graph.find("NluCell", "tag", tag) #result = self.extract_synonym(question, subgraph) # 本地语义:场景+全图+用户配置模式 # 多用户根据userid动态获取对应的配置信息 self.gconfig = self.graph.find_one("User", "userid", userid) self.usertopics = self.get_usertopics(userid=userid) # 一、预处理===================================================== # 问题过滤器(添加敏感词过滤 2017-5-25) if check_swords(question): print("问题包含敏感词!") return dict(question=question, content=self.iformat(random_item(self.do_not_know)), \ context="", url="", behavior=0, parameter=0) # 姓氏引起误匹配重定义 if question.startswith("小") and len(question) == 2: question = self.gconfig['robotname'] # 称呼过滤 Add in 2017-7-5 for robotname in ["小民", "小明", "小名", "晓明"]: if question.startswith(robotname) and len(question) >= 4 and "在线" not in question: question = question.lstrip(robotname) if not question: question = self.gconfig['robotname'] # 二、导航======================================================= result = self.extract_navigation(question) if result["context"] == "user_navigation": return result # 三、云端在线场景================================================ result = dict(question=question, content="", context="basic_cmd", url="", \ behavior=int("0x0000", 16), parameter=0) # TODO: 简化为统一模式 # TODO {'behavior': 0, 'content': '理财产品取号', 'context': 'basic_cmd', 'parameter': 0, 'question': '理财产品取号', 'url': ''} if "理财产品" in question and "取号" not in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "理财产品" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "免费wifi" in question or "wifi" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "有没有免费的wifi" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "存款利率" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "存款利率" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "我要取钱" in question or "取钱" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "我要取钱" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "信用卡挂失" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "信用卡挂失" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "开通云闪付" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "开通云闪付" # 重定义为标准问题 self.is_scene = True # 在线场景标志 if "办理粤卡通" in question or "办理粤通卡" in question: result["behavior"] = int("0x1002", 16) # 进入在线场景 result["question"] = "办理粤通卡" # 重定义为标准问题 修正:2017-7-3 self.is_scene = True # 在线场景标志 # 进入在线场景 # start_scene = ["理财产品", "wifi", "存款利率", "取钱", "信用卡挂失", "开通云闪付", "办理粤卡通"] # for item in start_scene: # if item in question: # result["behavior"] = int("0x1002", 16) # 进入在线场景 # result["question"] = "办理粤卡通" # 重定义为标准问题 # self.is_scene = True # 在线场景标志 # 退出在线场景 end_scene = ["退出业务场景", "退出", "返回", "结束", "发挥"] for item in end_scene: if item == question: # if item in question: # 避免多个退出模式冲突 result["behavior"] = int("0x0020", 16) # 场景退出 self.is_scene = False return result previous_step = ["上一步", "上一部", "上一页", "上一个"] next_step = ["下一步", "下一部", "下一页", "下一个"] if self.is_scene: # for item in previous_step: # if item in question: # result["behavior"] = int("0x001D", 16) # 场景上一步 # for item in next_step: # if item in question: # result["behavior"] = int("0x001E", 16) # 场景下一步 if "上一步" in question or "上一部" in question or "上一页" in question or "上一个" in question: result["behavior"] = int("0x001D", 16) # 场景上一步 elif "下一步" in question or "下一部" in question or "下一页" in question or "下一个" in question: result["behavior"] = int("0x001E", 16) # 场景下一步 result["content"] = question return result # 常用命令,交互,业务 # 上下文——重复命令 TODO:确认返回的是正确的指令而不是例如唱歌时的结束语“可以了” if "再来一个" in question: # TODO:从记忆里选取最近的有意义行为作为重复的内容 return self.amemory[-1] # 四、本地标准语义================================================ # 模式1:选取语义得分大于阈值 tag = get_tag(question, self.gconfig) # TODO:添加语义标签和关键词综合匹配的情况 subgraph_all = list(self.graph.find("NluCell", "tag", tag)) # subgraph_scene = [node for node in subgraph_all if node["topic"]==self.topic] # TODO:usergraph_all 包含正常问答和用户自定义问答,可优先匹配用户自定义问答 usergraph_all = [node for node in subgraph_all if node["topic"] in self.usertopics] usergraph_scene = [node for node in usergraph_all if node["topic"] == self.topic] # 查看根据语义标签初步确定的子图 # for node in usergraph_all: # print(node["name"]) # if subgraph_scene: if usergraph_scene: result = self.extract_synonym(question, usergraph_scene) # result = self.extract_pinyin(question, usergraph_scene) if result["context"]: self.topic = result["context"] self.amemory.append(result) # 添加到答案记忆 return result result = self.extract_synonym(question, usergraph_all) # result = self.extract_pinyin(question, usergraph_all) # result = self.extract_synonym(question, subgraph_all) self.topic = result["context"] self.amemory.append(result) # 添加到答案记忆 # 模式2:包含关键句就匹配 if not self.topic: result = self.extract_keysentence(question) if result["context"]: self.topic = result["context"] self.amemory.append(result) # 添加到答案记忆 return result # 五、在线语义==================================================== if not self.topic: # 1.音乐(唱一首xxx的xxx) if "唱一首" in question or "唱首" in question or "我想听" in question: result["behavior"] = int("0x0001", 16) result["content"] = "好的,正在准备哦" # 2.附近有什么好吃的 elif "附近" in question or "好吃的" in question: result["behavior"] = int("0x001C", 16) result["content"] = self.address # 3.nlu_tuling(天气) elif "天气" in question: # 图灵API变更之后 Add in 2017-8-4 location = get_location(question) if not location: # 问句中不包含地址 weather = nlu_tuling(self.address + question) else: # 问句中包含地址 weather = nlu_tuling(question) # 图灵API变更之前 # weather = nlu_tuling(question, loc=self.address) result["behavior"] = int("0x0000", 16) try: # 图灵API变更之前 # temp = weather.split(";")[0].split(",")[1].split() # myweather = temp[0] + temp[2] + temp[3] # 图灵API变更之后 Add in 2017-8-3 temp = weather.split(",") myweather = temp[1] + temp[2] except: myweather = weather result["content"] = myweather result["context"] = "nlu_tuling" # 4.追加记录回答不上的所有问题 else: with open("C:/nlu/bin/do_not_know.txt", "a", encoding="UTF-8") as file: file.write(question + "\n") # 5.nlu_tuling # else: # result["content"] = nlu_tuling(question, loc=self.address) # result["context"] = "nlu_tuling" return result