#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PEP 8 check with Pylint
"""A collection of semantic tools.
Use 'jieba' as the Chinese word segmentation tool. 'set_dictionary' and
'load_userdict' must be called before importing 'jieba.posseg' and
'jieba.analyse'.
Available functions:
- All classes and functions in this module.
"""
import os
import codecs
from string import punctuation
import jieba
dictpath = os.path.split(os.path.realpath(__file__))[0]
# Use os.path.join instead of hard-coded "\\" so the paths also work outside Windows.
jieba.set_dictionary(os.path.join(dictpath, "dict", "jieba", "synonymdict.txt"))
jieba.load_userdict(os.path.join(dictpath, "dict", "jieba", "userdict.txt"))
import jieba.posseg as posseg
import jieba.analyse as analyse
from numpy import mat, zeros, where
# The 'punctuation_all' is the combination of Chinese and English punctuation.
punctuation_zh = " 、,。°?!:;“”’‘~…【】()《》{}×―-·→℃"
punctuation_all = list(punctuation) + list(punctuation_zh)
# Sentence-final tone words to filter out.
tone_words = "。?!的了呢吧吗啊啦呀"
# Sensitive word lexicon. Modified 2017-05-25.
try:
    with codecs.open(os.path.join(dictpath, "dict", "swords.txt"), "r", "UTF-8") as file:
        sensitive_words = set(file.read().split())
except IOError:
    # Fall back to an empty set so 'check_swords' still works without the lexicon.
    sensitive_words = set()
def generate_swords():
    """Deduplicate and sort the raw sensitive words, writing the result to 'swords.txt'."""
    with codecs.open(os.path.join(dictpath, "dict", "sensitive_words.txt"), "r", "UTF-8") as file:
        with codecs.open(os.path.join(dictpath, "dict", "swords.txt"), "w", "UTF-8") as newfile:
            words = sorted(set(file.read().split()))
            newfile.write("\n".join(words))
def check_swords(sentence):
    """Check whether the sentence contains a sensitive word."""
    for word in sensitive_words:
        if word in sentence:
            return True
    return False
    # Alternative via word segmentation and set intersection:
    # words = synonym_cut(sentence, pattern="w")
    # return bool(set(sensitive_words).intersection(words))
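# Illustrative usage; the result depends on the contents of 'swords.txt':
#     check_swords("这句话很干净")  # -> False if no lexicon entry matches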
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in the sentence is not found in the synonym dictionary,
    it is tagged with the default flag of the word segmentation tool.

    Args:
        pattern: 'w' - words, 't' - keywords, 'wf' - words with tags,
            'tf' - keywords with tags.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # Flags shorter than 4 characters are jieba defaults rather than
        # synonym-dictionary tags; in that case take the flag from cutting
        # the word on its own. Modified 2017-04-27.
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
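# Illustrative usage (the exact tags depend on 'synonymdict.txt' and are
# hypothetical here):
#     synonym_cut("今天天气怎么样", pattern="w")   # -> ["今天", "天气", "怎么样"]
#     synonym_cut("今天天气怎么样", pattern="wf")  # -> [("今天", "Ca..."), ...]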
def get_tag(sentence, config):
    """Get the semantic tag of a sentence.

    The sentence is formatted with 'config', reduced to its top keyword,
    and the synonym tag of that keyword is returned; the keyword itself
    is the fallback when no tag is found.
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # tuple list
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
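# Illustrative usage (with an empty config, so no placeholders are filled):
#     get_tag("苹果多少钱", {})  # -> the synonym tag of "苹果", or "苹果" itself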
def sum_cosine(matrix, threshold):
    """Calculate the parameters of the semantic Jaccard model from the
    cosine similarity matrix of the semantic word segmentation.

    Args:
        matrix: Semantic cosine similarity matrix.
        threshold: Minimum score for a pair to count as a semantic match.

    Returns:
        total: The semantic intersection of the two sentence fragments,
            i.e. the sum of the matched scores.
        num_not_match: The number of fragments, taken over the larger of
            the two sets, that did not reach the matching threshold.
        total_dif: The degree of semantic difference between the two sets,
            i.e. the largest score left after matching.
    """
    total = 0
    count = 0
    row = matrix.shape[0]
    col = matrix.shape[1]
    zero_row = zeros([1, col])
    zero_col = zeros([row, 1])
    max_score = matrix.max()
    # Greedily take the best remaining pair, then zero out its row and
    # column so each fragment is matched at most once. Note that 'matrix'
    # is modified in place.
    while max_score > threshold:
        total += max_score
        count += 1
        pos = where(matrix == max_score)
        i = pos[0][0]
        j = pos[1][0]
        matrix[i, :] = zero_row
        matrix[:, j] = zero_col
        max_score = matrix.max()
    num = (row - count) if row > col else (col - count)
    return dict(total=total, num_not_match=num, total_dif=max_score)
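# A worked toy example: for
#     m = mat([[0.95, 0.3], [0.4, 0.9], [0.2, 0.1]])
# sum_cosine(m, 0.8) matches 0.95 and then 0.9, zeroing their rows and
# columns, so it returns total=1.85, num_not_match=3-2=1, total_dif=0.0.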
def jaccard_basic(synonym_vector1, synonym_vector2):
    """Similarity score between two vectors with basic Jaccard.

    The basic Jaccard model scores |A ∩ B| / |A ∪ B|, so the similarity
    of any two vectors falls in [0, 1].
    """
    count_intersection = list(set(synonym_vector1).intersection(set(synonym_vector2)))
    count_union = list(set(synonym_vector1).union(set(synonym_vector2)))
    sim = len(count_intersection) / len(count_union)
    return sim
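# Worked example: jaccard_basic(list("计算机"), list("计算器"))
# -> intersection {"计", "算"}, union {"计", "算", "机", "器"} -> 2/4 = 0.5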
def jaccard(synonym_vector1, synonym_vector2):
    """Similarity score between two vectors with semantic Jaccard.

    The semantic Jaccard model scores each two vectors in [0, 1].
    """
    sv_matrix = []
    sv_rows = []
    # Each word pair is scored in [0, 1] by how many leading characters of
    # their synonym tags agree; when the tag score is low, fall back to the
    # character-level similarity of the words themselves. The matching
    # threshold passed to sum_cosine below is 0.8.
    for word1, tag1 in synonym_vector1:
        for word2, tag2 in synonym_vector2:
            if word1 == word2:
                score = 1.0
            elif tag1 == tag2:
                score = 0.95
            elif tag1[:7] == tag2[:7]:
                score = 0.90
            elif tag1[:6] == tag2[:6]:
                score = 0.86
            elif tag1[:5] == tag2[:5]:
                score = 0.83
            elif tag1[:4] == tag2[:4]:
                score = 0.70
            elif tag1[:3] == tag2[:3]:
                score = 0.60
            elif tag1[:2] == tag2[:2]:
                score = 0.50
            elif tag1[:1] == tag2[:1]:
                score = 0.40
            else:
                score = 0.20
            if score < 0.5:
                jscore = jaccard_basic(list(word1), list(word2))
                if jscore >= 0.5:
                    score = jscore
            sv_rows.append(score)
        sv_matrix.append(sv_rows)
        sv_rows = []
    matrix = mat(sv_matrix)
    result = sum_cosine(matrix, 0.8)
    # result = sum_cosine(matrix, 0.85)  # Distinguishes "电脑" (computer)
    # from "打印机" (printer): requires the first 5 tag characters to agree.
    total = result["total"]
    total_dif = result["total_dif"]
    num = result["num_not_match"]
    sim = total / (total + num * (1 - total_dif))
    return sim
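# Illustrative usage (the score depends on the synonym dictionary):
#     v1 = synonym_cut("今天天气怎么样", "wf")
#     v2 = synonym_cut("今天天气如何", "wf")
#     jaccard(v1, v2)  # -> a float in (0, 1]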
def edit_distance(synonym_vector1, synonym_vector2):
    """Similarity score between two vectors with edit distance.

    Placeholder: the edit distance model is not implemented yet, so this
    always returns 1.
    """
    sim = 1
    print(synonym_vector1, synonym_vector2)
    return sim
def similarity(synonym_vector1, synonym_vector2, pattern='j'):
    """Similarity score between two sentences.

    Args:
        pattern: Similarity computing model. Defaults to 'j' (semantic
            jaccard); 'jb' is basic jaccard and 'e' is edit distance.
    """
    assert synonym_vector1 != [], "synonym_vector1 can not be empty"
    assert synonym_vector2 != [], "synonym_vector2 can not be empty"
    if synonym_vector1 == synonym_vector2:
        return 1.0
    if pattern == 'jb':
        sim = jaccard_basic(synonym_vector1, synonym_vector2)
    elif pattern == 'j':
        sim = jaccard(synonym_vector1, synonym_vector2)
    elif pattern == 'e':
        sim = edit_distance(synonym_vector1, synonym_vector2)
    else:
        raise ValueError("Unknown pattern: " + pattern)
    return sim
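# Illustrative usage:
#     similarity(synonym_cut("打开电脑", "wf"), synonym_cut("打开打印机", "wf"))
#     # -> a semantic jaccard score in (0, 1]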
def get_location(sentence):
    """Get the locations mentioned in a sentence."""
    location = []
    sv_sentence = synonym_cut(sentence, 'wf')
    for word, tag in sv_sentence:
        # 'Di02', 'Di03' and 'Cb25A11#' are the synonym-dictionary tags
        # used here to mark location words.
        if tag.startswith("Di02") or tag.startswith("Di03") or tag == "Cb25A11#":
            location.append(word)
    return location
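# Illustrative usage (the output depends on the dictionary tags; hypothetical):
#     get_location("从北京到上海的机票")  # -> ["北京", "上海"]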
def get_musicinfo(sentence):
    """Get music info from a sentence of the form "唱一首<singer>的<song>"."""
    # Remove the literal prefix "唱一首"; str.lstrip("唱一首") would strip
    # any of those three characters rather than the prefix as a whole.
    if sentence.startswith("唱一首"):
        sentence = sentence[len("唱一首"):]
    words = sentence.split("的")
    singer = words[0]
    song = words[1]
    return (singer, song)
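# A minimal smoke test, runnable only when the jieba dictionaries under
# 'dict/' are present; the sentences are arbitrary examples.
if __name__ == "__main__":
    VEC1 = synonym_cut("今天天气怎么样", "wf")
    VEC2 = synonym_cut("今天天气如何", "wf")
    print(similarity(VEC1, VEC2, pattern='j'))
    print(check_swords("这是一句普通的话"))
    print(get_musicinfo("唱一首周杰伦的晴天"))  # -> ("周杰伦", "晴天")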