- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

基于简爱语料库实现单词预测（n-gram）

今天吃什么发表于 2022/10/06 22:51:13 2022/10/06

【摘要】目标是给定两个单词，预测第三个单词。如果给定的两个单词的组合不在语料库中，理论上就无法给出合理的预测，因此我在输入部分做了判断：当输入的单词组合不存在时，程序会提示无法查询。这类问题的根源在于语料库不够丰富，因此还是比较容易解决的。

import re
import numpy as np

# Matches runs of the characters ( ) . ! , ; : ? - " '
# A raw string is used so that the regex escape "\-" is not parsed as an
# (invalid) Python string escape. The backslash only escapes the hyphen inside
# the character class; a literal backslash in the input is NOT removed.
_PUNCTUATION_RE = re.compile(r'''[().!,;:?\-"']+''')


def removePunctuation(sentence_str):
    """Strip punctuation from a sentence and lower-case it.

    Args:
        sentence_str (str): The raw sentence.

    Returns:
        str: ``sentence_str`` with every occurrence of the characters
        ``( ) . ! , ; : ? - " '`` removed, converted to lower case.
        Whitespace is left untouched.
    """
    return _PUNCTUATION_RE.sub('', sentence_str).lower()


def get_sentence(filename):
    """Load a corpus file and return its sentences with boundary markers.

    Every line is stripped, reduced to the text after its last '|' (the whole
    line when it contains no '|'), and cleaned with removePunctuation. Lines
    that are empty after cleaning are skipped; each remaining one is wrapped
    as '<s> ... </s>'.

    Args:
        filename (str): Path of the corpus file; it is read as UTF-8.

    Returns:
        list: 1-D list of the pre-processed sentences, in file order.
    """
    with open(filename, encoding='utf-8') as corpus:
        cleaned_lines = (
            removePunctuation(raw_line.strip().rpartition('|')[2])
            for raw_line in corpus
        )
        # The generator is consumed here, while the file is still open.
        return ['<s> ' + text + ' </s>' for text in cleaned_lines if len(text) != 0]


def count_one_word(sentences_list):
    """Count how often each whitespace-separated token occurs.

    Args:
        sentences_list (list): Pre-processed sentences (strings).

    Returns:
        dict: Maps each token (str) to its number of occurrences (int),
        e.g. {'the': 1234}. Keys appear in order of first occurrence.
    """
    tally = {}
    for sentence in sentences_list:
        for token in sentence.split():
            tally[token] = tally.get(token, 0) + 1
    return tally



def count_two_word(sentences_list):
    """Count how often each pair of adjacent tokens occurs.

    Pairs never span two sentences: each sentence is split on whitespace and
    only neighbours within that sentence are counted.

    Args:
        sentences_list (list): Pre-processed sentences (strings).

    Returns:
        dict: Maps "first second" (the two tokens joined by one space) to the
        number of occurrences (int), e.g. {'of the': 12}. Keys appear in order
        of first occurrence.
    """
    pair_tally = {}
    for sentence in sentences_list:
        tokens = sentence.split()
        for left, right in zip(tokens, tokens[1:]):
            pair = left + " " + right
            pair_tally[pair] = pair_tally.get(pair, 0) + 1
    return pair_tally

def word2idx(wordcount_dict):
    """Build the key-to-index mapping of a count dictionary and its inverse.

    Matrices can only be addressed by integer position, so every key of
    ``wordcount_dict`` is assigned its position in the dictionary's iteration
    order.

    Args:
        wordcount_dict (dict): Keys are str (a word or a word pair); the
            values are not used here.

    Returns:
        tuple: ``(word2idx_dict, idx2word_dict)`` where ``word2idx_dict`` maps
        key -> index (e.g. {'the': 0}) and ``idx2word_dict`` maps
        index -> key (e.g. {0: 'the'}).
    """
    idx2word_dict = dict(enumerate(wordcount_dict))
    word2idx_dict = {word: idx for idx, word in idx2word_dict.items()}
    return word2idx_dict, idx2word_dict

'''
接下来要做的就是统计三个单词同时出现的次数。基本做法就是遍历每个句子，同时遍历句子中的每个单词。
记前两个词组合串为wi，当前词为wj，通过 word2idx_dict 查得wi对应的索引为i，wj对应的索引为j，则矩阵中(i,j)位置的值就加 1。
最后我还设置了一个可选项，如果用户想要使用加一平滑，那一开始就生成一个全 1 的矩阵；如果不用平滑，一开始生成的是全 0 的矩阵
'''
def c_table(one_word2idx_dict, two_word2idx_dict, sentences_list, smooth=False):
    """Build the frequency matrix of (two-word prefix, next word) occurrences.

    Args:
        one_word2idx_dict (dict): Maps a single word to its column index.
        two_word2idx_dict (dict): Maps a "first second" word pair to its row
            index.
        sentences_list (list): Pre-processed sentences (strings).
        smooth (bool): When true, every cell starts at 1 (add-one smoothing);
            otherwise every cell starts at 0.

    Returns:
        numpy.ndarray: 2-D float array of shape
        (len(two_word2idx_dict), len(one_word2idx_dict)); cell [r, c] holds
        the starting value plus the number of times word c directly follows
        word pair r.

    Raises:
        KeyError: If a word or word pair in a sentence is missing from the
            corresponding index dictionary.
    """
    shape = (len(two_word2idx_dict), len(one_word2idx_dict))
    counts = np.ones(shape) if smooth else np.zeros(shape)
    for sentence in sentences_list:
        tokens = sentence.split()
        # Slide a window of three consecutive tokens over the sentence.
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            col = one_word2idx_dict[third]
            row = two_word2idx_dict[first + " " + second]
            counts[row, col] += 1
    return counts

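'''
Finally, turn the frequency matrix into add-one-smoothed probabilities: every
row of c_table_np is divided by (the number of occurrences of that row's
two-word prefix + the vocabulary size).
The code below relies on NumPy broadcasting, which is faster than an explicit loop.'''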
def compute_bigram_table(c_table_np, one_wordcount_dict, two_wordcount_dict=None):
    """Convert the frequency matrix into add-one-smoothed probabilities.

    Despite the name, each row corresponds to a two-word prefix and each
    column to the word that follows it. Row r is divided by
    (count of prefix r + vocabulary size), which is the add-one-smoothing
    denominator, so ``c_table_np`` is expected to hold smoothed counts.

    Args:
        c_table_np (numpy.ndarray): 2-D frequency matrix with one row per
            entry of ``two_wordcount_dict``.
        one_wordcount_dict (dict): Single-word counts; only its size (the
            vocabulary size) is used.
        two_wordcount_dict (dict, optional): Word-pair counts, iterated in
            the same order as the rows of ``c_table_np``. When omitted, the
            module-level variable of the same name is used, which keeps
            existing two-argument callers working.

    Returns:
        numpy.ndarray: 2-D probability matrix with the shape of ``c_table_np``.

    Raises:
        NameError: If ``two_wordcount_dict`` is not passed and no module-level
            variable of that name exists.
    """
    if two_wordcount_dict is None:
        # Backward compatibility: the original implementation read this
        # dictionary from module scope instead of taking it as an argument.
        try:
            two_wordcount_dict = globals()['two_wordcount_dict']
        except KeyError:
            raise NameError("name 'two_wordcount_dict' is not defined") from None
    vocab_size = len(one_wordcount_dict)
    prefix_counts = np.array(list(two_wordcount_dict.values()))
    # prefix_counts[:, None] is a column vector, so it broadcasts across the
    # columns of every row.
    return c_table_np / (prefix_counts[:, None] + vocab_size)



# def compute_sentence_bigram(bigram_table_np,one_wordcount_dict,one_word2idx_dict,two_word2idx_dict,sentences_list):#计算测试集的句子概率
#     '''计算每个句子的bigram概率
#     Args:
#         bigram_table_np (numpy): bigram概率矩阵
#         word2idx_dict (dict): 单词到索引的映射
#         sentences_list (list): 预处理后的句子，也就是加上了开头和结尾
#
#     Returns:
#         scores_list (list): 所有句子的bigram概率
#     '''
#     scores_list = []
#     for sentence_str in sentences_list:
#         words_list = sentence_str.split()#还是把单词组成列表
#         score = 1
#         for i in range(2, len(words_list)):
#             if (words_list[i - 2]+" "+words_list[i - 1] in two_word2idx_dict) and (words_list[i] in one_word2idx_dict):#如果前后缀都在语料库里，直接调用矩阵即可
#                 w_i = one_word2idx_dict[words_list[i]]  # 获取后缀的索引
#                 w_j = two_word2idx_dict[words_list[i - 2]+" "+words_list[i - 1]]  # 获取前缀的索引
#                 score *= bigram_table_np[w_j][w_i]#利用索引从频率矩阵中得到这个3-gram的概率
#                 print(bigram_table_np[w_j][w_i])
#             if words_list[i - 2]+" "+words_list[i - 1] not in two_word2idx_dict:#前缀（两个单词）不在预料库里的话，就算后缀的单词可能存在，但是三个词结合起来是肯定不在的，我们要手动进行加一平滑，分子是1，分母为语料库的词汇量
#                 below=len(np.array(list(one_wordcount_dict.keys())) )
#                 score *=1/below
#                 print(1/below)
#             if (words_list[i] not in one_word2idx_dict) and (words_list[i - 2]+" "+words_list[i - 1]) in two_word2idx_dict:#前缀在，后缀不在，分子怎么样都是1，分母在语料库词汇量基础上需要加上前缀（两个单词）出现的频次
#                 below = len(np.array(list(one_wordcount_dict.keys())))+two_wordcount_dict[words_list[i - 2]+" "+words_list[i - 1]]
#                 score*=1/below
#                 print(1/below)
#             # print("w_i,w_j",words_list[i],"  ",words_list[i - 2]+" "+words_list[i - 1])
#         scores_list.append(score)
#     return scores_list


if __name__ == '__main__':
    # Pre-process the Jane corpus. A comma, question mark, full stop,
    # exclamation mark or semicolon is treated as the end of a sentence, so
    # each of them is turned into a line break and every resulting line is
    # later read as one sentence.
    #
    # NOTE: this logic deliberately stays at module level rather than inside a
    # main() function: compute_bigram_table may read two_wordcount_dict from
    # module scope.
    with open('Jane.txt', 'r', encoding="utf-8") as infopen:
        db = infopen.read()
    # Applied in order: abbreviations lose their dot first so that it is not
    # mistaken for a sentence end; apostrophes are dropped.
    replacements = (
        ('Mrs.', 'Mrs'),
        ('Miss.', 'Miss'),
        ('\'', ''),
        (',', '\n'),
        ('?', '\n'),
        (';', '\n'),
        ('!', '\n'),
        ('.', '\n'),
    )
    for old, new in replacements:
        db = db.replace(old, new)
    with open('Jane-1.txt', 'w', encoding="utf-8") as outfopen:
        outfopen.write(db)

    # Copy Jane-1.txt to Jane-2.txt without the blank lines.
    with open('Jane-1.txt', 'r', encoding='utf-8') as fr, \
            open('Jane-2.txt', 'w', encoding='utf-8') as fd:
        for text in fr:
            if text.split():
                fd.write(text)
        print('语料预处理成功！')

    sentences_list = get_sentence('Jane-2.txt')  # the corpus
    one_wordcount_dict = count_one_word(sentences_list)  # single-word counts
    two_wordcount_dict = count_two_word(sentences_list)  # word-pair counts

    one_word2idx_dict, one_idx2word_dict = word2idx(one_wordcount_dict)
    two_word2idx_dict, two_idx2word_dict = word2idx(two_wordcount_dict)

    # Add-one-smoothed frequency matrix and the matching probability matrix.
    c_table_np = c_table(one_word2idx_dict, two_word2idx_dict, sentences_list, True)
    bigram_table_np = compute_bigram_table(c_table_np, one_wordcount_dict)
    print("使用加一平滑技术预测！\n")
    while True:
        print("请输入两个单词（用空格隔开，输入\"* *\"退出查询）：")
        tokens = input().split()
        if len(tokens) != 2:
            # Anything other than exactly two words used to crash the
            # unpacking with ValueError; ask again instead.
            print("输入格式有误，请输入恰好两个用空格隔开的单词！")
            print('\n')
            continue
        str1, str2 = tokens
        if str1 == '*' and str2 == '*':
            print("退出成功！")
            break
        string = (str1 + " " + str2).lower()
        if string in two_word2idx_dict:
            row = two_word2idx_dict[string]
            freq_row = c_table_np[row]  # smoothed counts for this prefix
            ascending = np.argsort(freq_row)
            # Walk the sorted indices from the back: the five most frequent
            # continuations (fewer if the vocabulary is smaller than five).
            for rank, col in enumerate(ascending[::-1][:5], start=1):
                # Subtract the 1 added by smoothing to show the raw count.
                print("预测结果Top" + str(rank) + " :%-12s" % one_idx2word_dict[col],
                      " 概率：%.7f" % bigram_table_np[row][col],
                      " 组合在语料中出现次数：%s" % int(freq_row[col] - 1))
        else:
            print("很遗憾，您输入的单词组合并不包含在语料库内，我们无法给您提供具有建设性的单词预测")
        print('\n')

Jane.txt

278.93KB 下载次数：0次

附件下载

Jane.txt 278.93KB 下载次数：0次

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

基于简爱语料库实现单词预测（n-gram）

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

基于简爱语料库实现单词预测（n-gram）

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

推荐阅读

相关产品