NLP(12)--语言模型

前言

仅记录学习过程，有问题欢迎讨论

应用-语音识别：

声音的本质是一种波，将波按时间段切分为很多帧，之后进行声学特征提取，每一帧转化为向量。由这些向量解码出若干候选文本后，语言模型负责从中选出成句概率最高的一个。

应用-输入法：

按照语言模型给出概率最高的候选词

语言模型的分类：

统计语言模型（SLM） S = Statistical: ngram语言模型等(成句概率)
- 马尔可夫假设：假设第n个词出现的概率，仅受其前面 “有限” 个词的影响
- 平滑（折扣）问题：遇见没见过的句子，概率也不应该为0
- 解决方案–回退：当三元组abc未出现时，找bc二元组的概率（* 概率值 0.4）
- PPL（评价指标）：困惑度与成句概率成反比；它是一个相对值，只适合在同一段文本上比较不同的模型
神经语言模型（NLM） N = Neural: rnn语言模型等
- 语言模型的核心能力是计算成句概率（打标点符号，数字归一化，文本纠错）
- 效果通常比统计语言模型好，但速度比统计语言模型慢
预训练语言模型（PLM） P = Pre-train: Bert、GPT等
大语言模型（LLM） L = Large: ChatGPT等

代码

n-gram 统计型demo

import math
from collections import defaultdict


class NgramLanguageModel:
    """Count-based n-gram language model with fixed-weight back-off.

    N-grams of every order from 1 to ``n`` are counted over the corpus and
    turned into conditional probabilities ``count(ngram) / count(prefix)``.
    An unseen n-gram backs off to the (n-1)-gram obtained by dropping its
    first token, scaled by ``fix_backoff_prob``; an unseen unigram gets
    ``unk_prob``.

    Attributes:
        ngram_count_dict: ``{order: {ngram: count}}``; key ``0`` holds the
            total number of unigram tokens (including ``<sos>``/``<eos>``).
        ngram_count_prob_dict: ``{order: {ngram: probability}}``.
    """

    def __init__(self, corpus=None, n=3):
        """Build the model.

        Args:
            corpus: Iterable of sentences (strings). ``None`` is treated as
                an empty corpus.
            n: Highest n-gram order to count.
        """
        self.n = n
        # Joins tokens into a dictionary key; it carries no meaning but must
        # not occur inside any token, otherwise keys are split incorrectly.
        self.sep = "_"
        self.sos = "<sos>"  # start-of-sentence marker
        self.eos = "<eos>"  # end-of-sentence marker
        self.unk_prob = 1e-5  # small probability for out-of-vocabulary words
        self.fix_backoff_prob = 0.4  # fixed penalty applied per back-off step
        self.ngram_count_dict = dict((x + 1, defaultdict(int)) for x in range(n))
        self.ngram_count_prob_dict = dict((x + 1, defaultdict(int)) for x in range(n))
        self.ngram_count(corpus)
        self.calc_ngram_prob()

    def sentence_segment(self, sentence):
        """Split a sentence into tokens on whitespace."""
        return sentence.split()

    def ngram_count(self, corpus):
        """Count all n-grams of order 1..n in ``corpus``.

        Args:
            corpus: Iterable of sentences, or ``None`` for no data.
        """
        if corpus is None:
            corpus = []
        for sentence in corpus:
            # Pad with the start and end markers before sliding the window.
            word_lists = [self.sos] + self.sentence_segment(sentence) + [self.eos]
            for window_size in range(1, self.n + 1):
                # Only full-length windows are counted; the range is empty
                # when the padded sentence is shorter than the window.
                for index in range(len(word_lists) - window_size + 1):
                    ngram = self.sep.join(word_lists[index:index + window_size])
                    self.ngram_count_dict[window_size][ngram] += 1
        # Total token count, the denominator of unigram probabilities.
        self.ngram_count_dict[0] = sum(self.ngram_count_dict[1].values())
        return

    def calc_ngram_prob(self):
        """Convert the n-gram counts into conditional probabilities."""
        for window_size in range(1, self.n + 1):
            for ngram, count in self.ngram_count_dict[window_size].items():
                if window_size > 1:
                    # For "a_b_c" the prefix is "a_b": P(c | a b) = C(a b c) / C(a b).
                    ngram_prefix = ngram.rsplit(self.sep, 1)[0]
                    ngram_prefix_count = self.ngram_count_dict[window_size - 1][ngram_prefix]
                else:
                    ngram_prefix_count = self.ngram_count_dict[0]
                self.ngram_count_prob_dict[window_size][ngram] = count / ngram_prefix_count
        return

    def get_ngram_prob(self, ngram):
        """Return the probability of ``ngram`` using fixed-weight back-off.

        Args:
            ngram: Tokens joined by ``self.sep``.

        Returns:
            The stored probability when the n-gram was seen; otherwise
            ``fix_backoff_prob`` times the probability of the n-gram without
            its first token, bottoming out at ``unk_prob`` for an unseen
            unigram.
        """
        n = len(ngram.split(self.sep))
        # .get() lets an order above self.n back off instead of raising KeyError.
        known_probs = self.ngram_count_prob_dict.get(n)
        if known_probs is not None and ngram in known_probs:
            return known_probs[ngram]
        if n == 1:
            # Unseen unigram: out-of-vocabulary word, nothing left to back off to.
            return self.unk_prob
        shorter_ngram = ngram.split(self.sep, 1)[1]
        return self.fix_backoff_prob * self.get_ngram_prob(shorter_ngram)

    def calc_sentence_ppl(self, sentence):
        """Return the perplexity of ``sentence`` under the model.

        Perplexity is ``2 ** (-(1/N) * sum(log2 p_i))`` over the N tokens of
        the padded sentence. The logarithm base matches the base of the
        exponent so the result is the inverse geometric mean of the token
        probabilities.
        """
        word_list = [self.sos] + self.sentence_segment(sentence) + [self.eos]
        log_prob_sum = 0.0
        for index in range(len(word_list)):
            # Context is at most the n-1 preceding tokens.
            ngram = self.sep.join(word_list[max(0, index - self.n + 1):index + 1])
            log_prob_sum += math.log2(self.get_ngram_prob(ngram))
        return 2 ** (-log_prob_sum / len(word_list))


if __name__ == "__main__":
    # Load the corpus line by line; the context manager closes the file
    # handle as soon as the lines have been read.
    with open("sample.txt", encoding="utf8") as corpus_file:
        corpus = corpus_file.readlines()
    lm = NgramLanguageModel(corpus, 3)
    print("词总数:", lm.ngram_count_dict[0])
    print(lm.ngram_count_prob_dict)
    print(lm.calc_sentence_ppl("c d b d b"))

rnn demo 预测句子的分类
（需要语料可留言）

import torch
import torch.nn as nn
import math
import os
import random
import torch.utils.data as data_util
import numpy as np

"""
import torch
import torch.nn as nn
import math
import os
import random
import torch.utils.data as data_util
import numpy as np

"""

week 6--语言模型
通俗来说就是人话

统计语言模型----=N-gram
成句概率-》 词w1--wn 按顺序出现的概率
马尔可夫假设：
    假设第n个词出现的概率，仅受其前面  “有限”  个词的影响
    P(今天天气不错) = P(今)*P(天|今) *P(天|今天) *P(气|天天) *P(不|天气) *P(错|气不)

平滑（折扣）问题
遇见没见过的句子，概率也不应该为0
解决方案：
    回退：当三元组abc未出现时，找bc二元组的概率（* 概率值 0.4）
    若是P(word) 都不存在：  加1平滑（count数+1）/// 低频词替换为【unk】 都当做<unk>处理
    插值：计算高阶的概率时，同时考虑低阶的概率 P'(wn | wn-2 wn-1) = a*P(wn | wn-2 wn-1) + b*P(wn | wn-1) + c*P(wn)，其中 a + b + c = 1

PPL : 困惑度 和 成句概率成反比----相对值

"""


# 实现一个判断文本是否该领域的模型 根据ppl大小
class LanguageModel(nn.Module):
    """RNN language model that scores the token following an input window.

    With a target ``y`` the forward pass returns the cross-entropy loss;
    without one it returns a softmax distribution over the vocabulary.
    """

    def __init__(self, input_dim, vocab):
        """Create the layers.

        Args:
            input_dim: Embedding size, also used as the RNN hidden size.
            vocab: Token-to-index mapping; the layers are sized
                ``len(vocab) + 1``.
        """
        super().__init__()
        vocab_size = len(vocab) + 1
        self.emb = nn.Embedding(vocab_size, input_dim)
        self.rnn = nn.RNN(input_dim, input_dim, batch_first=True)
        # One output score per vocabulary entry: any token may come next.
        self.linear = nn.Linear(input_dim, vocab_size)
        self.drop = nn.Dropout(0.1)
        self.loss = nn.functional.cross_entropy

    def forward(self, x, y=None):
        """Return the loss when ``y`` is given, otherwise next-token probabilities."""
        embedded = self.emb(x)  # (batch_size, sen_len, input_dim)
        hidden_states, _ = self.rnn(embedded)  # (batch_size, sen_len, input_dim)
        # Only the state at the last position is used for the prediction.
        last_state = self.drop(hidden_states[:, -1, :])  # (batch_size, input_dim)
        logits = self.linear(last_state)
        if y is None:
            # Inference: normalise the scores into a probability distribution.
            return torch.softmax(logits, dim=-1)
        return self.loss(logits, y)


def build_vocab(vocab_path):
    """Load a vocabulary file into a ``{token: index}`` dictionary.

    Each line of the file is one token. Indices start at 1 so that index 0
    stays free for the pad token.

    Args:
        vocab_path: Path to a UTF-8 text file with one token per line.

    Returns:
        dict mapping each token to its 1-based line number, plus ``"\\n"``
        mapped to index 1.
    """
    vocab = {}
    with open(vocab_path, encoding="utf8") as f:
        for index, line in enumerate(f):
            # Strip only the line terminator: slicing off the last character
            # would truncate the final token when the file has no trailing
            # newline.
            char = line.rstrip("\n")
            vocab[char] = index + 1  # index 0 is left for the pad token
    # The newline character shares index 1 with the first vocabulary line.
    vocab["\n"] = 1
    return vocab


def build_simple(corpus, window_size, vocab):
    """Draw one random training sample from ``corpus``.

    A window of ``window_size`` characters is taken at a random position;
    the character right after the window is the prediction target.

    Args:
        corpus: Text to sample from; must be longer than ``window_size``.
        window_size: Number of input characters per sample.
        vocab: Character-to-index mapping that contains ``"<UNK>"``.

    Returns:
        ``(x, y)`` where ``x`` is the list of window indices and ``y`` is the
        target index.

    Raises:
        ValueError: From ``random.randint`` when the corpus is not longer
            than ``window_size``.
        KeyError: When ``vocab`` has no ``"<UNK>"`` entry.
    """
    start = random.randint(0, len(corpus) - 1 - window_size)
    end = start + window_size
    window = corpus[start:end]
    target = corpus[end]  # character immediately after the window
    unk_index = vocab["<UNK>"]
    x = [vocab.get(char, unk_index) for char in window]
    # Out-of-vocabulary targets fall back to <UNK>, the same way window
    # characters do, instead of raising KeyError.
    y = vocab.get(target, unk_index)
    return x, y


def build_dataset(simple_size, corpus, window_size, vocab):
    """Build ``simple_size`` random samples and pack them into tensors.

    Returns:
        A pair of ``torch.LongTensor``: the input windows and their targets.
    """
    samples = [build_simple(corpus, window_size, vocab) for _ in range(simple_size)]
    inputs = [sample_x for sample_x, _ in samples]
    targets = [sample_y for _, sample_y in samples]
    return torch.LongTensor(inputs), torch.LongTensor(targets)


# 读取文件
def load_corpus(corpus_path):
    """Return the whole content of a UTF-8 text file as one string.

    The file is opened in a context manager so the handle is closed as soon
    as the text has been read.
    """
    with open(corpus_path, encoding="utf8") as corpus_file:
        return corpus_file.read()


def train(corpus_path, save_weight=True):
    """Train a language model on one corpus file and optionally save it.

    Args:
        corpus_path: Path of the text corpus to train on.
        save_weight: When true, the state dict is written to the model
            directory under the corpus file's base name with a ``.pth``
            extension.
    """
    epoch_num = 10  # number of training epochs
    batch_size = 128  # samples per batch
    train_sample = 10000  # samples drawn for training
    char_dim = 128  # embedding size of each character
    window_size = 6  # length of each input window
    vocab = build_vocab("D:\\NLP\\test\\week6\\vocab.txt")
    corpus = load_corpus(corpus_path)
    model = LanguageModel(char_dim, vocab)
    x, y = build_dataset(train_sample, corpus, window_size, vocab)
    dataset = data_util.TensorDataset(x, y)
    dataiter = data_util.DataLoader(dataset, batch_size)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(epoch_num):
        model.train()
        epoch_loss = []
        for batch_x, batch_y in dataiter:
            # The batches must live on the same device as the model.
            if use_cuda:
                batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
            loss = model(batch_x, batch_y)
            loss.backward()  # compute gradients for every parameter
            optim.step()  # update the parameters
            optim.zero_grad()  # clear gradients before the next batch
            # Store a plain float so the mean works on any device and no
            # tensors are kept alive.
            epoch_loss.append(loss.item())
        print("=========\n第%d轮平均loss:%f" % (epoch + 1, np.mean(epoch_loss)))
    if not save_weight:
        return
    # Swap only the extension; a plain "txt" -> "pth" replace would also
    # rewrite a "txt" that appears elsewhere in the file name.
    base_name = os.path.splitext(os.path.basename(corpus_path))[0] + ".pth"
    model_path = os.path.join("D:\\NLP\\test\\week6\\model", base_name)
    torch.save(model.state_dict(), model_path)
    return


def train_all():
    """Train one language model per file in the corpus directory."""
    # List and join against the same directory; listing a relative path
    # while joining onto an absolute one only works from one particular
    # working directory.
    corpus_dir = "D:\\NLP\\test\\week6\\corpus"
    for path in os.listdir(corpus_dir):
        corpus_path = os.path.join(corpus_dir, path)
        print(corpus_path)
        train(corpus_path)


# def cal_ppl(sentence, model, vocab, window_size):
#     prob = 0
#     model.eval()
#     with torch.no_grad():
#         for i in range(1, len(sentence)):
#             start = max(0, i - window_size)
#             window = sentence[start:i]
#             x = [vocab.get(char, vocab["<UNK>"]) for char in window]
#             x = torch.LongTensor([x])
#             target = sentence[i]
#             target_index = vocab.get(target, vocab["<UNK>"])
#             if torch.cuda.is_available():
#                 x = x.cuda()
#             pred_prob_distribute = model(x)[0]
#             target_prob = pred_prob_distribute[target_index]
#             prob += math.log(target_prob, 10)
#     return 2 ** (prob * (-1 / len(sentence)))

# 计算文本ppl （rnn 无需回退 因为输出的softmax自带平滑）
def cal_ppl(sentence, model):
    """Return the perplexity of ``sentence`` under ``model``.

    Every character after the first is predicted from the up to
    ``model.window_size`` characters before it. The log2 probabilities of
    the true characters are summed and the result is
    ``2 ** (-sum / len(sentence))``. The logarithm base matches the base of
    the exponent.

    Args:
        sentence: Non-empty string to score.
        model: Callable returning a batch of probability distributions, with
            ``window_size`` and ``vocab`` attributes (``vocab`` must contain
            ``"<UNK>"``).

    Returns:
        The perplexity as a float; lower means the sentence fits the model
        better.

    Raises:
        ValueError: If ``sentence`` is empty.
    """
    if len(sentence) == 0:
        raise ValueError("sentence must not be empty")
    unk_index = model.vocab["<UNK>"]
    log_prob = 0.0
    with torch.no_grad():
        for i in range(1, len(sentence)):
            start = max(0, i - model.window_size)
            window = sentence[start:i]
            x = torch.LongTensor([[model.vocab.get(char, unk_index) for char in window]])
            # Index of the character that actually follows the window.
            target_index = model.vocab.get(sentence[i], unk_index)
            if torch.cuda.is_available():
                x = x.cuda()
            pred_prob_distribute = model(x)[0]
            # Probability the model assigns to the true next character.
            target_prob = float(pred_prob_distribute[target_index])
            log_prob += math.log2(target_prob)
    return 2 ** (-log_prob / len(sentence))
# if __name__ == '__main__':
#     train_all()


# ============================== implement =============================
def load_trained_language_model(path):
    """Rebuild a ``LanguageModel`` and load trained weights from ``path``.

    The hyper-parameters must match the ones used for training. The returned
    model is in eval mode and carries ``window_size`` and ``vocab``
    attributes.

    Args:
        path: Path of a state dict file written by ``torch.save``.
    """
    char_dim = 128  # embedding size, same as in training
    window_size = 6  # input window length, same as in training
    vocab = build_vocab("D:\\NLP\\test\\week6\\vocab.txt")
    model = LanguageModel(char_dim, vocab)
    # Map the tensors to the CPU first so weights saved from a GPU run still
    # load on a machine without CUDA; the model is moved to the GPU below
    # when one is available.
    model.load_state_dict(torch.load(path, map_location="cpu"))
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()
    model.window_size = window_size
    model.vocab = vocab
    return model


# 加载训练好的所有模型
def load_models():
    """Load every trained model found in the model directory.

    Returns:
        dict mapping a class name (the file name without ``.pth``) to its
        loaded language model.
    """
    model_dir = "D:\\NLP\\test\\week6\\model"
    class_to_model = {}
    for file_name in os.listdir(model_dir):
        class_name, extension = os.path.splitext(file_name)
        # Skip anything that is not a weight file instead of handing it to
        # the loader.
        if extension != ".pth":
            continue
        model_path = os.path.join(model_dir, file_name)
        class_to_model[class_name] = load_trained_language_model(model_path)
    return class_to_model


# 基于语言模型的文本分类伪代码
# class_to_model: {"class1":<language model obj1>, "class2":<language model obj2>, ..}
# 每个语言模型，用对应的领域语料训练
def text_classification_based_on_language_model(class_to_model, sentence):
    """Rank the classes for ``sentence`` by language-model perplexity.

    Each class has its own language model; a lower perplexity means a better
    fit. The sentence and the three best-ranked entries are printed.

    Returns:
        List of ``[class_name, perplexity]`` pairs sorted by ascending
        perplexity.
    """
    ranking = [
        [label, cal_ppl(sentence, language_model)]
        for label, language_model in class_to_model.items()
    ]
    ranking.sort(key=lambda entry: entry[1])
    print(sentence)
    print(ranking[:3])
    print("==================")
    return ranking


# Sample sentences to classify.
sentence = [
    "在全球货币体系出现危机的情况下",
    "点击进入双色球玩法经典选号图表",
    "慢时尚服饰最大的优点是独特",
    "做处女座朋友的人真的很难",
    "网戒中心要求家长全程陪护",
    "在欧巡赛扭转了自己此前不利的状态",
    "选择独立的别墅会比公寓更适合你",
]

# Load the trained models once, then rank the classes for every sample.
class_to_model = load_models()
for s in sentence:
    text_classification_based_on_language_model(class_to_model, s)