BERT Sentiment Analysis

九年义务漏网鲨鱼 · Posted 2025/09/15 14:56:11
[Abstract] A hands-on walkthrough of BERT-based sentiment analysis.


1. Dataset Loading and Model Training

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score
model_name_or_path = '/root/autodl-tmp/bert-base-uncased'


# 1. Load the SST-2 dataset and the pretrained model/tokenizer
dataset = load_dataset("glue", "sst2")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to('cuda')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.save_pretrained(model_name_or_path)   # cache a local copy of the weights
tokenizer.save_pretrained(model_name_or_path)
# 2. Preprocess the data (tokenization)
def preprocess(example):
    return tokenizer(example["sentence"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
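
To verify the preprocessing, a quick sanity check like the following (a minimal sketch; the expected shapes assume the padding="max_length", max_length=128 settings above) prints one encoded example:

sample = encoded_dataset["train"][0]
print(sample["input_ids"].shape)       # expected: torch.Size([128])
print(sample["attention_mask"].shape)  # expected: torch.Size([128])
print(sample["labels"])                # scalar tensor: 0 (negative) or 1 (positive)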


# 3. Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    # evaluation_strategy="epoch",  # enable for per-epoch evaluation (newer transformers versions rename this to eval_strategy)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_dir="./logs",
)

# 4. Define the accuracy metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

print(encoded_dataset["train"])  # sanity-check the processed training split
# 5. Build the Trainer and launch training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
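
With training done, the fine-tuned model can be exercised directly. A minimal inference sketch (the 0 = negative / 1 = positive mapping follows SST-2's label convention):

model.eval()
inputs = tokenizer("This movie is great!", return_tensors="pt",
                   truncation=True, max_length=128).to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
pred = logits.argmax(dim=-1).item()
print("positive" if pred == 1 else "negative")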

2. Hugging Face Quantization

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# 1. Load the fine-tuned BERT model and tokenizer
model_name = "./output/checkpoint-2105/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSequenceClassification.from_pretrained(model_name).to('cuda')

# 2. Load the model with 8-bit quantization
quantized_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",  # automatically place weights on available devices
    load_in_8bit=True,  # enable 8-bit quantization via bitsandbytes
)
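
Note: recent transformers releases deprecate the bare load_in_8bit=True flag in favor of BitsAndBytesConfig; an equivalent sketch under that API (assuming bitsandbytes is installed):

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_8bit=True)
quantized_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)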
  • Comparing model size

The model size drops substantially, by nearly 3-4x:

def print_model_size(model, model_name):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / 1024**2
    print(f"{model_name} size: {size_all_mb:.3f}MB")

print_model_size(original_model, "Original BERT")
print_model_size(quantized_model, "Quantized 8-bit BERT")
#Original BERT size: 417.655MB
#Quantized 8-bit BERT size: 127.269MB
  • Comparing model accuracy

Alongside the size reduction, accuracy on the validation set also drops sharply:

def evaluate(model, dataset, labels):
    model.eval()
    preds = []

    with torch.no_grad():
        for i in range(0, len(dataset), 32):  # batch size = 32
            batch = dataset[i:i + 32]
            input_ids = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1).cpu()
            preds.extend(batch_preds.tolist())

    correct = sum([int(p == t) for p, t in zip(preds, labels)])
    acc = correct / len(labels)
    return acc
def preprocess(example):
    return tokenizer(example["sentence"], truncation=True, padding="max_length", max_length=128)
    
from datasets import load_dataset
dataset = load_dataset("glue", "sst2")
val_dataset = dataset["validation"]

encoded_val_dataset = val_dataset.map(preprocess, batched=True)
encoded_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
labels = torch.tensor(val_dataset["label"])

acc_fp32 = evaluate(original_model, encoded_val_dataset, labels)
acc_int8 = evaluate(quantized_model, encoded_val_dataset, labels)

print(f"Original FP32 model accuracy: {acc_fp32:.4f}")
print(f"Quantized INT8 model accuracy: {acc_int8:.4f}")

#Original FP32 model accuracy: 0.9300
#Quantized INT8 model accuracy: 0.5482
  • Quantization analysis

👉 While this quantization approach is simple, it has one clear limitation. It is the lightweight quantization that HuggingFace implements on top of the bitsandbytes library, which under the hood relies on:

  1. bitsandbytes 8-bit optimizers/kernels
  2. Weights stored as FP16 or INT8, but not as PyTorch quantized tensors (QTensor)
  3. Saving GPU/CPU memory as the primary goal

🎯 As a result, this method cannot be analyzed layer by layer by calling tensor.q_scale() / tensor.q_zero_point(); those accessors only exist on PyTorch quantized tensors (see the sketch below).
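
For contrast, here is a minimal sketch of what those accessors return on a genuine PyTorch quantized tensor (toy values, unrelated to the BERT weights):

import torch

x = torch.randn(4)
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.qint8)
print(qx.q_scale())       # 0.1
print(qx.q_zero_point())  # 0
print(qx.int_repr())      # raw int8 representation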

3. PyTorch Eager Mode Quantization

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 1. Load the original model
model_name = "./output/checkpoint-2105/"
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Move the model to the CPU (eager-mode quantization is best run on CPU)
model.to('cpu')

# 3. Apply dynamic quantization
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},  # module types to quantize
    dtype=torch.qint8   # quantized weight dtype
)
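
Unlike the bitsandbytes path, this dynamically quantized model can be inspected layer by layer. A sketch (assuming the default per-tensor weight observer, so q_scale()/q_zero_point() are available):

import torch.ao.nn.quantized.dynamic as nnqd

for name, module in quantized_model.named_modules():
    if isinstance(module, nnqd.Linear):
        w = module.weight()  # the packed weight as a quantized tensor
        print(name, w.dtype, w.qscheme())
        if w.qscheme() == torch.per_tensor_affine:
            print("  scale:", w.q_scale(), "zero_point:", w.q_zero_point())
        break  # inspect just the first quantized Linear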
  • Size and accuracy comparison after quantization; the size comes out slightly larger than with the HuggingFace approach
# Size comparison
# Original BERT size: 417.655MB
# Quantized 8-bit BERT size: 127.269MB

# Accuracy comparison
# Original FP32 model accuracy: 0.9300
# Quantized INT8 model accuracy: 0.5482 (unchanged)

4. PyTorch Export Quantization (known bug)

This quantization approach still has a bug that I have not been able to track down, and any help would be appreciated. The main issue: the model quantizes successfully, but inference with the quantized model throws an error, and the reported size of the quantized result is also strange: Original BERT size: 417.655MB; Quantized 8-bit BERT size: 0.001MB.

import torch
from torch.export import export
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import get_symmetric_quantization_config
from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# 1. Load the original model
model_name = "./output/checkpoint-2105/"
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Prepare a sample input
example = tokenizer("This movie is great!", return_tensors="pt", padding="max_length", max_length=128)
example = {k: v.cuda() for k, v in example.items()}
example_inputs = (example["input_ids"], example["attention_mask"])

# 3. Export the model
ep = export(model, args=example_inputs, dynamic_shapes=None)
gm = ep.graph_module

# 4. Set up the quantizer
quantizer = X86InductorQuantizer()
quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))

# 5. Insert observers and convert
prepared = prepare_pt2e(gm, quantizer)
quantized_model = convert_pt2e(prepared)

# Error message: forward() missing 203 required positional arguments: 'p_bert_embeddings_position_embeddings_weight', 'p_bert_embeddings_layernorm_weight', 'p_bert_embeddings_layernorm_bias', ...
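
A plausible cause (my reading, not verified on this exact checkpoint): torch.export lifts every parameter out of the graph, so ep.graph_module is a parameterless GraphModule whose forward() expects each lifted weight as an explicit argument, which would explain both the "missing 203 required positional arguments" error and the near-zero reported size. The PT2E flow instead expects the module returned by ep.module(), which re-binds the parameters (on some PyTorch versions, torch.export.export_for_training is the documented entry point). A sketch of that variant, also using X86InductorQuantizer's own default config and moving everything to CPU since the x86 backend targets CPU:

import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq

model = model.cpu()
cpu_inputs = tuple(t.cpu() for t in example_inputs)

ep = export(model, args=cpu_inputs)
gm = ep.module()  # unlike ep.graph_module, this re-binds the lifted parameters

quantizer = X86InductorQuantizer()
quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())

prepared = prepare_pt2e(gm, quantizer)
prepared(*cpu_inputs)  # one calibration pass before conversion
quantized_model = convert_pt2e(prepared)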