Text classification with albert_chinese_small under BERT

import torch
from transformers import BertTokenizer, BertModel, BertConfig
import numpy as np
from torch.utils import data
from sklearn.model_selection import train_test_split
import pandas as pd

pretrained = r'albert_chinese_small'
tokenizer = BertTokenizer.from_pretrained(pretrained)  # albert_chinese_* checkpoints ship a BERT-style vocab, so BertTokenizer is used even though the weights are ALBERT
# If the given path is not in PRETRAINED_VOCAB_ARCHIVE_MAP, it is joined with VOCAB_NAME to form the vocab.txt path
model = BertModel.from_pretrained(pretrained)
config = BertConfig.from_pretrained(pretrained)

inputtext = "今天心情很好啊"  # "I'm in a good mood today"
tokenized_text = tokenizer.encode(inputtext)  # adds [CLS] and [SEP] automatically
input_ids = torch.tensor(tokenized_text).view(-1, len(tokenized_text))  # add a batch dimension: [1, seq_len]
outputs = model(input_ids)
# outputs[0].shape, outputs[1].shape
# config.hidden_size, config.embedding_size, config.max_length
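# The two commented lines above hint at the useful shapes. Printing them is a quick
# sanity check (this relies on BertModel returning the per-token sequence output
# first and the pooled sentence vector second):
print(outputs[0].shape)  # sequence output: [1, seq_len, hidden_size], one vector per token
print(outputs[1].shape)  # pooled output:   [1, hidden_size], one vector per sentence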

class AlbertClassifier(torch.nn.Module):
    def __init__(self, bert_model, bert_config, num_class):
        super(AlbertClassifier, self).__init__()
        self.bert_model = bert_model
        self.dropout = torch.nn.Dropout(0.4)
        self.fc1 = torch.nn.Linear(bert_config.hidden_size, bert_config.hidden_size)
        self.fc2 = torch.nn.Linear(bert_config.hidden_size, num_class)

    def forward(self, token_ids):
        # The pooled sentence vector is used because this is a sentence-level task;
        # for named entity recognition you would use the per-token output instead (see the sketch after this class)
        bert_out = self.bert_model(token_ids)[1]  # sentence vector: [batch_size, hidden_size]
        bert_out = self.dropout(bert_out)
        bert_out = self.fc1(bert_out)
        bert_out = self.dropout(bert_out)
        bert_out = self.fc2(bert_out)  # [batch_size,num_class]
        return bert_out
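
# As the comment in forward() notes, a token-level task such as NER would keep the
# per-token output, self.bert_model(token_ids)[0], instead of the pooled vector.
# A minimal sketch under that assumption (AlbertTokenClassifier and num_labels are
# hypothetical names, not part of the original script):
class AlbertTokenClassifier(torch.nn.Module):
    def __init__(self, bert_model, bert_config, num_labels):
        super(AlbertTokenClassifier, self).__init__()
        self.bert_model = bert_model
        self.dropout = torch.nn.Dropout(0.4)
        self.fc = torch.nn.Linear(bert_config.hidden_size, num_labels)

    def forward(self, token_ids):
        seq_out = self.bert_model(token_ids)[0]  # per-token output: [batch_size, seq_len, hidden_size]
        return self.fc(self.dropout(seq_out))    # one score vector per token: [batch_size, seq_len, num_labels]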


albertBertClassifier = AlbertClassifier(model, config, 2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
albertBertClassifier = albertBertClassifier.to(device)


def get_train_test_data(pos_file_path, neg_file_path, max_length=100, test_size=0.2):
    texts = []   # renamed from `data` to avoid shadowing the torch.utils.data import
    labels = []
    pos_df = pd.read_excel(pos_file_path, header=None)
    pos_df.columns = ['content']
    for index, row in pos_df.iterrows():
        text = row['content']
        ids = tokenizer.encode(text.strip(), max_length=max_length, padding='max_length', truncation=True)
        texts.append(ids)
        labels.append(1)  # positive samples are labelled 1

    neg_df = pd.read_excel(neg_file_path, header=None)
    neg_df.columns = ['content']
    for index, row in neg_df.iterrows():
        text = row['content']
        ids = tokenizer.encode(text.strip(), max_length=max_length, padding='max_length', truncation=True)
        texts.append(ids)
        labels.append(0)  # negative samples are labelled 0
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=test_size, shuffle=True)
    return (X_train, y_train), (X_test, y_test)


pos_file_path = r"pos_sim.xlsx"
neg_file_path = r"neg_sim.xlsx"
(X_train, y_train), (X_test, y_test) = get_train_test_data(pos_file_path, neg_file_path)
print(len(X_train), len(X_test), len(y_train), len(y_test), len(X_train[0]))  # split sizes and the padded length
print(" ".join([str(i) for i in X_train[0]]))    # raw token ids of one sample
print(tokenizer.decode(X_train[0]), y_train[0])  # decoded text plus its label
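
# Decoding a sample makes the padding visible: BertTokenizer.encode adds [CLS] and
# [SEP] and fills up to max_length with the pad token. A quick check:
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token)  # typically [CLS] [SEP] [PAD]
print(tokenizer.decode(tokenizer.encode("测试", max_length=8, padding='max_length', truncation=True)))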


class DataGen(data.Dataset):
    # Wraps the padded id lists and their labels as a torch Dataset
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return np.array(self.data[index]), np.array(self.label[index])


train_dataset = DataGen(X_train, y_train)
test_dataset = DataGen(X_test, y_test)
train_dataloader = data.DataLoader(train_dataset, batch_size=10, shuffle=True)  # reshuffle training batches each epoch
test_dataloader = data.DataLoader(test_dataset, batch_size=10)
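
# With the default collate function a batch should come out as a [10, 100] LongTensor
# of token ids plus a [10] label tensor (assuming at least 10 training samples):
batch_tokens, batch_labels = next(iter(train_dataloader))
print(batch_tokens.shape, batch_labels.shape)  # expected: torch.Size([10, 100]) torch.Size([10])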
criterion = torch.nn.CrossEntropyLoss()  # expects raw logits, so the model applies no softmax
optimizer = torch.optim.SGD(albertBertClassifier.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
for epoch in range(50):
    loss_sum = 0.0
    accu = 0
    albertBertClassifier.train()
    for step, (token_ids, label) in enumerate(train_dataloader):
        token_ids = token_ids.to(device)
        label = label.to(device).long()
        out = albertBertClassifier(token_ids)
        loss = criterion(out, label)
        optimizer.zero_grad()
        loss.backward()   # backpropagate
        optimizer.step()  # update the parameters
        loss_sum += loss.item()
        accu += (out.argmax(1) == label).sum().item()

    test_loss_sum = 0.0
    test_accu = 0
    albertBertClassifier.eval()
    for step, (token_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.to(device)
        label = label.to(device).long()
        with torch.no_grad():
            out = albertBertClassifier(token_ids)
            loss = criterion(out, label)
            test_loss_sum += loss.item()
            test_accu += (out.argmax(1) == label).sum().item()
    print("epoch % d,train loss:%f,train acc:%f,test loss:%f,test acc:%f" % (
        epoch, loss_sum / len(train_dataset), accu / len(train_dataset), test_loss_sum / len(test_dataset),
        test_accu / len(test_dataset)))
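
# After training, single sentences can be classified the same way the training data
# was encoded. A minimal inference sketch (predict is a hypothetical helper, not part
# of the original post):
def predict(text, max_length=100):
    ids = tokenizer.encode(text.strip(), max_length=max_length, padding='max_length', truncation=True)
    token_ids = torch.tensor(ids).unsqueeze(0).to(device)  # batch of one: [1, max_length]
    albertBertClassifier.eval()
    with torch.no_grad():
        logits = albertBertClassifier(token_ids)  # [1, num_class]
    return int(logits.argmax(1).item())  # 1 = positive, 0 = negative


print(predict("今天心情很好啊"))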


The corresponding data files will be stored in the CSDN resource library; alternatively, join group 753035545 and I will upload the resources there.
