Update README.md
Browse files
README.md
CHANGED
@@ -4,85 +4,85 @@ language:
|
|
4 |
- zh
|
5 |
---
|
6 |
|
7 |
-
验证集准确度: 0.9382193411826961
|
8 |
-
验证集分类报告:
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
weighted avg 0.94 0.94 0.94 15118
|
18 |
|
19 |
-
大概使用了10w+的数据做了一个基金方面的中文情感分析模型,暂时测试下来还可以,负面方面的文本是有专人处理过的,中性的可能不准确。
|
20 |
# 返回值解释:
|
21 |
-
0: 'negative', 1: 'positive', 2: 'neutral'
|
22 |
|
23 |
|
24 |
|
25 |
# 测试代码如下:
|
26 |
-
import sys
|
27 |
-
import re
|
28 |
-
import torch
|
29 |
-
from transformers import BertTokenizer, BertForSequenceClassification
|
30 |
-
from torch.nn.functional import softmax
|
31 |
-
|
32 |
-
#设定使用CPU或CUDA
|
33 |
-
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
34 |
-
|
35 |
-
#载入预先保存的模型和分词器
|
36 |
-
model = BertForSequenceClassification.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
37 |
-
tokenizer = BertTokenizer.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
38 |
-
|
39 |
-
#确保模型在正确的设备上
|
40 |
-
model = model.to(device)
|
41 |
-
model.eval() # 把模型设置为评估模式
|
42 |
-
|
43 |
-
#函数定义:进行预测并返回预测概率
|
44 |
-
def predict_sentiment(text):
|
45 |
-
# 编码文本数据
|
46 |
-
encoding = tokenizer.encode_plus(
|
47 |
-
text,
|
48 |
-
max_length=512,
|
49 |
-
add_special_tokens=True,
|
50 |
-
return_token_type_ids=False,
|
51 |
-
padding='max_length', # 修改此处
|
52 |
-
return_attention_mask=True,
|
53 |
-
return_tensors='pt',
|
54 |
-
truncation=True
|
55 |
-
)
|
56 |
|
57 |
-
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
- zh
|
5 |
---
|
6 |
|
7 |
+
验证集准确度: 0.9382193411826961
|
8 |
+
验证集分类报告:
|
9 |
+
precision recall f1-score support
|
10 |
+
|
11 |
+
negative 0.93 0.95 0.94 3785
|
12 |
+
positive 0.95 0.96 0.95 6919
|
13 |
+
neutral 0.93 0.89 0.91 4414
|
14 |
+
|
15 |
+
accuracy 0.94 15118
|
16 |
+
macro avg 0.94 0.93 0.93 15118
|
17 |
+
weighted avg 0.94 0.94 0.94 15118
|
18 |
|
19 |
+
本模型使用约 10 万条以上的数据训练,是一个面向基金领域的中文情感分析模型。目前测试效果尚可;其中负面类文本经过专人处理,中性类的预测结果可能不够准确。
|
20 |
# 返回值解释:
|
21 |
+
0: 'negative', 1: 'positive', 2: 'neutral'
|
22 |
|
23 |
|
24 |
|
25 |
# 测试代码如下:
|
26 |
+
import sys
|
27 |
+
import re
|
28 |
+
import torch
|
29 |
+
from transformers import BertTokenizer, BertForSequenceClassification
|
30 |
+
from torch.nn.functional import softmax
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
# Run on CUDA when it is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained tokenizer and sequence-classification model.
tokenizer = BertTokenizer.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
model = BertForSequenceClassification.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')

# Place the model on the selected device and switch it to evaluation mode.
model = model.to(device)
model.eval()
|
|
|
42 |
|
43 |
+
#函数定义:进行预测并返回预测概率
|
44 |
+
def predict_sentiment(text):
    """Predict the sentiment class of ``text`` and return its probabilities.

    The text is encoded with the module-level ``tokenizer`` (padded or
    truncated to 512 tokens), passed through the module-level ``model``
    with gradient tracking disabled, and the logits are converted to
    probabilities with softmax along dim 1.

    Returns:
        A ``(probs, label)`` tuple. ``probs`` is the softmax tensor of the
        model's logits; ``label`` is the argmax class index of the first
        row, as a NumPy scalar.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Move the encoded inputs to the selected device.
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only, so skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask).logits

    probs = softmax(logits, dim=1)
    label = torch.argmax(probs, dim=1).cpu().numpy()[0]
    return probs, label
|
73 |
+
|
74 |
+
# Build the input text from the command-line arguments.
arguments = sys.argv[1:]  # skip the script name
text = ' '.join(arguments)  # merge all arguments into a single string
# Remove every character that is not in the whitelist below (this also drops spaces).
text = re.sub(r"[^\u4e00-\u9fff\d.a-zA-Z%+\-。!?,、;:()【】《》“”‘’]", '', text)

# Without any usable text the model would still emit a (meaningless)
# prediction for an empty string, so stop with a usage message instead.
if not text:
    sys.exit("Usage: python <script> <text to classify>")

# Run the prediction.
probabilities, prediction = predict_sentiment(text)

# Mapping from class index to sentiment name.
sentiment_labels = {0: 'negative', 1: 'positive', 2: 'neutral'}

# Print the predicted sentiment together with its probability.
predicted_sentiment = sentiment_labels[prediction]
print(f"Predicted sentiment: {predicted_sentiment},Probability:{probabilities[0][prediction].item()}")
|