Update README.md
Browse files
README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
---
|
2 |
-
license: openrail
|
|
|
|
|
3 |
---
|
|
|
4 |
验证集准确度: 0.9382193411826961
|
5 |
验证集分类报告:
|
6 |
precision recall f1-score support
|
@@ -14,28 +17,30 @@ license: openrail
|
|
14 |
weighted avg 0.94 0.94 0.94 15118
|
15 |
|
16 |
本模型使用了大约 10 万条以上(10w+)的数据,面向基金领域做中文情感分析。目前测试下来效果尚可;其中负面文本经过专人处理,中性类别的预测可能不够准确。
|
|
|
17 |
0: 'negative', 1: 'positive', 2: 'neutral'
|
18 |
|
19 |
|
20 |
-
|
|
|
21 |
import sys
|
22 |
import re
|
23 |
import torch
|
24 |
from transformers import BertTokenizer, BertForSequenceClassification
|
25 |
from torch.nn.functional import softmax
|
26 |
|
27 |
-
|
28 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
29 |
|
30 |
-
|
31 |
model = BertForSequenceClassification.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
32 |
tokenizer = BertTokenizer.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
33 |
|
34 |
-
|
35 |
model = model.to(device)
|
36 |
model.eval() # 把模型设置为评估模式
|
37 |
|
38 |
-
|
39 |
def predict_sentiment(text):
|
40 |
# 编码文本数据
|
41 |
encoding = tokenizer.encode_plus(
|
@@ -66,18 +71,18 @@ def predict_sentiment(text):
|
|
66 |
# 返回概率和预测的类别
|
67 |
return probs, torch.argmax(probs, dim=1).cpu().numpy()[0]
|
68 |
|
69 |
-
|
70 |
arguments = sys.argv[1:] # 忽略脚本名称
|
71 |
text = ' '.join(arguments) # 合并为单一字符串
|
72 |
text = re.sub(r"[^\u4e00-\u9fff\d.a-zA-Z%+\-。!?,、;:()【】《》“”‘’]", '', text) # 去除特殊字符
|
73 |
|
74 |
-
#
|
75 |
-
|
76 |
probabilities, prediction = predict_sentiment(text)
|
77 |
|
78 |
sentiment_labels = {0: 'negative', 1: 'positive', 2: 'neutral'}
|
79 |
|
80 |
-
|
81 |
predicted_sentiment = sentiment_labels[prediction]
|
82 |
print(f"Predicted sentiment: {predicted_sentiment},Probability:{probabilities[0][prediction].item()}")
|
83 |
-
#
|
|
|
1 |
---
|
2 |
+
license: apache-2.0
|
3 |
+
language:
|
4 |
+
- zh
|
5 |
---
|
6 |
+
|
7 |
验证集准确度: 0.9382193411826961
|
8 |
验证集分类报告:
|
9 |
precision recall f1-score support
|
|
|
17 |
weighted avg 0.94 0.94 0.94 15118
|
18 |
|
19 |
本模型使用了大约 10 万条以上(10w+)的数据,面向基金领域做中文情感分析。目前测试下来效果尚可;其中负面文本经过专人处理,中性类别的预测可能不够准确。
|
20 |
+
# 返回值解释(类别编号与情感标签的对应关系):
|
21 |
0: 'negative', 1: 'positive', 2: 'neutral'
|
22 |
|
23 |
|
24 |
+
|
25 |
+
# 测试代码如下:
|
26 |
import sys
|
27 |
import re
|
28 |
import torch
|
29 |
from transformers import BertTokenizer, BertForSequenceClassification
|
30 |
from torch.nn.functional import softmax
|
31 |
|
32 |
+
#设定使用CPU或CUDA
|
33 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
34 |
|
35 |
+
#载入预先保存的模型和分词器
|
36 |
model = BertForSequenceClassification.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
37 |
tokenizer = BertTokenizer.from_pretrained('sanshizhang/Chinese-Sentiment-Analysis-Fund-Direction')
|
38 |
|
39 |
+
#确保模型在正确的设备上
|
40 |
model = model.to(device)
|
41 |
model.eval() # 把模型设置为评估模式
|
42 |
|
43 |
+
#函数定义:进行预测并返回预测概率
|
44 |
def predict_sentiment(text):
|
45 |
# 编码文本数据
|
46 |
encoding = tokenizer.encode_plus(
|
|
|
71 |
# 返回概率和预测的类别
|
72 |
return probs, torch.argmax(probs, dim=1).cpu().numpy()[0]
|
73 |
|
74 |
+
#从命令行参数获取文本,合并并清理特殊字符
|
75 |
arguments = sys.argv[1:] # 忽略脚本名称
|
76 |
text = ' '.join(arguments) # 合并为单一字符串
|
77 |
text = re.sub(r"[^\u4e00-\u9fff\d.a-zA-Z%+\-。!?,、;:()【】《》“”‘’]", '', text) # 去除特殊字符
|
78 |
|
79 |
+
#print(f"传过来的文本是: {text}")
|
80 |
+
#进行预测
|
81 |
probabilities, prediction = predict_sentiment(text)
|
82 |
|
83 |
sentiment_labels = {0: 'negative', 1: 'positive', 2: 'neutral'}
|
84 |
|
85 |
+
#打印出预测的情感及其概率
|
86 |
predicted_sentiment = sentiment_labels[prediction]
|
87 |
print(f"Predicted sentiment: {predicted_sentiment},Probability:{probabilities[0][prediction].item()}")
|
88 |
+
#print(f"Probability: {probabilities[0][prediction].item()}")
|