!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip"
!unzip "NewsAggregatorDataset.zip"
NewsAggregatorDatas 100%[===================>] 27.87M 26.4MB/s in 1.1s
2021-08-31 04:13:42 (26.4 MB/s) - ‘NewsAggregatorDataset.zip.1’ saved [29224203/29224203]
Archive: NewsAggregatorDataset.zip
replace 2pageSessions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: 2pageSessions.csv
replace __MACOSX/._2pageSessions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: __MACOSX/._2pageSessions.csv
replace newsCorpora.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: newsCorpora.csv
replace __MACOSX/._newsCorpora.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: __MACOSX/._newsCorpora.csv
replace readme.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: readme.txt
replace __MACOSX/._readme.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
inflating: __MACOSX/._readme.txt
Attribute Information:
FILENAME #1: newsCorpora.csv (102.297.000 bytes)
DESCRIPTION: News pages
FORMAT: ID TITLE URL PUBLISHER CATEGORY STORY HOSTNAME TIMESTAMP
where:
ID Numeric ID
TITLE News title
URL Url
PUBLISHER Publisher name
CATEGORY News category (b = business, t = science and technology, e = entertainment, m = health)
STORY Alphanumeric ID of the cluster that includes news about the same story
HOSTNAME Url hostname
TIMESTAMP Approximate time the news was published, as the number of milliseconds since the epoch 00:00:00 GMT, January 1, 1970
FILENAME #2: 2pageSessions.csv (3.049.986 bytes)
DESCRIPTION: 2-page sessions
FORMAT: STORY HOSTNAME CATEGORY URL
where:
STORY Alphanumeric ID of the cluster that includes news about the same story
HOSTNAME Url hostname
CATEGORY News category (b = business, t = science and technology, e = entertainment, m = health)
URL Two space-delimited urls representing a browsing session
import pandas as pd
cols = ["id","title","url","publisher","category","story","hostname","timestamp",]
df = pd.read_csv("newsCorpora.csv", sep='\t', names=cols)
df.head()
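# Side note (sketch): the readme says TIMESTAMP is milliseconds since the Unix epoch,
# so it can be converted to a readable datetime with pandas if needed.
print(pd.to_datetime(df["timestamp"], unit="ms").head())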
df = df.query('publisher in ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]')
seed = 200
df_train = df.sample(frac=0.8,random_state=seed)
df_remain = df.drop(df_train.index)
df_valid = df_remain.sample(frac=0.5,random_state=seed)
df_test = df_remain.drop(df_valid.index)
# Save the splits as train.txt, valid.txt, and test.txt, one example per line, with the category name and article headline separated by a tab.
with open("train.txt","w") as f:
for index, row in df_train.iterrows():
f.write(row["category"] + "\t" + row["title"] + "\n")
with open("valid.txt","w") as f:
for index, row in df_valid.iterrows():
f.write(row["category"] + "\t" + row["title"] + "\n")
with open("test.txt","w") as f:
for index, row in df_test.iterrows():
f.write(row["category"] + "\t" + row["title"] + "\n")
print("Train:{}\nValid:{}\nTest:{}".format(len(df_train),len(df_valid),len(df_test)))
# TF-IDF is said to work well here
# Vectorize the text
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_union
# Add the (scaled) document length as an extra feature
class CustomVectorizer(TransformerMixin):
    def __init__(self):
        super().__init__()
        self.scaler = MinMaxScaler()

    def fit(self, raw_documents, y=None):
        lengths = np.zeros((len(raw_documents), 1))
        for doc_id, document in enumerate(raw_documents):
            length = len(document)
            lengths[doc_id, 0] = length
        self.scaler.fit(lengths)
        return self

    def transform(self, raw_documents):
        lengths = np.zeros((len(raw_documents), 1))
        for doc_id, document in enumerate(raw_documents):
            length = len(document)
            lengths[doc_id, 0] = length
        lengths = self.scaler.transform(lengths)
        return lengths
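# Quick sanity check (sketch with made-up titles): CustomVectorizer maps each
# document to a single min-max scaled length feature in [0, 1].
sample_titles = ["Fed raises rates", "Kim Kardashian attends movie premiere in Los Angeles"]
print(CustomVectorizer().fit(sample_titles).transform(sample_titles))  # shape (2, 1)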
# Combine the features into a single vector
vectorizer = make_union(
    TfidfVectorizer(),
    CustomVectorizer()
)
df_train_valid = pd.concat([df_train,df_valid])
x_train_valid = vectorizer.fit_transform(df_train_valid["title"]).toarray() # fit on train+valid; assumes train/valid and test follow the same distribution?
x_test = vectorizer.transform(df_test["title"]).toarray()
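# Variant sketch (addresses the comment above): fit the vectorizer on the training
# titles only, so the validation set plays no part in building the vocabulary.
# vec_train_only and the *_alt names are hypothetical and not used further below.
vec_train_only = make_union(TfidfVectorizer(), CustomVectorizer())
x_train_alt = vec_train_only.fit_transform(df_train["title"]).toarray()
x_valid_alt = vec_train_only.transform(df_valid["title"]).toarray()
x_test_alt = vec_train_only.transform(df_test["title"]).toarray()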
# Note: in newer scikit-learn (>= 1.0), get_feature_names() is deprecated in favour of get_feature_names_out()
print(vectorizer.transformer_list[0][1].get_feature_names())
print('size of tfidf vector', len(vectorizer.transformer_list[0][1].get_feature_names()))
x_cols = vectorizer.transformer_list[0][1].get_feature_names() + ["length"]
x_train = x_train_valid[:len(df_train)]
x_valid = x_train_valid[len(df_train):]
# Convert the vectors to data frames
x_train = pd.DataFrame(x_train, columns=x_cols)
x_valid = pd.DataFrame(x_valid, columns=x_cols)
x_test = pd.DataFrame(x_test, columns=x_cols)
# Saving the features to CSV (skipped because the files are large)
#x_train.to_csv('train.features.csv', index=False)
#x_valid.to_csv('valid.features.csv', index=False)
#x_test.to_csv('test.features.csv', index=False)
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(random_state=seed, max_iter=1000, verbose=100)
lg.fit(x_train,df_train["category"])
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=1000,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=200, solver='lbfgs', tol=0.0001, verbose=100,
warm_start=False)
def get_probas(x):
    return [np.max(lg.predict_proba(x), axis=1), lg.predict(x)]
get_probas(x_train)
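# get_probas returns [max class probability per example, predicted label];
# sketch: inspect the first few validation predictions with their confidence.
val_probas, val_labels = get_probas(x_valid)
for p, lab, title in zip(val_probas[:5], val_labels[:5], df_valid["title"].head(5)):
    print("{} {:.3f} {}".format(lab, p, title))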
from sklearn.metrics import accuracy_score
pred_train = get_probas(x_train)
y_train = df_train["category"]
pred_valid = get_probas(x_valid)
y_valid = df_valid["category"]
pred_test = get_probas(x_test)
y_test = df_test["category"]
print("Train accuracy:{}\nValid accuracy:{}\nTest accuracy:{}".format(accuracy_score(y_train,pred_train[1]),accuracy_score(y_valid,pred_valid[1]),accuracy_score(y_test,pred_test[1])))
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
m_train = confusion_matrix(y_train,pred_train[1])
sns.heatmap(m_train, annot=True, cmap='Blues')
plt.show()
m_valid = confusion_matrix(y_valid,pred_valid[1])
sns.heatmap(m_valid, annot=True, cmap='Blues')
plt.show()
m_test = confusion_matrix(y_test,pred_test[1])
sns.heatmap(m_test, annot=True, cmap='Blues')
plt.show()
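# Alternative sketch (assumes scikit-learn >= 1.0): ConfusionMatrixDisplay labels
# the axes with the class names automatically.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, pred_test[1])
plt.show()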
from sklearn.metrics import precision_score, recall_score, f1_score
def integrated_score(y, pred):
    precision = precision_score(y, pred, labels=["b","e","t","m"], average=None)
    precision = np.append(precision, precision_score(y, pred, average="micro"))
    precision = np.append(precision, precision_score(y, pred, average="macro"))
    rc = recall_score(y, pred, labels=["b","e","t","m"], average=None)
    rc = np.append(rc, recall_score(y, pred, average="micro"))
    rc = np.append(rc, recall_score(y, pred, average="macro"))
    f1 = f1_score(y, pred, labels=["b","e","t","m"], average=None)
    f1 = np.append(f1, f1_score(y, pred, average="micro"))
    f1 = np.append(f1, f1_score(y, pred, average="macro"))
    return pd.DataFrame({"precision": precision,
                         "recall": rc,
                         "f1": f1
                        }, index=["b","e","t","m","micro","macro"])
integrated_score(y_valid,pred_valid[1])
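# For comparison (sketch): sklearn's classification_report gives a similar
# per-class / macro summary directly from the labels and predictions.
from sklearn.metrics import classification_report
print(classification_report(y_valid, pred_valid[1], labels=["b", "e", "t", "m"], digits=3))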
coefs = lg.coef_
f_dict = {"b":"business","e":"entertainment","t":"science & technology","m":"health"}
features = x_valid.columns.values
for cl, coef in zip(lg.classes_, coefs):
    high = features[np.argsort(coef)[::-1][:10]]
    low = features[np.argsort(coef)[:10]]
    print(f_dict[cl])
    print("high:", high)
    print("low:", low)
    print("\n")
entertainment
high: ['kardashian' 'chris' 'star' 'she' 'kim' 'miley' 'cyrus' 'movie' 'paul'
'thrones']
low: ['us' 'update' 'google' 'china' 'says' 'facebook' 'gm' 'ceo' 'apple'
'billion']
health
high: ['ebola' 'study' 'fda' 'drug' 'cancer' 'mers' 'cases' 'heart' 'could'
'outbreak']
low: ['gm' 'facebook' 'apple' 'google' 'ceo' 'amazon' 'deal' 'bank' 'climate'
'twitter']
science & technology
high: ['google' 'apple' 'facebook' 'climate' 'microsoft' 'gm' 'tesla' 'comcast'
'mobile' 'nasa']
low: ['stocks' 'fed' 'ecb' 'shares' 'her' 'day' 'men' 'kardashian' 'drug'
'ukraine']
# Left for anyone who is interested
# Left for anyone who is interested