import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
data = pd.read_csv('Combined_News_DJIA.csv')
data.head()
Output: 5 rows × 27 columns

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
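
A quick sanity check on the date-based split (a minimal sketch, not in the original; the training row count should match the 1611 seen in the vectorizer output below):

print(train.shape)  # rows dated before 2015-01-01 form the training set
print(test.shape)   # rows dated after 2014-12-31 form the test set
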
example = train.iloc[3,10]
print(example)
Output: b"The commander of a Navy air reconnaissance squadron that provides the President and the defense secretary the airborne ability to command the nation's nuclear weapons has been relieved of duty"
example2 = example.lower()
print(example2)
Output:
b"the commander of a navy air reconnaissance squadron that provides the president and the defense secretary the airborne ability to command the nation's nuclear weapons has been relieved of duty"
example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)
Output:
['the', 'commander', 'of', 'navy', 'air', 'reconnaissance', 'squadron', 'that', 'provides', 'the', 'president', 'and', 'the', 'defense', 'secretary', 'the', 'airborne', 'ability', 'to', 'command', 'the', 'nation', 'nuclear', 'weapons', 'has', 'been', 'relieved', 'of', 'duty']
pd.DataFrame([[x,example3.count(x)] for x in set(example3)], columns = ['Word', 'Count'])
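
The same per-word count table can also be built directly with pandas; a minimal equivalent of the list comprehension above:

pd.Series(example3).value_counts().rename_axis('Word').reset_index(name='Count')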

trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row, 2:27]))
basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)
Output: (1611, 31675)
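
basictrain is a sparse document-term matrix with one row per training day and one column per token (1611 × 31675 here). A small sketch to inspect what the vectorizer learned, assuming the names above:

print(list(basicvectorizer.vocabulary_.items())[:5])   # a few (token, column index) pairs
print(basictrain.nnz / float(basictrain.shape[0] * basictrain.shape[1]))  # fraction of non-zero entries
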
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["Label"])
testheadlines = []
for row in range(0, len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row, 2:27]))
basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)
pd.crosstab(test["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])
# accuracy on the test set ≈ 0.42
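
The 0.42 noted above presumably refers to test-set accuracy, which can be read off the diagonal of the crosstab or computed directly (a minimal sketch using the predictions above):

from sklearn.metrics import accuracy_score
print(accuracy_score(test["Label"], predictions))  # ≈ 0.42 for the unigram bag-of-words model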

basicwords = basicvectorizer.get_feature_names_out()  # use get_feature_names() on older scikit-learn versions
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word': basicwords,
                        'Coefficient': basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[False, True])
coeffdf.head(10)

coeffdf.tail(10)

advancedvectorizer = CountVectorizer(ngram_range=(2,2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
print(advancedtrain.shape)
Output: (1611, 366721)
advancedmodel = LogisticRegression()
advancedmodel = advancedmodel.fit(advancedtrain, train["Label"])
testheadlines = []
for row in range(0, len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row, 2:27]))
advancedtest = advancedvectorizer.transform(testheadlines)
advpredictions = advancedmodel.predict(advancedtest)
pd.crosstab(test["Label"], advpredictions, rownames=["Actual"], colnames=["Predicted"])
# accuracy on the test set ≈ 0.57
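
As before, the 0.57 can be confirmed with accuracy_score (same sketch as for the unigram model, using advpredictions):

from sklearn.metrics import accuracy_score
print(accuracy_score(test["Label"], advpredictions))  # ≈ 0.57 for the bigram model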

advwords = advancedvectorizer.get_feature_names_out()  # use get_feature_names() on older scikit-learn versions
advcoeffs = advancedmodel.coef_.tolist()[0]
advcoeffdf = pd.DataFrame({'Words': advwords,
                           'Coefficient': advcoeffs})
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[False, True])
advcoeffdf.head(10)

advcoeffdf.tail(10)
