美文网首页
股价预测

股价预测

作者: ForgetThatNight | 来源:发表于2018-07-07 11:07 被阅读11次
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    
    # Load the dataset from a CSV file in the current working directory.
    data = pd.read_csv('Combined_News_DJIA.csv') 
    # Show the first rows (head() with its default row count) as a quick sanity check.
    data.head()
    

    输出 : 5 rows × 27 columns


    # Split on the 'Date' column: rows comparing below '2015-01-01' are used
    # for training, rows comparing above '2014-12-31' for testing.
    # NOTE(review): this is a plain string comparison, so it presumably relies
    # on the dates being stored as ISO 'YYYY-MM-DD' text -- confirm.
    train = data.loc[data['Date'] < '2015-01-01']
    test = data.loc[data['Date'] > '2014-12-31']

    # Pick a single cell (row position 3, column position 10) of the training
    # frame as a sample to experiment with.
    example = train.iloc[3, 10]
    print(example)
    

    输出 : b"The commander of a Navy air reconnaissance squadron that provides the President and the defense secretary the airborne ability to command the nation's nuclear weapons has been relieved of duty"

    # Lower-case the sample headline before it is tokenised.
    example2 = example.lower()
    print(example2)
    

    输出 :
    b"the commander of a navy air reconnaissance squadron that provides the president and the defense secretary the airborne ability to command the nation's nuclear weapons has been relieved of duty"

    # build_tokenizer() hands back the callable that CountVectorizer uses to
    # split a string into tokens; apply it to the lower-cased sample.
    tokenize = CountVectorizer().build_tokenizer()
    example3 = tokenize(example2)
    print(example3)
    

    输出 :
    ['the', 'commander', 'of', 'navy', 'air', 'reconnaissance', 'squadron', 'that', 'provides', 'the', 'president', 'and', 'the', 'defense', 'secretary', 'the', 'airborne', 'ability', 'to', 'command', 'the', 'nation', 'nuclear', 'weapons', 'has', 'been', 'relieved', 'of', 'duty']

    # Tabulate how many times each distinct token occurs in the sample headline.
    token_counts = [[token, example3.count(token)] for token in set(example3)]
    pd.DataFrame(token_counts, columns=['Word', 'Count'])

    # One document per training row: the cells at column positions 2..26 are
    # stringified and joined with single spaces.
    # NOTE(review): presumably these are the headline columns -- confirm
    # against the CSV header.
    trainheadlines = [
        ' '.join(str(cell) for cell in train.iloc[idx, 2:27])
        for idx in range(len(train.index))
    ]

    # Learn the vocabulary from the training documents and turn them into a
    # document-term count matrix in one step.
    basicvectorizer = CountVectorizer()
    basictrain = basicvectorizer.fit_transform(trainheadlines)
    print(basictrain.shape)
    

    输出 : (1611, 31675)

    # Baseline model: logistic regression fitted on the unigram count matrix,
    # with the training frame's "Label" column as the target.
    basicmodel = LogisticRegression()
    basicmodel = basicmodel.fit(basictrain, train["Label"])

    # Build one document per test row the same way the training documents were
    # built: stringify the cells at column positions 2..26 and join with spaces.
    testheadlines = [
        ' '.join(str(x) for x in test.iloc[row, 2:27])
        for row in range(len(test.index))
    ]
    # transform() only: the vocabulary stays the one fitted on the training set.
    basictest = basicvectorizer.transform(testheadlines)
    predictions = basicmodel.predict(basictest)

    # Confusion table of actual vs. predicted labels.
    pd.crosstab(test["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])
    #0.42
    # NOTE(review): the 0.42 above is the author's recorded score; the code
    # that computed it is not shown -- presumably accuracy, confirm.

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer get_feature_names_out() and only fall back to the old method
    # on releases that predate it, so the script runs on both.
    if hasattr(basicvectorizer, 'get_feature_names_out'):
        basicwords = basicvectorizer.get_feature_names_out().tolist()
    else:
        basicwords = basicvectorizer.get_feature_names()
    # coef_ is two-dimensional; take its first row as a plain list so it lines
    # up one-to-one with the vocabulary.
    basiccoeffs = basicmodel.coef_.tolist()[0]
    coeffdf = pd.DataFrame({'Word' : basicwords,
                            'Coefficient' : basiccoeffs})
    # Largest coefficients first; ties are broken alphabetically by word.
    coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[False, True])
    # Ten largest coefficients.
    coeffdf.head(10)

    # Ten smallest coefficients.
    coeffdf.tail(10)
    
    # Second feature set: ngram_range=(2,2) makes the vectorizer count
    # two-word sequences (bigrams) instead of single words.
    advancedvectorizer = CountVectorizer(ngram_range=(2,2))
    # Fit the bigram vocabulary on the same training documents and encode them.
    advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
    
    print(advancedtrain.shape)
    

    输出 : (1611, 366721)

    # Second model: logistic regression fitted on the bigram count matrix, with
    # the training frame's "Label" column as the target.
    advancedmodel = LogisticRegression()
    advancedmodel = advancedmodel.fit(advancedtrain, train["Label"])

    # Rebuild the test documents (cells at column positions 2..26, stringified
    # and space-joined) so this section can run on its own.
    testheadlines = [
        ' '.join(str(x) for x in test.iloc[row, 2:27])
        for row in range(len(test.index))
    ]
    # transform() only: reuse the bigram vocabulary fitted on the training set.
    advancedtest = advancedvectorizer.transform(testheadlines)
    advpredictions = advancedmodel.predict(advancedtest)

    # Confusion table of actual vs. predicted labels.
    pd.crosstab(test["Label"], advpredictions, rownames=["Actual"], colnames=["Predicted"])
    #.57
    # NOTE(review): the .57 above is the author's recorded score; the code
    # that computed it is not shown -- presumably accuracy, confirm.

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer get_feature_names_out() and only fall back to the old method
    # on releases that predate it, so the script runs on both.
    if hasattr(advancedvectorizer, 'get_feature_names_out'):
        advwords = advancedvectorizer.get_feature_names_out().tolist()
    else:
        advwords = advancedvectorizer.get_feature_names()
    # coef_ is two-dimensional; take its first row as a plain list so it lines
    # up one-to-one with the bigram vocabulary.
    advcoeffs = advancedmodel.coef_.tolist()[0]
    advcoeffdf = pd.DataFrame({'Words' : advwords,
                            'Coefficient' : advcoeffs})
    # Largest coefficients first; ties are broken alphabetically by bigram.
    advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[False, True])
    # Ten largest coefficients.
    advcoeffdf.head(10)

    # Ten smallest coefficients.
    advcoeffdf.tail(10)
    
    gensim
    

    相关文章

      网友评论

          本文标题:股价预测

          本文链接:https://www.haomeiwen.com/subject/vbziuftx.html