我编写了一个回归模型,想要用它来预测未来的比特币价格。我试图让它工作,但遇到了多个问题。以下是我的源代码:
# Script from the question: one-hot encodes the CSV's Date column, fits a
# linear regression on the Close price, compares several regressors with a
# grid search, and then tries to predict the price for a single date.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
import matplotlib

# Default figure size for any matplotlib plots.
matplotlib.rcParams['figure.figsize'] = (20,10)

df = pd.read_csv("datasets_1869_18570_bitcoin_cash_price.csv")
price = df.Close
date = df.Date
# One indicator column per distinct value of the Date column.
# NOTE(review): if every date is unique, held-out rows only activate columns
# that were always 0 during fitting — a likely cause of the negative score
# reported for this script; confirm.
date_format = pd.get_dummies(date)
# df_bitcoin = pd.concat([price, date_format], axis="columns")
# df_bitcoin.to_csv('test.csv')

# Hold out 20% of the rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(date_format,price,test_size=0.2, random_state=10)
# print(X_train,y_train)

prediction = LinearRegression()
prediction.fit(X_train,y_train)
# The result of this call is discarded; the next line prints the same score.
prediction.score(X_test,y_test)
print(prediction.score(X_test,y_test))

# Five random 80/20 splits for cross-validation.
crossvalid = ShuffleSplit(n_splits=5,test_size=0.2, random_state=0)
print(cross_val_score(LinearRegression(), date_format, price, cv=crossvalid))


def find_best_model_using_gridsearchcv(x,y):
    """Grid-search three regressors and write the best results to final.csv.

    The ``x`` and ``y`` arguments are accepted but not used: the search is
    fitted on the module-level ``date_format`` and ``price``. Nothing is
    returned; the summary table is written to ``final.csv``.
    """
    # NOTE(review): newer scikit-learn releases appear to have dropped the
    # 'normalize' parameter and the 'mse' criterion name — verify against
    # the installed version.
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['mse', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(date_format, price)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    final = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    final.to_csv("final.csv")


find_best_model_using_gridsearchcv(date_format,price)


def predict_price(dates,price):
    """Build a one-hot row for ``dates`` and return the model's prediction.

    ``price`` is accepted but not used in the body.
    """
    # Position of the column whose label equals ``dates``.
    date_index = np.where(date_format.columns == dates)[0][0]
    # NOTE(review): ``date_index`` is a scalar integer position, so
    # ``.columns`` on it raises the AttributeError reported for this script;
    # ``date_format.columns`` appears to be what was intended.
    x = np.zeros(len(date_index.columns))
    # NOTE(review): the lookup above either yields a non-negative position or
    # raises IndexError, so this guard looks always true when reached.
    if date_index >= 0:
        x[date_index] = 1
    return prediction.predict([x])[0]


# The return value is discarded here.
predict_price('Feb 20, 2018', 1000)
# print(cross_val_score(LinearRegression(), date_format, price, cv=crossvalid))
这是我的csv文件:
Date,Open,High,Low,Close,Volume,Market Cap
"Feb 20, 2018",1543.27,1569.03,1414.35,1418.73,"820,947,000","26,199,800,000"
"Feb 19, 2018",1483.34,1553.81,1483.34,1534.77,"578,906,000","25,179,700,000"
"Feb 18, 2018",1552.10,1641.40,1428.49,1487.46,"907,873,000","26,344,200,000"
"Feb 17, 2018",1548.48,1568.64,1517.14,1551.39,"641,719,000","26,280,100,000"
"Feb 16, 2018",1373.16,1558.66,1369.68,1552.20,"961,010,000","23,302,000,000"
"Feb 15, 2018",1358.65,1400.71,1330.18,1375.81,"502,454,000","23,053,300,000"
"Feb 14, 2018",1229.18,1384.06,1229.18,1362.27,"629,852,000","20,854,300,000"
"Feb 13, 2018",1285.23,1289.42,1205.73,1231.98,"442,663,000","21,803,000,000"
"Feb 12, 2018",1222.43,1302.66,1222.43,1283.91,"466,213,000","20,735,400,000"
"Feb 11, 2018",1255.59,1306.20,1197.87,1217.64,"651,555,000","21,295,600,000"
"Feb 10, 2018",1311.99,1394.81,1215.72,1257.31,"734,606,000","22,249,700,000"
"Feb 09, 2018",1273.87,1342.41,1206.39,1312.11,"1,133,760,000","21,601,200,000"
"Feb 08, 2018",951.57,1345.20,951.57,1284.56,"2,203,710,000","16,133,900,000"
"Feb 07, 2018",971.08,1040.49,902.49,957.96,"866,222,000","16,462,900,000"
"Feb 06, 2018",890.97,978.91,764.02,974.52,"777,420,000","15,103,100,000"
"Feb 05, 2018",1159.89,1172.70,839.95,887.41,"568,117,000","19,659,800,000"
"Feb 04, 2018",1273.17,1286.98,1101.31,1165.38,"656,034,000","21,577,500,000"
"Feb 03, 2018",1194.23,1314.43,1072.83,1272.50,"453,101,000","20,237,400,000"
"Feb 02, 2018",1272.50,1272.50,980.78,1191.14,"896,666,000","21,561,700,000"
"Feb 01, 2018",1491.12,1503.30,1214.88,1274.35,"678,020,000","25,263,200,000"
我的错误是:
AttributeError: 'numpy.int64' 对象没有属性 'columns'
和
AttributeError: 'numpy.int64' 对象没有属性 'columns'
我的回归模型似乎有负分数,即 -0.12086295635446963。我使用的是 LinearRegression().score
回答:
`date_index` 是一个标量(`numpy.int64`),但你写成了 `date_index.columns`,这引发了错误。
详细信息:
一切正常,直到代码的这一部分:
def predict_price(dates,price):
    """Build a one-hot row for ``dates`` and return the model's prediction.

    This is the failing version quoted from the question. ``price`` is
    accepted but not used in the body.
    """
    # Position of the column whose label equals ``dates``.
    date_index = np.where(date_format.columns == dates)[0][0]
    # NOTE(review): ``date_index`` is a scalar integer position, so
    # ``.columns`` on it raises the reported AttributeError.
    x = np.zeros(len(date_index.columns))
    if date_index >= 0:
        x[date_index] = 1
    return prediction.predict([x])[0]


predict_price('Feb 20, 2018', 1000)
在这里,`date_index` 是一个标量,而你写成了 `date_index.columns`,这引发了错误;应当使用 `date_format.columns`。
使用以下代码:
def predict_price(dates, price):
    """Return the fitted model's prediction for a single date.

    Args:
        dates: Date label to look up among the columns of the module-level
            ``date_format`` one-hot frame (e.g. ``'Feb 20, 2018'``).
        price: Unused; kept so existing calls keep working.

    Returns:
        The value predicted by the module-level ``prediction`` model for the
        one-hot row that encodes ``dates``.

    Raises:
        IndexError: If ``dates`` is not one of the ``date_format`` columns.
    """
    matches = np.where(date_format.columns == dates)[0]
    if matches.size == 0:
        # Indexing an empty match array would raise a bare IndexError;
        # keep the exception type but say which date was missing.
        raise IndexError(f"date {dates!r} is not a column of date_format")

    # This is the changed part: size the row from the one-hot frame's
    # columns, not from the scalar index (which has no `.columns`).
    x = np.zeros(len(date_format.columns))
    x[matches[0]] = 1
    return prediction.predict([x])[0]


predict_price('Feb 20, 2018', 1000)