-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stock_Prediction_Sklearn_Regression.py
153 lines (121 loc) · 4.46 KB
/
Stock_Prediction_Sklearn_Regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#Import requirements
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib as mpl
import math
import numpy as np
from pandas import Series, DataFrame
#pip install pandas-datareader
import pandas_datareader as web
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# --- Download the price history and draw the exploratory charts -----------
# Query window: 1 January 2010 up to the moment the script is run.
start = datetime.datetime(year=2010, month=1, day=1)
end = datetime.datetime.now()

# Leftover notebook hint: !pip install yfinance --upgrade --no-cache-dir
# NOTE(review): yfinance is not imported here; the download below goes through
# pandas-datareader's 'yahoo' source - confirm that source still responds.
df = web.DataReader("AAPL", 'yahoo', start, end)

# Adjusted close series and its 100-row rolling mean.
close_px = df['Adj Close']
mavg = close_px.rolling(window=100).mean()

# Default figure size for every chart drawn by this script.
mpl.rc('figure', figsize=(8, 7))
# Bare expression kept from the original; it has no visible effect when the
# file runs as a plain script.
mpl.__version__
# Switch matplotlib to the ggplot look before anything is drawn.
style.use('ggplot')

# Price and moving average share one set of axes; the legend is created
# before the return series is added.
close_px.plot(label='AAPL')
mavg.plot(label='mavg')
plt.legend()

# Simple row-over-row return: today's price over the previous row's, minus 1.
rets = close_px.div(close_px.shift(1)) - 1
rets.plot(label='return')
plt.show()
# --- Feature engineering and forecast-window preparation -------------------
# Start from adjusted close and volume, then add two percentage features.
dfreg = df.loc[:, ['Adj Close', 'Volume']]
# High-low spread expressed as a percentage of the close.
dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
# Open-to-close move expressed as a percentage of the open.
dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
print(dfreg.head())

# Missing values are not dropped: they are replaced by the sentinel -99999.
dfreg.fillna(value=-99999, inplace=True)
print(dfreg.shape)

# Hold back 2 percent of the rows (rounded up) as the forecast horizon.
forecast_out = int(math.ceil(0.02 * len(dfreg)))
print("FORECAST OUT")
print(forecast_out)

# The label is the adjusted close `forecast_out` rows ahead, so the final
# `forecast_out` rows end up with a NaN label.
forecast_col = 'Adj Close'
dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)

# Feature matrix: every column except the label. `columns=` is used because
# pandas 2.0 no longer accepts `axis` as a positional argument to drop().
X = np.array(dfreg.drop(columns=['label']))
# Standardise the features so they share a comparable scale for regression.
# NOTE(review): scaling happens before the train/test split, so the test rows
# influence the scaling statistics - confirm this is acceptable.
X = preprocessing.scale(X)

# The last `forecast_out` rows have no label yet: keep them aside for the
# final forecast and use the remaining rows for training and evaluation.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Target vector, trimmed the same way so it lines up row-for-row with X.
y = np.array(dfreg['label'])
y = y[:-forecast_out]
print('Dimension of X', X.shape)
print('Dimension of y', y.shape)
# --- Train/test split and model fitting ------------------------------------
# Random 80/20 split of the labelled rows.
# NOTE(review): no random_state is given, so every run produces a different
# split and therefore different scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
train_len = len(X_train)
test_len = len(X_test)
print('train length', train_len)
print('test length', test_len)

# Linear regression
clfreg = LinearRegression(n_jobs=-2)
clfreg.fit(X_train, y_train)

# Degree-2 polynomial features followed by ridge regression
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Degree-3 polynomial features followed by ridge regression
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# LassoCV. The original also passed `normalize=False`; that keyword was
# removed in scikit-learn 1.2 and now raises TypeError. False was its default,
# so omitting it leaves the fit unchanged. The alpha-grid size is likewise
# left at the library default of 100 rather than passed explicitly.
clfLasso = LassoCV(eps=0.002, fit_intercept=True)
clfLasso.fit(X_train, y_train)
# --- Score every fitted model on the held-out split ------------------------
# Names are listed in the same order as the entries of `scores` below, so one
# index addresses both lists.
modelNames = ["LinearRegression", "Quadratic2", "Quadratic3", "LassoCV"]

# score() is called on the test rows for each estimator.
confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test, y_test)
confidencepoly3 = clfpoly3.score(X_test, y_test)
confLasso = clfLasso.score(X_test, y_test)
scores = [confidencereg, confidencepoly2, confidencepoly3, confLasso]

# Report each score next to a human-readable label.
for caption, value in (
    ("The linear regression confidence is: ", confidencereg),
    ("The quadratic regression 2 confidence is: ", confidencepoly2),
    ("The quadratic regression 3 confidence is: ", confidencepoly3),
    ("The LassoCV confidence is: ", confLasso),
):
    print(caption, value)

# Positions of every model that reaches the top score; the first one is
# treated as the winner by the code that follows.
best_score = max(scores)
bsIndex = [position for position, value in enumerate(scores) if value == best_score]
print("BEST SCORE: " + str(best_score) + " WAS WITH MODEL:" + str(modelNames[bsIndex[0]]))
# --- Forecast with the best-scoring model and plot the result --------------
# Look the winning estimator up by name instead of testing four separate
# conditions; `forecast` is then always bound.
best_name = modelNames[bsIndex[0]]
fitted_models = {
    "LinearRegression": clfreg,
    "Quadratic2": clfpoly2,
    "Quadratic3": clfpoly3,
    "LassoCV": clfLasso,
}
forecast = fitted_models[best_name].predict(X_lately)

# New column for the predictions; it must stay the LAST column because the
# rows appended below put the predicted value in the final position.
dfreg['ForecastReg'] = np.nan

# Show the score of the model that produced the forecast (the original always
# printed the linear-regression score, even when another model had won).
print(forecast, scores[bsIndex[0]], forecast_out)

# Append one row per predicted value, dated on consecutive calendar days
# after the last row of the frame.
# NOTE(review): calendar days include weekends, while the source rows look
# like trading days only - confirm whether business days are wanted here.
last_date = dfreg.index[-1]
empty_cells = [np.nan] * (len(dfreg.columns) - 1)
for step, predicted_price in enumerate(forecast, start=1):
    next_date = last_date + datetime.timedelta(days=step)
    print(next_date)
    # Every column is NaN except the trailing ForecastReg cell.
    dfreg.loc[next_date] = empty_cells + [predicted_price]

# Plot the last 1000 rows of the actual prices and of the forecast together.
dfreg['Adj Close'].tail(1000).plot()
dfreg['ForecastReg'].tail(1000).plot()
plt.legend(loc=4)
plt.title("Price prediction with " + str(best_name) + " Model")
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()