Skip to content

Commit 200eff5

Browse files
authored Nov 27, 2024
Update sklearn_predictions.py
Refactored the script into functions for code reusability
1 parent 5797227 commit 200eff5

File tree

1 file changed

+237
-134
lines changed

1 file changed

+237
-134
lines changed
 

‎sklearn_predictions.py

+237-134
Original file line numberDiff line numberDiff line change
@@ -1,140 +1,243 @@
11
import pandas as pd
22
import numpy as np
33
import matplotlib.pyplot as plt
4+
from pandas import DataFrame
45
from sklearn.naive_bayes import GaussianNB
56
from sklearn.calibration import CalibratedClassifierCV
67
from sklearn import linear_model as lm
7-
from sklearn.ensemble import RandomForestRegressor
8-
from datetime import datetime, timedelta
9-
from scipy.optimize import curve_fit
10-
11-
# Default variables
12-
predicted_date = "20-Nov-24"
13-
predicted_list = []
14-
actual_list = []
15-
16-
# Read the data from the CSV file and parse dates
17-
df = pd.read_csv("results.csv", dtype={'Date': 'object'})
18-
19-
# Convert data to DataFrame
20-
df['date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')
21-
22-
# Convert dates to ordinal for regression
23-
df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal())
24-
25-
# Prepare training data
26-
X = df['date_ordinal'].values.reshape(-1, 1)
27-
y = df['Numbers'].values
28-
29-
# Train the Gaussian Naive Bayes model with calibration
30-
gnb = GaussianNB()
31-
gnb.fit(X, y)
32-
33-
# Calibrate the model
34-
model = CalibratedClassifierCV(gnb, cv="prefit")
35-
model.fit(X, y)
36-
37-
# loop
38-
for date_ordinal in df['date_ordinal']:
39-
predicted_proba = model.predict_proba([[date_ordinal]])
40-
predicted_value = np.dot(predicted_proba, gnb.classes_) # Calculate the expected value
41-
predicted_list.append(predicted_value[0]) # Append the expected value, not the entire array
42-
actual_value = df.loc[df['date_ordinal'] == date_ordinal, 'Numbers'].iloc[0]
43-
actual_list.append(int(actual_value))
44-
45-
# Predict for a future date using the Naive Bayes model
46-
next_date = datetime.strptime(predicted_date, '%d-%b-%y')
47-
next_date_ordinal = next_date.toordinal()
48-
next_proba = model.predict_proba([[next_date_ordinal]])
49-
next_value = np.dot(next_proba, gnb.classes_) # Calculate the expected value for the next date
50-
51-
# Train the model Linear Regression to predict the next value using the linear trend
52-
# linear_model = lm.LinearRegression()
53-
# linear_model = lm.TheilSenRegressor()
54-
# linear_model = lm.LogisticRegression(C=1.0)
55-
# linear_model = lm.GammaRegressor()
56-
# linear_model = RandomForestRegressor(n_estimators=9, random_state=0)
57-
linear_model = lm.BayesianRidge()
58-
linear_model.fit(X, y)
59-
60-
# Predict using the linear trend model
61-
next_value_linear = linear_model.predict([[next_date_ordinal]])
62-
63-
# Calculate the slope predicted by the linear regression model
64-
slope = linear_model.coef_[0]
65-
66-
# A sine function to the data for cyclical adjustments
67-
def sinusoidal_model(x, A, B, C, D):
    """
    Evaluate a sine wave used to model cyclical behaviour.

    Parameter:
        x: Point(s) at which to evaluate the wave
        A: Amplitude of the wave
        B: Angular frequency applied to x
        C: Phase shift
        D: Vertical offset (baseline)

    Returns:
        A * sin(B * x + C) + D, with the same shape as x
    """
    phase = B * x + C
    return D + A * np.sin(phase)
69-
70-
# Wrapper to convert dates
71-
x_data = df['date_ordinal']
72-
y_data = df['Numbers']
73-
74-
# Fit the curve
75-
params, _ = curve_fit(sinusoidal_model, x_data, y_data, p0=[np.std(y), 2 * np.pi / 365, 0, np.mean(y)])
76-
77-
# Predicting the cyclical adjustment for the given future date
78-
cyclical_adjustment = sinusoidal_model(next_date_ordinal, *params)
79-
80-
# Calculate Weighted Average prediction
81-
weight_naive_bayes = 0.99973895177364354734851780911949 # you can adjust these
82-
weight_linear_trend = slope
83-
weight_linear_offset = 494509 # you can adjust these
84-
weight_cyclical_patterns = cyclical_adjustment # you can adjust these
85-
weight_cyclical = 0.1
86-
"""
87-
next_value_weighted_avg = (
88-
weight_naive_bayes * next_value[0] +
89-
weight_linear_trend * next_value_linear[0] +
90-
weight_cyclical * cyclical_adjustment
91-
)
92-
"""
93-
next_value_weighted_avg = (
94-
weight_naive_bayes * weight_cyclical_patterns + weight_linear_offset +
95-
weight_linear_trend * next_value_linear[0]
96-
)
97-
98-
# Add the next prediction
99-
predicted_list.append(next_value[0])
100-
actual_list.append(next_value_weighted_avg) # guess an actual value for the future date
101-
102-
# Append the future date to the date list as datetime objects
103-
predicted_date = list(pd.to_datetime(df['Date'], format='%d-%b-%y')) + [next_date]
104-
actual_date = list(pd.to_datetime(df['Date'], format='%d-%b-%y')) + [next_date]
105-
106-
# Create a DataFrame with the actual and predicted values
107-
result_df = pd.DataFrame({
108-
'ADate': actual_date,
109-
'Actual': actual_list,
110-
'PDate': predicted_date,
111-
'Predicted_NaiveBayes': predicted_list,
112-
'Predicted_LinearTrend': list(predicted_list[:-1]) + [next_value_linear[0]], # Use linear trend for the last prediction
113-
'Predicted_CyclicalAdjustment': list(predicted_list[:-1]) + [cyclical_adjustment] # Use cyclical adjustment for another prediction
114-
})
115-
116-
# Plotting the results
117-
plt.figure(figsize=(10, 6))
118-
plt.plot(result_df['ADate'], result_df['Actual'], color='purple', label='Actual', marker='o')
119-
plt.plot(result_df['PDate'], result_df['Predicted_LinearTrend'], color='pink', label='Predicted (Linear Trend)', linestyle='-.', marker='^')
120-
plt.xlabel('Date')
121-
plt.ylabel('Value')
122-
plt.title('Actual and Value Prediction')
123-
plt.legend()
124-
plt.grid(True)
125-
plt.xticks(rotation=45)
126-
plt.tight_layout()
127-
128-
# Annotate the predicted points
129-
plt.axvline(x=next_date, color='red', linestyle='--')
130-
plt.annotate(f'{int(next_value_weighted_avg):,}', (next_date, next_value_weighted_avg), textcoords="offset points", xytext=(0, 10), ha='center', color='orange')
131-
plt.annotate(f'{int(next_value_linear[0]):,}', (next_date, next_value_linear), textcoords="offset points", xytext=(0, -20), ha='center', color='red')
132-
plt.annotate(f'{int(cyclical_adjustment):,}', (next_date, cyclical_adjustment), textcoords="offset points", xytext=(0, 20), ha='center', color='green')
133-
134-
plt.show()
135-
136-
# Print the predicted next values
137-
print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Naive Bayes): {int(next_value[0]):,}")
138-
print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Linear Trend): {int(next_value_linear[0]):,}")
139-
print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Cyclical Adjustment): {int(cyclical_adjustment):,}")
140-
print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Weighted Avg with Cyclical Adjustment): {int(next_value_weighted_avg):,}")
8+
from datetime import datetime
9+
from statsmodels.tsa.holtwinters import ExponentialSmoothing
10+
11+
# Configuration constants
12+
PREDICTED_DATE = "27-Nov-24"
13+
FILE_PATH = "results.csv"
14+
15+
def load_and_prepare_data(file_path) -> tuple:
    """
    Read the results CSV and derive the columns the models train on.

    Parameter:
        file_path (str): Path to the CSV file; it must provide a 'Date'
            column formatted like '27-Nov-24' and a 'Numbers' column

    Returns:
        tuple: (frame, features, targets) where frame is the DataFrame with
            added 'date' and 'date_ordinal' columns, features holds the
            ordinal dates as a single-column 2-D array, and targets holds
            the 'Numbers' column as a 1-D array
    """
    # Keep 'Date' as raw text so it is parsed with the explicit format below.
    frame = pd.read_csv(file_path, dtype={'Date': 'object'})

    parsed_dates = pd.to_datetime(frame['Date'], format='%d-%b-%y')
    frame['date'] = parsed_dates
    # Ordinals give the models a plain numeric time axis.
    frame['date_ordinal'] = parsed_dates.map(lambda stamp: stamp.toordinal())

    features = frame['date_ordinal'].to_numpy().reshape(-1, 1)
    targets = frame['Numbers'].to_numpy()
    return frame, features, targets
35+
36+
def train_naive_bayes_model(X, y) -> tuple:
    """
    Fit a Gaussian Naive Bayes classifier and a calibrated wrapper around it.

    Parameter:
        X (numpy.array): Feature matrix
        y (numpy.array): Target values (each distinct value acts as a class)

    Returns:
        tuple: (fitted GaussianNB, fitted CalibratedClassifierCV wrapping it)
    """
    base_classifier = GaussianNB().fit(X, y)
    # NOTE(review): the calibrator is fitted on the same X, y as the base
    # classifier, and newer scikit-learn releases appear to deprecate
    # cv='prefit' -- confirm against the pinned scikit-learn version.
    calibrated = CalibratedClassifierCV(base_classifier, cv='prefit').fit(X, y)
    return base_classifier, calibrated
52+
53+
def generate_predictions(df: DataFrame, model: CalibratedClassifierCV, gnb: GaussianNB) -> tuple:
    """
    Generate predictions for existing dates

    Parameter:
        df (DataFrame): Input data with 'date_ordinal' and 'Numbers' columns
        model (CalibratedClassifierCV): Trained calibrated model
        gnb (GaussianNB): Trained GNB model (supplies the class labels)

    Returns:
        tuple: (predicted_list, actual_list), one entry per row of df in row
            order. Predictions are expected values: the class labels weighted
            by the calibrated class probabilities.
    """
    # predict_proba cannot be called on zero rows; nothing to predict anyway.
    if df.empty:
        return [], []

    # One batched call replaces one predict_proba call per row.
    features = df['date_ordinal'].to_numpy().reshape(-1, 1)
    probabilities = model.predict_proba(features)
    expected_values = np.dot(probabilities, gnb.classes_)
    predicted_list = list(expected_values)

    # Take each row's own observation. Looking the value up by date ordinal
    # returned the first matching row for every duplicate date and rescanned
    # the whole frame for each row.
    actual_list = [int(value) for value in df['Numbers']]

    return predicted_list, actual_list
76+
77+
def train_linear_model(X, y) -> tuple:
    """
    Fit a Bayesian Ridge regression and expose its trend slope.

    Parameter:
        X (numpy.array): Feature matrix
        y (numpy.array): Target values

    Returns:
        tuple: (fitted BayesianRidge model, coefficient of the first feature)
    """
    fitted = lm.BayesianRidge().fit(X, y)
    trend_slope = fitted.coef_[0]
    return fitted, trend_slope
92+
93+
def calculate_cyclical_adjustments(data: list) -> float:
    """
    Calculate cyclical adjustments based on historical patterns

    For each cycle length (7, 30 and 365 days) the values observed a whole
    number of cycles before the last date are averaged; the result is the
    mean of those per-cycle averages, truncated to an integer.

    Parameter:
        data (list): List of (date, value) tuples; dates are strings
            formatted like '27-Nov-24', values are convertible with int()

    Returns:
        int: Calculated cyclical adjustment value

    Raises:
        ValueError: If data is empty or a date does not match the format
    """
    if not data:
        raise ValueError("data must contain at least one (date, value) pair")

    parsed_dates = [datetime.strptime(raw_date, '%d-%b-%y') for raw_date, _ in data]
    values = np.array([int(raw_value) for _, raw_value in data])

    # Offsets are measured back from the last entry, which is the anchor.
    anchor = parsed_dates[-1]
    day_offsets = np.array([(anchor - moment).days for moment in parsed_dates])

    cycles = (7, 30, 365)  # weekly, monthly, yearly
    # The anchor has offset 0, which is a multiple of every cycle, so each
    # mask selects at least one value and no fallback forecast is needed.
    cycle_means = [values[day_offsets % cycle == 0].mean() for cycle in cycles]

    return int(np.mean(cycle_means))
124+
125+
def plot_results(result_df: DataFrame, next_date: datetime, next_value_weighted_avg: float, next_value_linear: float, cyclical_adjustment: float) -> None:
    """
    Create and display visualization of results

    Parameter:
        result_df (DataFrame): Results data with 'ADate', 'Actual', 'PDate',
            'Predicted_LinearTrend' and 'Predicted_CyclicalAdjustment' columns
        next_date (datetime): Future prediction date
        next_value_weighted_avg (float): Weighted average prediction
        next_value_linear (float): Linear trend prediction; a one-element
            array (as returned by a model's predict()) is accepted too
        cyclical_adjustment (float): Cyclical adjustment prediction
    """
    # Collapse a scalar or one-element array to a plain number so both the
    # label and the annotation coordinate work for either input form.
    linear_point = float(np.ravel(next_value_linear)[0])

    plt.figure(figsize=(10, 6))
    plt.plot(result_df['ADate'], result_df['Actual'], color='purple', label='Actual', marker='o')
    plt.plot(result_df['PDate'], result_df['Predicted_LinearTrend'], color='pink',
             label='Predicted (Linear Trend)', linestyle='-.', marker='^')
    plt.plot(result_df['PDate'], result_df['Predicted_CyclicalAdjustment'], color='green',
             label='Predicted (Cyclical Adjustment)', linestyle='--', marker='*')

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title('Actual and Value Prediction')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Mark the forecast date and label each forecast in its series colour.
    plt.axvline(x=next_date, color='red', linestyle='--')
    annotations = (
        (next_value_weighted_avg, (0, 10), 'purple'),
        (linear_point, (0, -20), 'pink'),
        (cyclical_adjustment, (0, 20), 'green'),
    )
    for value, offset, colour in annotations:
        plt.annotate(f'{int(value):,}', (next_date, value),
                     textcoords="offset points", xytext=offset, ha='center', color=colour)
    plt.show()
160+
161+
def print_predictions(next_date, next_value, next_value_linear, cyclical_adjustment, next_value_weighted_avg) -> None:
    """
    Print prediction results

    Parameter:
        next_date (datetime): Future prediction date
        next_value (float): Naive Bayes prediction; a one-element array is
            accepted too
        next_value_linear (float): Linear trend prediction; a one-element
            array is accepted too
        cyclical_adjustment (float): Cyclical adjustment prediction
        next_value_weighted_avg (float): Weighted average prediction
    """
    date_label = next_date.strftime('%d-%b-%y')

    # Model outputs may arrive as one-element arrays or as plain numbers;
    # np.ravel handles both, so indexing no longer fails on a scalar.
    report_rows = (
        ("Naive Bayes", np.ravel(next_value)[0]),
        ("Linear Trend", np.ravel(next_value_linear)[0]),
        ("Cyclical Adjustment", cyclical_adjustment),
        ("Weighted Avg with Cyclical Adjustment", next_value_weighted_avg),
    )
    for label, value in report_rows:
        print(f"Predicted value for {date_label} ({label}): {int(value):,}")
176+
177+
def main() -> None:
    """
    Run the prediction process end to end.

    Loads the data, trains the Naive Bayes and linear models, forecasts
    PREDICTED_DATE, blends the forecasts, then plots and prints the results.
    """
    # Hand-tuned blend constants.
    weight_naive_bayes = 0.99973895177364354734851780911949
    # Fixed stand-in used in place of the computed cyclical_adjustment.
    weight_cyclical_patterns = 9844365
    weight_linear_offset = 42900

    # Load and prepare data
    df, X, y = load_and_prepare_data(FILE_PATH)

    # Train models
    gnb, calibrated_model = train_naive_bayes_model(X, y)
    linear_model, slope = train_linear_model(X, y)

    # Generate predictions for existing dates
    predicted_list, actual_list = generate_predictions(df, calibrated_model, gnb)

    # Predict for the future date
    next_date = datetime.strptime(PREDICTED_DATE, '%d-%b-%y')
    next_date_ordinal = next_date.toordinal()
    next_proba = calibrated_model.predict_proba([[next_date_ordinal]])
    # Expected value: class labels weighted by their calibrated probabilities.
    next_value = np.dot(next_proba, gnb.classes_)
    next_value_linear = linear_model.predict([[next_date_ordinal]])

    # Calculate cyclical adjustment
    data_for_cyclical_adjustment = list(zip(df['Date'], df['Numbers']))
    cyclical_adjustment = calculate_cyclical_adjustments(data_for_cyclical_adjustment)

    # Blend the forecasts.
    # NOTE(review): this uses the fixed weight_cyclical_patterns rather than
    # the computed cyclical_adjustment, and multiplies the fitted slope by the
    # linear forecast -- confirm the formula is intended.
    next_value_weighted_avg = (
        weight_naive_bayes * weight_cyclical_patterns
        + weight_linear_offset
        + slope * next_value_linear[0]
    )

    # Reuse the dates parsed during loading instead of parsing 'Date' again.
    all_dates = list(df['date']) + [next_date]

    # Snapshot the historical predictions before the forecast is appended.
    historical_predictions = list(predicted_list)
    predicted_list.append(next_value[0])
    # The future row has no observation; the blended forecast stands in.
    actual_list.append(next_value_weighted_avg)

    # NOTE(review): the historical part of the LinearTrend and
    # CyclicalAdjustment columns reuses the Naive Bayes predictions; only the
    # final point comes from the named model.
    result_df = pd.DataFrame({
        'ADate': all_dates,
        'Actual': actual_list,
        'PDate': all_dates,
        'Predicted_NaiveBayes': predicted_list,
        'Predicted_LinearTrend': historical_predictions + [next_value_linear[0]],
        'Predicted_CyclicalAdjustment': historical_predictions + [cyclical_adjustment],
    })

    # Visualize and print results
    plot_results(result_df, next_date, next_value_weighted_avg, next_value_linear, cyclical_adjustment)
    print_predictions(next_date, next_value, next_value_linear, cyclical_adjustment, next_value_weighted_avg)


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)
Please sign in to comment.