|
1 | 1 | import pandas as pd
|
2 | 2 | import numpy as np
|
3 | 3 | import matplotlib.pyplot as plt
|
| 4 | +from pandas import DataFrame |
4 | 5 | from sklearn.naive_bayes import GaussianNB
|
5 | 6 | from sklearn.calibration import CalibratedClassifierCV
|
6 | 7 | from sklearn import linear_model as lm
|
7 |
| -from sklearn.ensemble import RandomForestRegressor |
8 |
| -from datetime import datetime, timedelta |
9 |
| -from scipy.optimize import curve_fit |
10 |
| - |
11 |
| -# Default variables |
12 |
| -predicted_date = "20-Nov-24" |
13 |
| -predicted_list = [] |
14 |
| -actual_list = [] |
15 |
| - |
16 |
| -# Read the data from the CSV file and parse dates |
17 |
| -df = pd.read_csv("results.csv", dtype={'Date': 'object'}) |
18 |
| - |
19 |
| -# Convert data to DataFrame |
20 |
| -df['date'] = pd.to_datetime(df['Date'], format='%d-%b-%y') |
21 |
| - |
22 |
| -# Convert dates to ordinal for regression |
23 |
| -df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal()) |
24 |
| - |
25 |
| -# Prepare training data |
26 |
| -X = df['date_ordinal'].values.reshape(-1, 1) |
27 |
| -y = df['Numbers'].values |
28 |
| - |
29 |
| -# Train the Gaussian Naive Bayes model with calibration |
30 |
| -gnb = GaussianNB() |
31 |
| -gnb.fit(X, y) |
32 |
| - |
33 |
| -# Calibrate the model |
34 |
| -model = CalibratedClassifierCV(gnb, cv="prefit") |
35 |
| -model.fit(X, y) |
36 |
| - |
37 |
| -# loop |
38 |
| -for date_ordinal in df['date_ordinal']: |
39 |
| - predicted_proba = model.predict_proba([[date_ordinal]]) |
40 |
| - predicted_value = np.dot(predicted_proba, gnb.classes_) # Calculate the expected value |
41 |
| - predicted_list.append(predicted_value[0]) # Append the expected value, not the entire array |
42 |
| - actual_value = df.loc[df['date_ordinal'] == date_ordinal, 'Numbers'].iloc[0] |
43 |
| - actual_list.append(int(actual_value)) |
44 |
| - |
45 |
| -# Predict for a future date using the Naive Bayes model |
46 |
| -next_date = datetime.strptime(predicted_date, '%d-%b-%y') |
47 |
| -next_date_ordinal = next_date.toordinal() |
48 |
| -next_proba = model.predict_proba([[next_date_ordinal]]) |
49 |
| -next_value = np.dot(next_proba, gnb.classes_) # Calculate the expected value for the next date |
50 |
| - |
51 |
| -# Train the model Linear Regression to predict the next value using the linear trend |
52 |
| -# linear_model = lm.LinearRegression() |
53 |
| -# linear_model = lm.TheilSenRegressor() |
54 |
| -# linear_model = lm.LogisticRegression(C=1.0) |
55 |
| -# linear_model = lm.GammaRegressor() |
56 |
| -# linear_model = RandomForestRegressor(n_estimators=9, random_state=0) |
57 |
| -linear_model = lm.BayesianRidge() |
58 |
| -linear_model.fit(X, y) |
59 |
| - |
60 |
| -# Predict using the linear trend model |
61 |
| -next_value_linear = linear_model.predict([[next_date_ordinal]]) |
62 |
| - |
63 |
| -# Calculate the slope predicted by the linear regression model |
64 |
| -slope = linear_model.coef_[0] |
65 |
| - |
66 |
| -# A sine function to the data for cyclical adjustments |
67 |
| -def sinusoidal_model(x, A, B, C, D): |
68 |
| - return A * np.sin(B * x + C) + D |
69 |
| - |
70 |
| -# Wrapper to convert dates |
71 |
| -x_data = df['date_ordinal'] |
72 |
| -y_data = df['Numbers'] |
73 |
| - |
74 |
| -# Fit the curve |
75 |
| -params, _ = curve_fit(sinusoidal_model, x_data, y_data, p0=[np.std(y), 2 * np.pi / 365, 0, np.mean(y)]) |
76 |
| - |
77 |
| -# Predicting the cyclical adjustment for the given future date |
78 |
| -cyclical_adjustment = sinusoidal_model(next_date_ordinal, *params) |
79 |
| - |
80 |
| -# Calculate Weighted Average prediction |
81 |
| -weight_naive_bayes = 0.99973895177364354734851780911949 # you can adjust these |
82 |
| -weight_linear_trend = slope |
83 |
| -weight_linear_offset = 494509 # you can adjust these |
84 |
| -weight_cyclical_patterns = cyclical_adjustment # you can adjust these |
85 |
| -weight_cyclical = 0.1 |
86 |
| -""" |
87 |
| -next_value_weighted_avg = ( |
88 |
| - weight_naive_bayes * next_value[0] + |
89 |
| - weight_linear_trend * next_value_linear[0] + |
90 |
| - weight_cyclical * cyclical_adjustment |
91 |
| -) |
92 |
| -""" |
93 |
| -next_value_weighted_avg = ( |
94 |
| - weight_naive_bayes * weight_cyclical_patterns + weight_linear_offset + |
95 |
| - weight_linear_trend * next_value_linear[0] |
96 |
| -) |
97 |
| - |
98 |
| -# Add the next prediction |
99 |
| -predicted_list.append(next_value[0]) |
100 |
| -actual_list.append(next_value_weighted_avg) # guess an actual value for the future date |
101 |
| - |
102 |
| -# Append the future date to the date list as datetime objects |
103 |
| -predicted_date = list(pd.to_datetime(df['Date'], format='%d-%b-%y')) + [next_date] |
104 |
| -actual_date = list(pd.to_datetime(df['Date'], format='%d-%b-%y')) + [next_date] |
105 |
| - |
106 |
| -# Create a DataFrame with the actual and predicted values |
107 |
| -result_df = pd.DataFrame({ |
108 |
| - 'ADate': actual_date, |
109 |
| - 'Actual': actual_list, |
110 |
| - 'PDate': predicted_date, |
111 |
| - 'Predicted_NaiveBayes': predicted_list, |
112 |
| - 'Predicted_LinearTrend': list(predicted_list[:-1]) + [next_value_linear[0]], # Use linear trend for the last prediction |
113 |
| - 'Predicted_CyclicalAdjustment': list(predicted_list[:-1]) + [cyclical_adjustment] # Use cyclical adjustment for another prediction |
114 |
| -}) |
115 |
| - |
116 |
| -# Plotting the results |
117 |
| -plt.figure(figsize=(10, 6)) |
118 |
| -plt.plot(result_df['ADate'], result_df['Actual'], color='purple', label='Actual', marker='o') |
119 |
| -plt.plot(result_df['PDate'], result_df['Predicted_LinearTrend'], color='pink', label='Predicted (Linear Trend)', linestyle='-.', marker='^') |
120 |
| -plt.xlabel('Date') |
121 |
| -plt.ylabel('Value') |
122 |
| -plt.title('Actual and Value Prediction') |
123 |
| -plt.legend() |
124 |
| -plt.grid(True) |
125 |
| -plt.xticks(rotation=45) |
126 |
| -plt.tight_layout() |
127 |
| - |
128 |
| -# Annotate the predicted points |
129 |
| -plt.axvline(x=next_date, color='red', linestyle='--') |
130 |
| -plt.annotate(f'{int(next_value_weighted_avg):,}', (next_date, next_value_weighted_avg), textcoords="offset points", xytext=(0, 10), ha='center', color='orange') |
131 |
| -plt.annotate(f'{int(next_value_linear[0]):,}', (next_date, next_value_linear), textcoords="offset points", xytext=(0, -20), ha='center', color='red') |
132 |
| -plt.annotate(f'{int(cyclical_adjustment):,}', (next_date, cyclical_adjustment), textcoords="offset points", xytext=(0, 20), ha='center', color='green') |
133 |
| - |
134 |
| -plt.show() |
135 |
| - |
136 |
| -# Print the predicted next values |
137 |
| -print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Naive Bayes): {int(next_value[0]):,}") |
138 |
| -print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Linear Trend): {int(next_value_linear[0]):,}") |
139 |
| -print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Cyclical Adjustment): {int(cyclical_adjustment):,}") |
140 |
| -print(f"Predicted value for {next_date.strftime('%d-%b-%y')} (Weighted Avg with Cyclical Adjustment): {int(next_value_weighted_avg):,}") |
| 8 | +from datetime import datetime |
| 9 | +from statsmodels.tsa.holtwinters import ExponentialSmoothing |
| 10 | + |
| 11 | +# Configuration constants |
| 12 | +PREDICTED_DATE = "27-Nov-24" |
| 13 | +FILE_PATH = "results.csv" |
| 14 | + |
def load_and_prepare_data(file_path) -> tuple:
    """
    Read the results CSV and derive the inputs used for modeling

    Parameter:
        file_path (str): Path to the CSV file; it is read for a 'Date'
            column (parsed with the '%d-%b-%y' format) and a 'Numbers' column

    Returns:
        tuple: (df, X, y) - the loaded DataFrame with added 'date' and
            'date_ordinal' columns, the ordinals as a single-column 2-D
            array, and the 'Numbers' values as a 1-D array
    """
    # 'Date' is kept as raw text; the parsed copy lives in a separate column
    frame = pd.read_csv(file_path, dtype={'Date': 'object'})
    parsed_dates = pd.to_datetime(frame['Date'], format='%d-%b-%y')
    frame['date'] = parsed_dates

    # Ordinal day numbers give the models a plain numeric time axis
    frame['date_ordinal'] = frame['date'].apply(lambda stamp: stamp.toordinal())

    features = frame['date_ordinal'].values.reshape(-1, 1)
    targets = frame['Numbers'].values
    return frame, features, targets
| 35 | + |
def train_naive_bayes_model(X, y) -> tuple:
    """
    Fit a Gaussian Naive Bayes classifier and a calibrated wrapper around it

    Parameter:
        X (numpy.array): Feature matrix
        y (numpy.array): Target values

    Returns:
        tuple: (gnb, model) - the fitted GaussianNB and the
            CalibratedClassifierCV built on top of it
    """
    base_classifier = GaussianNB()
    base_classifier.fit(X, y)

    # cv='prefit' hands the already-fitted classifier to the calibrator,
    # which is then fitted on the same X and y
    calibrated = CalibratedClassifierCV(base_classifier, cv='prefit')
    calibrated.fit(X, y)

    return base_classifier, calibrated
| 52 | + |
def generate_predictions(df: DataFrame, model: CalibratedClassifierCV, gnb: GaussianNB) -> tuple:
    """
    Generate predictions for existing dates

    The prediction for a row is the expected value of the class labels:
    the class probabilities from `model` dotted with `gnb.classes_`.

    Parameter:
        df (DataFrame): Input data with 'date_ordinal' and 'Numbers' columns
        model (CalibratedClassifierCV): Trained calibrated model
        gnb (GaussianNB): Trained GNB model (supplies the class labels)

    Returns:
        tuple: (predicted_list, actual_list), both in the row order of df
    """
    # Nothing to predict; also avoids calling predict_proba with zero rows
    if df.empty:
        return [], []

    # One batched call instead of one predict_proba call per row
    ordinals = df['date_ordinal'].to_numpy().reshape(-1, 1)
    probabilities = model.predict_proba(ordinals)
    predicted_list = list(np.dot(probabilities, gnb.classes_))

    # Read the actual values positionally. Looking them up by date would
    # return the first match for every row that shares a date.
    actual_list = [int(value) for value in df['Numbers']]

    return predicted_list, actual_list
| 76 | + |
def train_linear_model(X, y) -> tuple:
    """
    Fit a Bayesian Ridge regression on the data

    Parameter:
        X (numpy.array): Feature matrix
        y (numpy.array): Target values

    Returns:
        tuple: (linear_model, slope) - the fitted regressor and its first
            coefficient
    """
    regressor = lm.BayesianRidge()
    regressor.fit(X, y)

    # Only the first coefficient is exposed as the slope
    first_coefficient = regressor.coef_[0]
    return regressor, first_coefficient
| 92 | + |
def calculate_cyclical_adjustments(data: list) -> float:
    """
    Calculate cyclical adjustments based on historical patterns

    For each cycle length (7, 30 and 365 days) the values whose date lies a
    whole number of cycles away from the last entry's date are averaged;
    the result is the mean of those per-cycle averages, truncated to int.

    Parameter:
        data (list): List of (date, value) tuples; dates are strings in
            '%d-%b-%y' format and values are convertible with int()

    Returns:
        float: Calculated cyclical adjustment value (a whole number)

    Raises:
        ValueError: If data is empty
    """
    data = list(data)
    if not data:
        raise ValueError("data must contain at least one (date, value) pair")

    dates = [datetime.strptime(text, '%d-%b-%y') for text, _ in data]
    values = np.array([int(value) for _, value in data])

    # Day offsets are measured from the last entry, not the latest date
    anchor = dates[-1]
    days_back = np.array([(anchor - day).days for day in dates])

    cycles = (7, 30, 365)  # weekly, monthly, yearly

    # The anchor row has an offset of 0, so it is in phase with every cycle
    # and each mask selects at least one value. No fallback is needed.
    cycle_means = [values[days_back % cycle == 0].mean() for cycle in cycles]

    return int(np.mean(cycle_means))
| 124 | + |
def plot_results(result_df: DataFrame, next_date: datetime, next_value_weighted_avg: float, next_value_linear: float, cyclical_adjustment: float) -> None:
    """
    Create and display visualization of results

    Parameter:
        result_df (DataFrame): Results data with 'ADate', 'Actual', 'PDate',
            'Predicted_LinearTrend' and 'Predicted_CyclicalAdjustment' columns
        next_date (datetime): Future prediction date
        next_value_weighted_avg (float): Weighted average prediction
        next_value_linear (float): Linear trend prediction; a plain number
            or a one-element array is accepted
        cyclical_adjustment (float): Cyclical adjustment prediction
    """
    # Callers pass the raw one-element array from predict(), while the
    # signature promises a float. Reduce both forms to a scalar so the
    # label and the annotation anchor share the same value.
    linear_value = float(np.ravel(next_value_linear)[0])

    plt.figure(figsize=(10, 6))
    plt.plot(result_df['ADate'], result_df['Actual'], color='purple', label='Actual', marker='o')
    plt.plot(result_df['PDate'], result_df['Predicted_LinearTrend'], color='pink',
             label='Predicted (Linear Trend)', linestyle='-.', marker='^')
    plt.plot(result_df['PDate'], result_df['Predicted_CyclicalAdjustment'], color='green',
             label='Predicted (Cyclical Adjustment)', linestyle='--', marker='*')

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title('Actual and Value Prediction')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Mark the forecast date and label each prediction in its series colour
    plt.axvline(x=next_date, color='red', linestyle='--')
    plt.annotate(f'{int(next_value_weighted_avg):,}', (next_date, next_value_weighted_avg),
                 textcoords="offset points", xytext=(0, 10), ha='center', color='purple')
    plt.annotate(f'{int(linear_value):,}', (next_date, linear_value),
                 textcoords="offset points", xytext=(0, -20), ha='center', color='pink')
    plt.annotate(f'{int(cyclical_adjustment):,}', (next_date, cyclical_adjustment),
                 textcoords="offset points", xytext=(0, 20), ha='center', color='green')
    plt.show()
| 160 | + |
def print_predictions(next_date, next_value, next_value_linear, cyclical_adjustment, next_value_weighted_avg) -> None:
    """
    Print prediction results

    Each prediction may be a plain number or a one-element array; it is
    truncated to an int and printed with thousands separators.

    Parameter:
        next_date (datetime): Future prediction date
        next_value (float): Naive Bayes prediction
        next_value_linear (float): Linear trend prediction
        cyclical_adjustment (float): Cyclical adjustment prediction
        next_value_weighted_avg (float): Weighted average prediction
    """
    date_label = next_date.strftime('%d-%b-%y')
    report = (
        ("Naive Bayes", next_value),
        ("Linear Trend", next_value_linear),
        ("Cyclical Adjustment", cyclical_adjustment),
        ("Weighted Avg with Cyclical Adjustment", next_value_weighted_avg),
    )
    for label, value in report:
        # np.ravel lets a bare float and the one-element arrays returned by
        # the models go through the same path; a float cannot take `[0]`
        scalar = np.ravel(value)[0]
        print(f"Predicted value for {date_label} ({label}): {int(scalar):,}")
| 176 | + |
def main() -> None:
    """
    Run the prediction process: load data, train, forecast, plot and print
    """
    # Blend constants
    weight_naive_bayes = 0.99973895177364354734851780911949
    # NOTE: the blend uses this fixed number rather than the computed
    # cyclical_adjustment, which is only plotted and printed.
    weight_cyclical_patterns = 9844365
    weight_linear_offset = 42900

    # Load and prepare data
    df, X, y = load_and_prepare_data(FILE_PATH)

    # Train models
    gnb, calibrated_model = train_naive_bayes_model(X, y)
    linear_model, slope = train_linear_model(X, y)

    # Generate predictions for existing dates
    predicted_list, actual_list = generate_predictions(df, calibrated_model, gnb)

    # Predict for future date
    next_date = datetime.strptime(PREDICTED_DATE, '%d-%b-%y')
    next_date_ordinal = next_date.toordinal()
    next_proba = calibrated_model.predict_proba([[next_date_ordinal]])
    next_value = np.dot(next_proba, gnb.classes_)  # expected value over the classes
    next_value_linear = linear_model.predict([[next_date_ordinal]])

    # Calculate cyclical adjustment from the raw (date text, value) pairs
    cyclical_adjustment = calculate_cyclical_adjustments(list(zip(df['Date'], df['Numbers'])))

    # Calculate weighted average prediction
    next_value_weighted_avg = (
        weight_naive_bayes * weight_cyclical_patterns
        + weight_linear_offset
        + slope * next_value_linear[0]
    )

    # df['date'] already holds the parsed dates, so parsing 'Date' again
    # is unnecessary
    date_list = list(df['date']) + [next_date]

    # Snapshot the fitted values before the forecast point is appended
    fitted_values = list(predicted_list)
    predicted_list.append(next_value[0])
    # The "actual" series ends with the blended forecast for the future date
    actual_list.append(next_value_weighted_avg)

    result_df = pd.DataFrame({
        'ADate': date_list,
        'Actual': actual_list,
        'PDate': date_list,
        'Predicted_NaiveBayes': predicted_list,
        'Predicted_LinearTrend': fitted_values + [next_value_linear[0]],
        'Predicted_CyclicalAdjustment': fitted_values + [cyclical_adjustment],
    })

    # Visualize and print results
    plot_results(result_df, next_date, next_value_weighted_avg, next_value_linear, cyclical_adjustment)
    print_predictions(next_date, next_value, next_value_linear, cyclical_adjustment, next_value_weighted_avg)


# Run the pipeline only when executed as a script, so importing this module
# does not read the CSV or open a plot window
if __name__ == "__main__":
    main()
0 commit comments