import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Silence every warning category for the whole process (this also hides
# deprecation warnings emitted by pandas / scikit-learn).
warnings.filterwarnings('ignore')

# Font configuration for Chinese labels: try SimHei first, then DejaVu Sans,
# and disable the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

class SalesForecaster:
    """Sales forecasting pipeline that trains and compares several regressors.

    Typical usage: ``generate_sample_data`` -> ``train_models`` ->
    ``forecast_future`` -> ``plot_results``.

    Attributes:
        data: DataFrame with at least ``date`` and ``sales`` columns (plus
            ``is_holiday`` when produced by ``generate_sample_data``), or
            ``None`` before any data is available.
        models: Mapping of display name -> fitted model.
        features: Ordered list of feature column names used for training,
            set by ``prepare_data``; ``None`` before that.
    """

    # (month, day) pairs flagged as holidays / promotion days. The same
    # calendar is used for the synthetic history and for forecast dates.
    _HOLIDAY_MONTH_DAYS = ((1, 1), (2, 1), (5, 1), (6, 1), (10, 1), (11, 11))

    def __init__(self):
        self.data = None
        self.models = {}
        self.features = None

    @classmethod
    def _holiday_flags(cls, dates):
        """Return an int array (1 = holiday, 0 = not) aligned with *dates*."""
        idx = pd.DatetimeIndex(pd.to_datetime(dates))
        months = np.asarray(idx.month)
        days = np.asarray(idx.day)
        mask = np.zeros(len(idx), dtype=bool)
        for month, day in cls._HOLIDAY_MONTH_DAYS:
            mask |= (months == month) & (days == day)
        return mask.astype(int)

    def generate_sample_data(self, start_date='2022-01-01', periods=730, seed=None):
        """Generate synthetic daily sales and store them in ``self.data``.

        The series is trend + yearly seasonality + weekly seasonality +
        Gaussian noise, floored at 20 and truncated to integers, with an
        extra uplift on holiday dates.

        Args:
            start_date: First date of the series.
            periods: Number of consecutive days to generate.
            seed: Optional seed for reproducible output. When ``None`` the
                global ``np.random`` state is used, so a prior
                ``np.random.seed`` call is still honoured.

        Returns:
            DataFrame with columns ``date``, ``sales`` and ``is_holiday``.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        dates = pd.date_range(start=start_date, periods=periods, freq='D')
        day_index = np.arange(periods)

        # Base trend + seasonality + noise
        trend = np.linspace(100, 200, periods)
        seasonal = 30 * np.sin(2 * np.pi * day_index / 365.25)
        weekly = 10 * np.sin(2 * np.pi * day_index / 7)
        noise = rng.normal(0, 15, periods)
        sales = trend + seasonal + weekly + noise + 100

        df = pd.DataFrame({
            'date': dates,
            'sales': np.maximum(sales, 20).astype(int),
        })

        # Holiday marker, derived from the calendar so any date range works
        df['is_holiday'] = self._holiday_flags(df['date'])
        holiday_mask = df['is_holiday'] == 1
        n_holidays = int(holiday_mask.sum())
        if n_holidays:
            # One independent uplift per holiday rather than a single shared draw
            df.loc[holiday_mask, 'sales'] += rng.randint(50, 150, size=n_holidays)

        self.data = df
        return df

    def create_features(self, df):
        """Build calendar, lag and rolling-mean features from *df*.

        Args:
            df: DataFrame with a datetime ``date`` column and a ``sales``
                column, one row per day in chronological order. It is not
                modified.

        Returns:
            A copy of *df* with the feature columns added and every row
            containing a NaN removed. Because of the 365-day lag, the first
            365 rows are always dropped.
        """
        df = df.copy()
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['dayofweek'] = df['date'].dt.dayofweek
        df['quarter'] = df['date'].dt.quarter
        df['dayofyear'] = df['date'].dt.dayofyear

        # Lag features
        for lag in [1, 7, 14, 30]:
            df[f'sales_lag_{lag}'] = df['sales'].shift(lag)

        # Moving averages; shift(1) keeps the current day's sales out of its
        # own feature
        for window in [7, 14, 30]:
            df[f'sales_ma_{window}'] = df['sales'].shift(1).rolling(window=window).mean()

        # Year-over-year / month-over-month reference values
        df['sales_yoy'] = df['sales'].shift(365)
        df['sales_mom'] = df['sales'].shift(30)

        return df.dropna()

    def prepare_data(self, test_days=30):
        """Split the feature matrix chronologically into train and test sets.

        Also records the ordered feature column names in ``self.features``.

        Args:
            test_days: Number of trailing days held out for testing.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test, test_dates)``.

        Raises:
            ValueError: If no data is loaded, the history is too short to
                build any feature row, or *test_days* does not leave at
                least one training row and one test row.
        """
        if self.data is None:
            raise ValueError("请先生成或加载数据")

        df = self.create_features(self.data)
        if df.empty:
            raise ValueError("历史数据不足, 无法构造特征 (至少需要 366 天)")
        if not 0 < test_days < len(df):
            raise ValueError(f"test_days 必须在 1 与 {len(df) - 1} 之间")

        feature_cols = [c for c in df.columns if c not in ['date', 'sales']]
        self.features = feature_cols
        X = df[feature_cols]
        y = df['sales']

        split_idx = len(df) - test_days
        return (X.iloc[:split_idx], X.iloc[split_idx:],
                y.iloc[:split_idx], y.iloc[split_idx:],
                df.iloc[split_idx:]['date'])

    def train_models(self, test_days=30):
        """Fit the models and evaluate them on the held-out tail.

        Args:
            test_days: Number of trailing days used as the test set.

        Returns:
            Tuple ``(results, test_dates)`` where ``results`` maps each model
            name to a dict with keys ``MAE``, ``RMSE``, ``R²``,
            ``predictions`` and ``actual``.
        """
        X_train, X_test, y_train, y_test, test_dates = self.prepare_data(test_days=test_days)

        # Linear regression
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        self.models['线性回归'] = lr

        # Random forest
        rf = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)
        rf.fit(X_train, y_train)
        self.models['随机森林'] = rf

        # Evaluation
        results = {}
        for name, model in self.models.items():
            pred = model.predict(X_test)
            results[name] = {
                'MAE': mean_absolute_error(y_test, pred),
                'RMSE': np.sqrt(mean_squared_error(y_test, pred)),
                'R²': r2_score(y_test, pred),
                'predictions': pred,
                'actual': y_test.values,
            }

        return results, test_dates

    def forecast_future(self, days=30):
        """Forecast sales for the *days* days following the last known date.

        Forecasting is iterative: each predicted value is appended to the
        history so that it feeds the lag features of the following days.

        Args:
            days: Forecast horizon in days.

        Returns:
            DataFrame with a ``date`` column and one column of predictions
            (floored at 0) per model.

        Raises:
            ValueError: If no model has been trained, or if a complete
                feature row cannot be built for a future date.
        """
        if not self.models:
            raise ValueError("请先训练模型")

        future_dates = pd.date_range(
            start=self.data['date'].max() + timedelta(days=1),
            periods=days, freq='D'
        )

        history = self.data.copy()
        # Predictions are floats; widen up front instead of writing floats
        # into an integer column.
        history['sales'] = history['sales'].astype(float)
        has_holiday = 'is_holiday' in history.columns
        future_flags = self._holiday_flags(future_dates)

        predictions = {'date': future_dates}

        for name, model in self.models.items():
            preds = []
            temp_data = history.copy()

            for future_date, flag in zip(future_dates, future_flags):
                # Placeholder sales value: every sales-derived feature is
                # lagged, so it never reaches the feature row of this date.
                new_row = {'date': future_date, 'sales': temp_data['sales'].iloc[-1]}
                if has_holiday:
                    # Without this the row would hold NaN and be removed by
                    # create_features' dropna().
                    new_row['is_holiday'] = int(flag)
                temp_data = pd.concat([temp_data, pd.DataFrame([new_row])],
                                      ignore_index=True)

                feat_df = self.create_features(temp_data)
                if feat_df.empty or feat_df['date'].iloc[-1] != future_date:
                    raise ValueError("无法为未来日期构造完整特征, 请检查数据中是否存在缺失列")

                feat_row = feat_df.iloc[[-1]].drop(columns=['date', 'sales'])
                if self.features is not None:
                    # Same column order as at training time
                    feat_row = feat_row[self.features]

                pred = max(float(model.predict(feat_row)[0]), 0.0)
                preds.append(pred)
                # Feed the same (clipped) value back that is reported
                temp_data.loc[temp_data.index[-1], 'sales'] = pred

            predictions[name] = preds

        return pd.DataFrame(predictions)

    def plot_results(self, results, test_dates, future_df=None):
        """Draw a 2x2 summary figure, save it and show it.

        Panels: historical sales, metric comparison, best model vs. actual
        on the test window, and (optionally) the future forecast. The figure
        is written to ``sales_forecast.png`` in the working directory.

        Args:
            results: Per-model metrics as returned by ``train_models``.
            test_dates: Dates of the test window.
            future_df: Optional output of ``forecast_future``.

        Returns:
            Name of the model with the lowest MAE.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # 1. Historical sales
        ax1 = axes[0, 0]
        ax1.plot(self.data['date'], self.data['sales'], alpha=0.7, label='历史销量')
        ax1.set_title('历史销量趋势')
        ax1.set_xlabel('日期')
        ax1.set_ylabel('销量')
        ax1.legend()

        # 2. Model comparison (grouped bars, one group per model)
        ax2 = axes[0, 1]
        models = list(results.keys())
        x = np.arange(len(models))
        width = 0.25

        ax2.bar(x - width, [results[m]['MAE'] for m in models], width, label='MAE')
        ax2.bar(x, [results[m]['RMSE'] for m in models], width, label='RMSE')
        ax2.bar(x + width, [results[m]['R²'] for m in models], width, label='R²')
        ax2.set_xticks(x)
        ax2.set_xticklabels(models)
        ax2.set_title('模型评估指标对比')
        ax2.legend()

        # 3. Prediction vs. actual for the model with the lowest MAE
        ax3 = axes[1, 0]
        best_model = min(results, key=lambda model_name: results[model_name]['MAE'])
        ax3.plot(test_dates, results[best_model]['actual'], 'b-', label='实际值', alpha=0.7)
        ax3.plot(test_dates, results[best_model]['predictions'], 'r--', label=f'{best_model}预测')
        ax3.set_title(f'预测效果对比 (最佳模型: {best_model})')
        ax3.set_xlabel('日期')
        ax3.set_ylabel('销量')
        ax3.legend()

        # 4. Future forecast
        ax4 = axes[1, 1]
        if future_df is not None:
            for col in future_df.columns:
                if col != 'date':
                    ax4.plot(future_df['date'], future_df[col], '--', label=f'{col}预测')
            # Title reflects the actual horizon instead of a fixed 30 days
            ax4.set_title(f'未来{len(future_df)}天销量预测')
            ax4.set_xlabel('日期')
            ax4.set_ylabel('预测销量')
            ax4.legend()

        plt.tight_layout()
        plt.savefig('sales_forecast.png', dpi=150, bbox_inches='tight')
        plt.show()
        print("图表已保存为 sales_forecast.png")

        return best_model


# ========== 主程序 ==========
if __name__ == '__main__':
    # End-to-end demo: generate data, train, forecast, plot, summarise.
    banner = "=" * 50
    print(banner)
    print("销量预测系统")
    print(banner)

    forecaster = SalesForecaster()

    # Step 1: synthetic sales history
    print("\n[1] 生成模拟销售数据...")
    data = forecaster.generate_sample_data()
    first_day = data['date'].min().date()
    last_day = data['date'].max().date()
    print(f"数据范围: {first_day} ~ {last_day}")
    print(f"总记录数: {len(data)}")
    print(f"平均日销量: {data['sales'].mean():.1f}")

    # Step 2: fit the models and report their test metrics
    print("\n[2] 训练预测模型...")
    results, test_dates = forecaster.train_models()

    print("\n模型评估结果:")
    print("-" * 40)
    for name, metrics in results.items():
        print(f"\n{name}:")
        print(
            f"  MAE:  {metrics['MAE']:.2f}\n"
            f"  RMSE: {metrics['RMSE']:.2f}\n"
            f"  R²:   {metrics['R²']:.4f}"
        )

    # Step 3: forecast the next 30 days and preview the first week
    print("\n[3] 预测未来30天销量...")
    future_predictions = forecaster.forecast_future(days=30)
    print("\n未来7天预测:")
    print(future_predictions.head(7).to_string())

    # Step 4: charts; plot_results also returns the lowest-MAE model name
    print("\n[4] 生成可视化图表...")
    best_model = forecaster.plot_results(results, test_dates, future_predictions)

    # Step 5: summary of the forecast
    print("\n" + banner)
    print("预测摘要")
    print(banner)
    print(f"最佳模型: {best_model}")
    forecast_start = future_predictions['date'].min().date()
    forecast_end = future_predictions['date'].max().date()
    print(f"预测日期: {forecast_start} ~ {forecast_end}")
    model_columns = [c for c in future_predictions.columns if c != 'date']
    for col in model_columns:
        avg_pred = future_predictions[col].mean()
        print(f"{col} 平均预测销量: {avg_pred:.1f}")