import pandas as pd
import numpy as np

# Load the raw table from the CSV path below (relative to the working directory).
df = pd.read_csv('./data_file/student_habits_performance.csv')

# Binary target column: 1 when exam_score is at least 60, otherwise 0.
passed_mask = df['exam_score'] >= 60
df['pass'] = passed_mask.astype(int)

# Columns used as model inputs.
features = [
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'attendance_percentage',
    'sleep_hours',
    'exercise_frequency',
    'mental_health_rating',
]

# Missing feature values are replaced by the mean of their own column,
# computed over the whole table, before converting to plain NumPy arrays.
feature_frame = df[features]
X = feature_frame.fillna(feature_frame.mean()).to_numpy()
y = df['pass'].to_numpy()

class GNB:
    """Gaussian naive Bayes classifier built on NumPy.

    Every feature is modelled as an independent normal distribution within
    each class. After ``fit`` the instance exposes:

    * ``classes`` - array of the distinct labels found in ``y`` (sorted,
      as returned by ``np.unique``)
    * ``priors``  - dict: class label -> fraction of training rows
    * ``means``   - dict: class label -> per-feature mean vector
    * ``vars``    - dict: class label -> per-feature variance vector
    """

    def fit(self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of shape (n_samples,) holding the class labels.

        Returns:
            The fitted estimator itself, so calls can be chained.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.priors = {}
        self.means = {}
        self.vars = {}

        n_samples = X.shape[0]
        for c in self.classes:
            Xc = X[y == c]
            self.priors[c] = Xc.shape[0] / n_samples
            self.means[c] = Xc.mean(axis=0)
            # Add a tiny constant so a feature that is constant within a
            # class never produces a zero variance (division by zero later).
            self.vars[c] = Xc.var(axis=0) + 1e-8
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            1-D array of predicted labels with the dtype of ``classes``.
            Empty input yields an empty array.

        Raises:
            ValueError: If ``X`` is non-empty and not two-dimensional.
        """
        X = np.asarray(X)
        if X.ndim >= 1 and X.shape[0] == 0:
            return self.classes[:0]
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D with shape (n_samples, n_features); "
                f"got an array with {X.ndim} dimension(s)"
            )

        # One column of unnormalised log posteriors per class.
        log_post = np.empty((X.shape[0], len(self.classes)))
        for j, c in enumerate(self.classes):
            var = self.vars[c]
            # Both terms below depend only on the class, so they are
            # computed once per class instead of once per sample.
            log_prior = np.log(self.priors[c])
            log_norm = -0.5 * np.sum(np.log(2 * np.pi * var))
            # Gaussian log-likelihood exponent for all samples at once.
            quad = 0.5 * np.sum((X - self.means[c]) ** 2 / var, axis=1)
            log_post[:, j] = log_prior + (log_norm - quad)

        # argmax keeps the first maximum, so ties go to the lowest class.
        return self.classes[np.argmax(log_post, axis=1)]

if __name__ == "__main__":
    # Stratified 5-fold cross-validation comparing the hand-written GNB
    # with scikit-learn's GaussianNB on identical train/test splits.
    from sklearn.naive_bayes import GaussianNB
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from scipy.stats import ttest_rel

    # Metric key -> scoring function. Insertion order fixes the row order
    # of both report tables, so it must match row_labels below.
    scorers = {
        'acc': accuracy_score,
        'prec': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    row_labels = ['Accuracy', 'Precision', 'Recall', 'F1-score']

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics_my = {key: [] for key in scorers}
    metrics_sklearn = {key: [] for key in scorers}

    for train_idx, test_idx in skf.split(X, y):
        Xtr, Xte = X[train_idx], X[test_idx]
        ytr, yte = y[train_idx], y[test_idx]

        my_clf = GNB()
        my_clf.fit(Xtr, ytr)
        pred_my = my_clf.predict(Xte)

        sk_clf = GaussianNB()
        sk_clf.fit(Xtr, ytr)
        pred_sk = sk_clf.predict(Xte)

        for key, score in scorers.items():
            metrics_my[key].append(score(yte, pred_my))
            metrics_sklearn[key].append(score(yte, pred_sk))

    # Summary statistics (mean and sample standard deviation across folds).
    res = pd.DataFrame({
        'MyMean':      [np.mean(metrics_my[key]) for key in scorers],
        'MyStd':       [np.std(metrics_my[key], ddof=1) for key in scorers],
        'SKLearnMean': [np.mean(metrics_sklearn[key]) for key in scorers],
        'SKLearnStd':  [np.std(metrics_sklearn[key], ddof=1) for key in scorers],
    }, index=row_labels)

    # Paired t-test per metric across the folds.
    tt = {'t_stat': [], 'p_value': []}
    degenerate = False
    for key in scorers:
        paired_diff = np.asarray(metrics_my[key]) - np.asarray(metrics_sklearn[key])
        if np.any(paired_diff):
            t, p = ttest_rel(metrics_my[key], metrics_sklearn[key])
        else:
            # Every fold scored identically for both models, so the test
            # statistic is 0/0 and undefined: report NaN explicitly instead
            # of running the test on zero-variance differences.
            t, p = np.nan, np.nan
            degenerate = True
        tt['t_stat'].append(t)
        tt['p_value'].append(p)
    tt_res = pd.DataFrame(tt, index=res.index)

    print("=== Performance Comparison ===")
    print(res.round(4))
    print("\n=== Paired t-test (My vs Sklearn) ===")
    print(tt_res.round(4))
    if degenerate:
        print("Note: NaN rows mean both models scored identically on every fold "
              "(paired t-test undefined).")

    # Per-fold accuracy difference (my GNB minus sklearn).
    diffs = np.array(metrics_my['acc']) - np.array(metrics_sklearn['acc'])
    print(diffs)