import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Helper: fill missing values by sampling from the observed value frequencies.
def fill_str(df, column_name):
    """Fill missing entries of a column by frequency-weighted random sampling.

    Each missing cell receives one of the values already observed in the
    column, drawn with probability equal to that value's relative frequency
    among the non-missing entries. Sampling uses NumPy's global random state,
    so call ``np.random.seed`` beforehand for reproducible output.

    Args:
        df: DataFrame to modify. It is updated in place.
        column_name: Name of the column whose missing values are filled.

    Returns:
        The same DataFrame object that was passed in, for call chaining.
    """
    missing_mask = df[column_name].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return df

    frequencies = df[column_name].value_counts(normalize=True)
    if frequencies.empty:
        # Every entry is missing, so there is nothing to sample from;
        # leave the column untouched instead of raising inside NumPy.
        return df

    # One vectorised draw for all missing cells. Assigning through the
    # boolean mask is positional, so it also works with a non-unique index.
    df.loc[missing_mask, column_name] = np.random.choice(
        frequencies.index.to_numpy(),
        size=n_missing,
        p=frequencies.to_numpy(),
    )
    return df

# Preprocessing pipeline for the student-habits data set: load the raw CSV,
# drop duplicate rows, impute and encode categorical columns, mean-fill the
# numeric columns, bin sleep hours, min-max normalise selected columns and
# write the result to a new CSV.
INPUT_CSV = '/home/linux/dm/lab1/data_file/student_habits_performance.csv'
OUTPUT_CSV = '/home/linux/dm/lab1/data_file/processed_student_data.csv'

df = pd.read_csv(INPUT_CSV)

df = df.drop_duplicates()

# Per-column count of missing values in the de-duplicated raw data
# (computed for inspection; it is not printed).
missing_values = df.isnull().sum()

# Keep the raw 'gender' values next to the encoded column.
df['original_gender'] = df['gender']

# Frequency-weighted imputation must run on the raw labels, BEFORE encoding:
# `cat.codes` turns NaN into -1 and the numeric mean-fill below would replace
# a missing education level with a fractional mean, so running `fill_str`
# after those steps would find no nulls and do nothing.
df = fill_str(df, 'internet_quality')
df = fill_str(df, 'parental_education_level')

# Ordinal mapping for the parental education level. Labels that are not in
# the mapping become NaN and are then mean-filled with the numeric columns.
education_mapping = {
    'High School': 1,
    'Bachelor': 2,
    'Master': 3,
}
df['parental_education_level'] = df['parental_education_level'].map(education_mapping)

# Replace these label columns with integer category codes (NaN -> -1).
for code_col in ('gender', 'part_time_job', 'internet_quality',
                 'extracurricular_participation'):
    df[code_col] = df[code_col].astype('category').cat.codes

# These two stay as pandas categoricals, which also excludes them from the
# numeric mean-fill below.
for cat_col in ('exercise_frequency', 'mental_health_rating'):
    df[cat_col] = df[cat_col].astype('category')

# Select every numeric column.
numeric_cols = df.select_dtypes(include=['number']).columns

# Fill missing values with the column mean, numeric columns only.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Discretise 'sleep_hours' into left-closed bins:
# [0, 5), [5, 7), [7, 9), [9, inf).
bins = [0, 5, 7, 9, float('inf')]
labels = ['睡眠不足', '适量睡眠', '健康睡眠', '睡眠过多']
df['sleep_hours_category'] = pd.cut(df['sleep_hours'], bins=bins, labels=labels, right=False)

# Numeric rank for each sleep category.
sleep_mapping = {'睡眠不足': 1, '适量睡眠': 2, '健康睡眠': 3, '睡眠过多': 4}
df['sleep_hours_category_numeric'] = df['sleep_hours_category'].map(sleep_mapping)

# Min-max normalise the selected columns; the scaler is re-fitted for each
# column, so every column is scaled independently of the others.
scaler = MinMaxScaler()
for source_col in ('mental_health_rating', 'study_hours_per_day',
                   'social_media_hours', 'netflix_hours',
                   'attendance_percentage'):
    df[f'{source_col}_normalized'] = scaler.fit_transform(df[[source_col]])

# Save the processed data to a CSV file.
df.to_csv(OUTPUT_CSV, index=False)

# Print a confirmation message.
print("数据已保存到 processed_student_data.csv")
