import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.frequent_patterns import fpgrowth, association_rules

# Load the labelled and unlabelled passenger tables. The unlabelled rows get
# Survived = -1 as a sentinel so both tables can be stacked into one frame
# and separated again later.
train = pd.read_csv(r"F:\zju_4\other\dm\titanic\train.csv")
test = pd.read_csv(r"F:\zju_4\other\dm\titanic\test.csv").assign(Survived=-1)
df = pd.concat([train, test], sort=False, ignore_index=True)

def feature_engineering(df):
    """Return a copy of ``df`` with derived passenger features and imputed gaps.

    The input frame is not modified. It must provide the columns ``Cabin``,
    ``Name``, ``SibSp``, ``Parch``, ``Sex``, ``Pclass``, ``Age``, ``Fare`` and
    ``Embarked``.

    Columns added:
        Deck: first character of ``Cabin``; ``"U"`` when the cabin is missing.
        CabinNum: first run of digits in ``Cabin`` as a float. Missing values
            are filled with the median of the same deck, then with the
            overall median. Stays NaN only when no cabin in the frame
            contains a number.
        CabinSide: ``"Left"`` when the integer part of ``CabinNum`` is even,
            ``"Right"`` when odd, ``"Unknown"`` when ``CabinNum`` is NaN.
            Note that imputed cabin numbers receive a side as well.
        Surname: text of ``Name`` before the first comma.
        Title: text between the first comma and the following period.
        FamilySize: ``SibSp + Parch + 1``.
        IsAlone: 1 when ``FamilySize`` is 1, else 0.

    Columns imputed:
        Age: median of the passenger's (``Sex``, ``Pclass``) group.
        Fare: mean of the passenger's ``Pclass`` group.
        Embarked: ``"S"``.

    All medians/means are computed over whatever rows are passed in.

    Args:
        df: passenger table (pandas DataFrame).

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()

    # Normalise Cabin to strings while keeping missing entries as NaN. The
    # explicit object cast keeps the ``.str`` accessor usable even when every
    # cabin is missing (such a column would otherwise end up as float).
    has_cabin = df["Cabin"].notna()
    cabin = df["Cabin"].astype(str).where(has_cabin).astype(object)
    df["Cabin"] = cabin

    df["Deck"] = cabin.str[0].fillna("U")

    # expand=False yields a Series, so a plain column is assigned rather than
    # a one-column DataFrame.
    df["CabinNum"] = cabin.str.extract(r"(\d+)", expand=False).astype(float)
    overall_median = df["CabinNum"].median()
    deck_median = (
        df.groupby("Deck")["CabinNum"].transform("median").fillna(overall_median)
    )
    df["CabinNum"] = df["CabinNum"].fillna(deck_median)

    # CabinNum can still be NaN when no cabin in the frame carries a number;
    # casting NaN to int would raise, so those rows get an explicit label.
    cabin_num = df["CabinNum"]
    parity_side = np.where(
        cabin_num.fillna(0).astype(int) % 2 == 0, "Left", "Right"
    )
    df["CabinSide"] = np.where(cabin_num.isna(), "Unknown", parity_side)

    df["Surname"] = df["Name"].str.split(",", expand=True)[0]
    df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    df["Age"] = df.groupby(["Sex", "Pclass"])["Age"].transform(
        lambda x: x.fillna(x.median())
    )
    df["Fare"] = df.groupby("Pclass")["Fare"].transform(
        lambda x: x.fillna(x.mean())
    )
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

# Derive features on the combined frame, then separate the labelled rows from
# the unlabelled ones via the Survived == -1 sentinel.
df = feature_engineering(df)

is_unlabelled = df["Survived"] == -1
train_df = df[~is_unlabelled].copy()
test_df = df[is_unlabelled].copy()

# Target and identifier-like columns are removed from both feature frames.
drop_cols = ["Survived", "PassengerId", "Name", "Ticket", "Cabin"]
y = train_df["Survived"].astype(int)
X = train_df.drop(columns=drop_cols)
X_test = test_df.drop(columns=drop_cols)

# Only the columns named here are handed to the transformer; Surname is not
# listed in either group.
num_cols = ["Age", "SibSp", "Parch", "Fare", "CabinNum", "FamilySize"]
cat_cols = ["Pclass", "Sex", "Embarked", "Deck", "CabinSide", "Title"]

# Numeric columns: median imputation followed by standardisation.
numeric_steps = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])
# Categorical columns: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_steps = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])
preprocessor = ColumnTransformer([
    ("num", numeric_steps, num_cols),
    ("cat", categorical_steps, cat_cols),
])

# Fit on the labelled rows only, then apply the same transform to the rest.
X_proc = preprocessor.fit_transform(X)
X_test_proc = preprocessor.transform(X_test)

# Two-dimensional embeddings of the processed training matrix.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_proc)

tsne = TSNE(n_components=2, random_state=42, init="pca")
X_tsne = tsne.fit_transform(X_proc)

# Side-by-side scatter plots of both embeddings, coloured by the target.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [(X_pca, "PCA projection"), (X_tsne, "t-SNE projection")]
for ax, (embedding, title) in zip(axes, panels):
    sns.scatterplot(
        x=embedding[:, 0], y=embedding[:, 1],
        hue=y, palette="Set1", s=40, ax=ax,
    )
    ax.set_title(title)
plt.show()

# Cluster the processed training rows into three groups.
# n_init is set explicitly: scikit-learn changed its default from 10 to
# "auto" in release 1.4 (and warns about the implicit default in 1.2-1.3),
# so leaving it out makes the cluster assignments depend on the installed
# version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_proc)

# Show the clusters on the PCA plane computed earlier.
plt.figure(figsize=(6,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=clusters, palette="tab10", s=40)
plt.title("KMeans Clusters (k=3)")
plt.show()

# Survival rate per cluster.
train_df['cluster'] = clusters
print(train_df.groupby('cluster')['Survived'].mean())

# Anomaly detection on the processed training rows (contamination = 0.02).
iso = IsolationForest(contamination=0.02, random_state=42)
outliers = iso.fit(X_proc).predict(X_proc)  # -1: outlier, +1: inlier
mask_out = outliers == -1
print("Detected outliers:", mask_out.sum())

# One-hot encode the categorical columns into a boolean item matrix for
# frequent-pattern mining. The encoder is fitted once and reused for the
# column names; the matrix is cast to bool because fpgrowth expects a boolean
# frame (non-bool input triggers a deprecation warning in mlxtend).
item_encoder = OneHotEncoder(sparse_output=False)
fp_df = pd.DataFrame(
    item_encoder.fit_transform(df[cat_cols]).astype(bool),
    columns=item_encoder.get_feature_names_out(cat_cols),
    index=df.index,
)
# Select the labelled rows by index label; fp_df shares df's index, so this
# stays correct even if that index is not a plain 0..n-1 range.
fp_tr = fp_df.loc[train_df.index]

freq_items = fpgrowth(fp_tr, min_support=0.1, use_colnames=True)
rules = association_rules(freq_items, metric="lift", min_threshold=1.2)
# Rank by lift so the printed rows really are the strongest rules rather than
# the first five in whatever order they were generated.
print("Top 5 association rules:\n", rules.sort_values("lift", ascending=False).head())


# Final training matrix: processed features plus the cluster id and the
# outlier flag as two extra columns. Column labels are strings throughout.
# NOTE(review): the preprocessor, KMeans and IsolationForest were all fitted
# on the full training matrix before this split, so the validation rows have
# already influenced these features.
X_final = pd.DataFrame(X_proc, index=train_df.index)
X_final.columns = X_final.columns.astype(str)
X_final["cluster"] = clusters
X_final["outlier"] = (outliers == -1).astype(int)

# Stratified 80/20 hold-out split.
X_train, X_val, y_train, y_val = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Candidate classifiers, keyed by the short name used in the reports.
clf1 = LogisticRegression(max_iter=1000, random_state=42)
clf2 = RandomForestClassifier(n_estimators=200, random_state=42)
clf3 = SVC(random_state=42)
candidates = {"LogReg": clf1, "RF": clf2, "SVC": clf3}

# Fit each candidate on the training split and record its hold-out accuracy.
scores = {}
for name, clf in candidates.items():
    pred = clf.fit(X_train, y_train).predict(X_val)
    acc = accuracy_score(y_val, pred)
    scores[name] = acc
    print(f"{name} acc:", acc)
    print(classification_report(y_val, pred))

# Keep the candidate with the highest hold-out accuracy (first one on ties).
best_name = max(scores, key=scores.get)
best = candidates[best_name]
print(f"\n 最佳模型是 {best_name}，准确率={scores[best_name]:.4f}")


# Build the test matrix with the same layout as the training matrix:
# processed features first, then the cluster id and outlier flag produced by
# the already-fitted KMeans and IsolationForest models.
X_test_final = pd.DataFrame(X_test_proc, index=test_df.index)
X_test_final.columns = X_test_final.columns.astype(str)
X_test_final["cluster"] = kmeans.predict(X_test_proc)
X_test_final["outlier"] = (iso.predict(X_test_proc) == -1).astype(int)

# Predict with the selected model and write the submission file.
test_pred = best.predict(X_test_final)
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": test_pred,
})
submission.to_csv("submission_clustering_ensemble.csv", index=False)
