import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb
from catboost import CatBoostClassifier

# Load the Titanic train/test CSV files from their local paths.
train = pd.read_csv(r"F:\zju_4\other\dm\titanic\train.csv")
test = pd.read_csv(r"F:\zju_4\other\dm\titanic\test.csv")

# Grab the raw labels and the test passenger ids before the two frames are
# merged; the ids are what the submission files are keyed on.
y, test_ids = train["Survived"], test["PassengerId"]

# Tag every test row with the sentinel label -1 so the splits can be told
# apart again after the shared feature engineering on the combined frame.
test["Survived"] = -1
df = pd.concat([train, test], sort=False, ignore_index=True)

# --- Cabin-derived features -------------------------------------------------
# Round-trip through str so that a textual "nan" becomes a real missing value
# before the .str accessors below are used.
df["Cabin"] = df["Cabin"].astype(str).replace("nan", np.nan)

# Deck: first character of the cabin string; "U" stands in for a missing cabin.
df["Deck"] = df["Cabin"].str[0].fillna("U")

# CabinNum: first run of digits found in the cabin string (NaN when none).
df["CabinNum"] = df["Cabin"].str.extract(r"(\d+)", expand=False).astype(float)

global_median = df["CabinNum"].median()

# Median cabin number per deck; a deck without any numbered cabin falls back
# to the median over all rows.
deck_median = (
    df.groupby("Deck")["CabinNum"]
      .median()
      .fillna(global_median)
)

# Fill missing cabin numbers with the median of the row's deck.  Mapping the
# Deck column through the lookup table is vectorised and replaces a row-wise
# DataFrame.apply that invoked a Python lambda for every passenger.
df["CabinNum"] = df["CabinNum"].fillna(df["Deck"].map(deck_median))

# CabinSide encodes the parity of the cabin number: even -> "Left",
# odd -> "Right".
df["CabinSide"] = (df["CabinNum"].astype(int) % 2).map({0: "Left", 1: "Right"})

# Surname: everything before the first comma of the Name field.
df["Surname"] = df["Name"].str.split(",").str[0]
# Title: the text that follows a comma (leading whitespace skipped) up to the
# next period, e.g. "Mr" in "Braund, Mr. Owen Harris".
df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)
def get_prefix(t):
    """Return the first non-numeric token of a ticket string.

    Periods and slashes are removed first, then the text is split on
    whitespace.  The literal string "None" is returned when every token
    consists only of digits or when there are no tokens at all.
    """
    cleaned = t.translate(str.maketrans("", "", "./"))
    for token in cleaned.split():
        if not token.isdigit():
            return token
    return "None"
# Alphabetic prefix of each ticket ("None" for purely numeric tickets).
df["TicketPrefix"] = df["Ticket"].apply(get_prefix)

# How many rows share the same ticket / the same surname.  These counts are
# taken before rare surnames are collapsed below.
for key_col, count_col in (("Ticket", "TicketCount"), ("Surname", "SurnameCount")):
    df[count_col] = df.groupby(key_col)[key_col].transform("count")

# Collapse ticket prefixes that occur three times or fewer into "Other".
prefix_counts = df["TicketPrefix"].value_counts()
rare_pref = prefix_counts.loc[prefix_counts.le(3)].index
df["TicketPrefix"] = df["TicketPrefix"].mask(
    df["TicketPrefix"].isin(rare_pref), "Other"
)

# Collapse surnames that occur only once into "Other".
surname_counts = df["Surname"].value_counts()
rare_name = surname_counts.loc[surname_counts.le(1)].index
df["Surname"] = df["Surname"].mask(df["Surname"].isin(rare_name), "Other")

# --- Family, age and fare features ------------------------------------------
# The passenger plus their siblings/spouses and parents/children aboard.
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

# Age flags are derived from the raw Age column, i.e. before the imputation
# below: a missing age compares False and therefore yields 0 for both flags.
df["Kid"] = (df["Age"] < 18).astype(int)
df["Old"] = (df["Age"] > 60).astype(int)
df["Alone"] = (df["FamilySize"] == 1).astype(int)

# Impute missing ages with the median age of the row's (Sex, Pclass) group.
# groupby(...).transform broadcasts each group's median back onto its rows,
# so the fill is vectorised rather than a row-wise DataFrame.apply.
group_age_median = df.groupby(["Sex", "Pclass"])["Age"].transform("median")
df["Age"] = df["Age"].fillna(group_age_median)

# Impute missing fares with the mean fare of the row's passenger class.
fare_mean = df.groupby("Pclass")["Fare"].mean()
df["Fare"] = df["Fare"].fillna(df["Pclass"].map(fare_mean))

# Missing embarkation ports default to "S".
df["Embarked"] = df["Embarked"].fillna("S")

# Columns handed to the models.
features = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
    "Deck", "CabinNum", "CabinSide",
    "FamilySize", "Kid", "Old", "Alone",
    "TicketPrefix", "TicketCount", "Surname", "SurnameCount", "Title",
]

# Split the combined frame back into its labelled and unlabelled parts using
# the -1 sentinel that was written into the test rows.
train_fe = df[df["Survived"] != -1].copy()
test_fe = df[df["Survived"] == -1].copy()

# Take explicit copies: X and X_test are assigned into below, and writing to
# a column selection of another frame is chained assignment (pandas raises
# SettingWithCopyWarning and the write may not behave as intended).
X = train_fe[features].copy()
y = train_fe["Survived"].astype(int)
X_test = test_fe[features].copy()

# Column groups consumed by the preprocessing ColumnTransformer.
num_cols = [
    "Age", "SibSp", "Parch", "Fare", "CabinNum", "FamilySize",
    "TicketCount", "SurnameCount",
]
cat_cols = [
    "Pclass", "Sex", "Embarked", "Deck", "CabinSide",
    "TicketPrefix", "Surname", "Title",
]
bool_cols = ["Kid", "Old", "Alone"]

# Make sure the 0/1 flag columns are plain integers in both matrices.
X[bool_cols] = X[bool_cols].astype(int)
X_test[bool_cols] = X_test[bool_cols].astype(int)

# Numeric and 0/1 flag columns: median-impute, then rescale with MinMaxScaler.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps transform from failing on categories
# that were not present when the encoder was fitted.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),  # one-hot encoder
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, [*num_cols, *bool_cols]),
    ("cat", cat_pipe, cat_cols),
])



# Hold out 20% of the labelled rows, stratified on the label, for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Base learners, keyed by the short name that also ends up in the name of
# each model's submission file.
base_models = {
    "lr": LogisticRegression(max_iter=2000),
    "svc": SVC(probability=True, random_state=42),
    # Currently disabled candidates:
    # "xgb": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    # "cat": CatBoostClassifier(verbose=0, random_state=42)
}

# Per-model results: validation probabilities of class 1, hard test
# predictions, and validation accuracy.
val_preds, test_preds, val_scores = {}, {}, {}

for name, clf in base_models.items():
    # Preprocessing and classifier are fitted together on the training fold.
    pipe = make_pipeline(preprocessor, clf).fit(X_tr, y_tr)

    acc = accuracy_score(y_val, pipe.predict(X_val))
    val_scores[name] = acc
    val_preds[name] = pipe.predict_proba(X_val)[:, 1]
    test_preds[name] = pipe.predict(X_test)

    # One submission file per base model.
    submission = pd.DataFrame(
        {"PassengerId": test_ids, "Survived": test_preds[name]}
    )
    submission.to_csv(f"submission_{name}.csv", index=False)
    print(f"{name}  accuracy: {acc:.4f}")

# Stack the base learners under a logistic-regression meta-learner; with
# passthrough=False the meta-learner only sees the base learners' outputs.
estimators = list(base_models.items())
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False,
)
stack_pipe = make_pipeline(preprocessor, stack)
stack_pipe.fit(X_tr, y_tr)

# Score the stacked model on the same hold-out split as the base models.
stack_val_pred = stack_pipe.predict(X_val)
print("stacking accuracy:", accuracy_score(y_val, stack_val_pred))

stack_submission = pd.DataFrame(
    {"PassengerId": test_ids, "Survived": stack_pipe.predict(X_test)}
)
stack_submission.to_csv("submission_stacking.csv", index=False)

# --- Manual two-level ensemble ----------------------------------------------
# Level-1 training features: class-1 probabilities for every labelled row,
# produced by 5-fold cross_val_predict for each base model.
meta_tr = pd.DataFrame({
    name: cross_val_predict(
        make_pipeline(preprocessor, mdl),
        X, y, cv=5, method="predict_proba"
    )[:, 1]
    for name, mdl in base_models.items()
})
# Level-1 test features: each base model refitted on all labelled rows, then
# asked for class-1 probabilities on the test rows.
meta_te = pd.DataFrame({
    name: make_pipeline(preprocessor, mdl).fit(X, y).predict_proba(X_test)[:, 1]
    for name, mdl in base_models.items()
})

# Weights proportional to each model's hold-out accuracy.  They are looked up
# by column name so they stay aligned with the probability columns instead of
# relying on two dicts happening to share iteration order.
weights = np.array([val_scores[name] for name in meta_tr.columns])
weights = weights / weights.sum()
meta_tr["weighted"] = meta_tr.dot(weights)
meta_te["weighted"] = meta_te.dot(weights)

# Fit the meta-learner once, on exactly the columns meta_te also provides
# (per-model probabilities plus their weighted blend), so the training and
# prediction feature sets match.
X_meta_tr = meta_tr
meta_clf = SVC()
meta_clf.fit(X_meta_tr, y)

final_pred = meta_clf.predict(meta_te[X_meta_tr.columns])

pd.DataFrame({
    "PassengerId": test_ids,
    "Survived": final_pred
}).to_csv("submission_manual_ensemble.csv", index=False)