import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Read the student-habits dataset; the path is relative to the working directory.
df = pd.read_csv('./data_file/student_habits_performance.csv')

# Feature matrix used for clustering: one row per CSV record, with the
# study-hours column first and the exam-score column second.
X = df.loc[:, ['study_hours_per_day', 'exam_score']].to_numpy()

def _nearest_centroid(X, centroids):
    """Return, for every row of ``X``, the index of its closest centroid."""
    # Broadcasting yields an (n_samples, k) matrix of Euclidean distances.
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1)


def k_means(X, k, max_iters=100, tol=1e-4, random_state=42):
    """Cluster the rows of ``X`` into ``k`` groups with iterative k-means.

    Initial centroids are ``k`` distinct rows of ``X`` chosen by a
    ``numpy.random.RandomState`` seeded with ``random_state``. Each
    iteration assigns every sample to its nearest centroid (Euclidean
    distance) and moves each centroid to the mean of its members; a
    centroid with no members keeps its previous position. Iteration stops
    once the largest centroid displacement is below ``tol`` or after
    ``max_iters`` iterations.

    The zero-based index of the last executed iteration is printed before
    returning.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``. It is
            converted to a float array so that centroid means are not
            truncated when the input holds integers.
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of assign/update iterations (>= 1).
        tol: Convergence threshold on the largest centroid displacement.
        random_state: Seed for the centroid initialisation.

    Returns:
        A ``(labels, centroids)`` tuple: ``labels`` has one cluster index
        per sample and ``centroids`` has shape ``(k, n_features)``.
        ``labels`` is always the nearest-centroid assignment with respect
        to the returned ``centroids``.

    Raises:
        ValueError: If ``X`` is not 2-D, ``k`` is outside
            ``[1, n_samples]``, or ``max_iters`` is smaller than 1.
    """
    # Work in floating point: with an integer X the centroid array would
    # inherit the integer dtype and every mean would be truncated.
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")
    if max_iters < 1:
        raise ValueError(f"max_iters must be at least 1, got {max_iters}")

    rng = np.random.RandomState(random_state)
    init_idx = rng.choice(n_samples, k, replace=False)
    centroids = X[init_idx]  # fancy indexing already returns a fresh copy

    for iteration in range(max_iters):
        labels = _nearest_centroid(X, centroids)

        # Start from the current positions so empty clusters stay in place.
        new_centroids = centroids.copy()
        for j in range(k):
            members = X[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)

        shift = np.max(np.linalg.norm(new_centroids - centroids, axis=1))
        if shift < tol:
            # Converged: `labels` already matches `centroids`.
            break
        centroids = new_centroids
    else:
        # Iteration budget exhausted: the centroids moved after the last
        # assignment, so refresh the labels to match what is returned.
        labels = _nearest_centroid(X, centroids)

    print("the iterating number is",  iteration)
    return labels, centroids
# Time a single clustering run with five clusters.
start = time.perf_counter()
labels, centers = k_means(X, k=5)
end = time.perf_counter()

# Report the elapsed clustering time in seconds. The algorithm run above
# is k-means, so the message names it as such (it previously said
# "K-medoids").
print(f"K-means 聚类用时: {end - start:.4f} 秒")
# Show each cluster index together with the number of samples assigned to it.
print(np.unique(labels, return_counts=True))

plt.figure(figsize=(8, 6))

# Samples, coloured by their assigned cluster; semi-transparent so that
# overlapping points remain distinguishable.
plt.scatter(
    X[:, 0], X[:, 1],
    c=labels,
    cmap='rainbow',
    alpha=0.3,
    s=60
)
# Final centroids drawn on top as large red crosses.
plt.scatter(
    centers[:, 0], centers[:, 1],
    marker='x',
    c='red',
    s=200,
    linewidths=3
)

plt.xlabel('Study Hours per Day')
plt.ylabel('Exam Score')
plt.title('K-means Clustering Result')
plt.grid(True)
plt.show()
