# ===== Logistic Regression on Real Dataset =====

# 1) Load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# 2) Load the Pima Indians Diabetes dataset (public URL)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
col_names = [
    "pregnancies", "glucose", "blood_pressure", "skin_thickness",
    "insulin", "bmi", "diabetes_pedigree", "age", "outcome"
]
data = pd.read_csv(url, names=col_names)

# Inspect the dataset
print("First 5 rows:")
print(data.head(), "\n")
print("Dataset shape:", data.shape)

# 3) Separate features and labels
X = data.drop("outcome", axis=1)
y = data["outcome"]

# 4) Train/test split (stratified to preserve the class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5) Feature scaling (important for many ML models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6) Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# 7) Predictions
y_pred = model.predict(X_test_scaled)

# 8) Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 9) Optional: probability outputs
y_proba = model.predict_proba(X_test_scaled)[:, 1]
print("\nPredicted probabilities (first 10):\n", y_proba[:10])

# 10) Visualizing the sigmoid (S-curve)
# The logits (z) are the raw linear output, z = w.x + b, and the model maps
# them to probabilities with p = 1 / (1 + exp(-z)).

# Calculate logits for the test set
logits = model.decision_function(X_test_scaled)

plt.figure(figsize=(10, 6))

# Plot the S-curve; sorting by logit makes the line continuous
sorted_indices = np.argsort(logits)
plt.plot(logits[sorted_indices], y_proba[sorted_indices],
         color='blue', lw=3, label='Sigmoid Curve')

# Scatter plot of actual outcomes (0 or 1)
plt.scatter(logits, y_test, color='red', alpha=0.5, label='Actual Data Points')

plt.title('Logistic Regression S-Curve (Sigmoid Function)')
plt.xlabel('Logits (linear combination of features)')
plt.ylabel('Probability of Diabetes')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()
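
# 11) Optional sanity check: a minimal sketch verifying that predict_proba is
# indeed the sigmoid of the logits from step 10, i.e. p = 1 / (1 + exp(-z)).
# It reuses `model`, `X_test_scaled`, `logits`, and `y_proba` defined above;
# the helper names `manual_proba` and `manual_logits` are illustrative only.
manual_proba = 1.0 / (1.0 + np.exp(-logits))
print("\nManual sigmoid matches predict_proba:",
      np.allclose(manual_proba, y_proba))

# The logits can also be rebuilt from the fitted parameters (z = X @ w + b),
# using model.coef_ and model.intercept_.
manual_logits = X_test_scaled @ model.coef_.ravel() + model.intercept_[0]
print("Manual logits match decision_function:",
      np.allclose(manual_logits, logits))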