A Machine Learning model comparison on the 1984 Congressional Voting Records Dataset
The goal of this project is to train and compare several common Supervised Machine Learning models on the 1984 Congressional Voting Records Dataset.
The Machine Learning models used and compared here are several classifiers: K-Nearest Neighbors (KNN), the Support Vector Machine Classifier (SVC), Logistic Regression (LogReg), and the Random Forest Classifier. I learned to use these models, including how to train them and evaluate their performance, from Datacamp's course on Supervised Learning.
One common way to evaluate model performance is the Receiver Operating Characteristic (ROC) curve. We'll go into more detail later, but it gives us a quick overview of which model parameters yield optimal performance in terms of true positives and false positives.
Each of the models named above is meant to classify an observation. In this case, the observations are the voting records of members of the House of Representatives in 1984, and the desired classification is party affiliation: Republican or Democrat.
Each model will be trained on a fraction of the dataset: a fraction of the voting records will be provided, along with the party affiliation of each voter. The models will then attempt to predict the party affiliation of the remaining records: based on a member's votes, which party do they belong to?
We'll begin by importing packages, starting with pandas as pd and numpy as np. We'll also import some models and tools from scikit-learn (sklearn), including train_test_split, GridSearchCV, and classification_report, among others.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
### MODELS TO USE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# PARAMETER TUNING / CROSS VALIDATION TOOLS
from sklearn.model_selection import GridSearchCV
### MODEL EVALUATION METRICS TO USE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
Next, we'll load our data into a pandas dataframe df using pd.read_csv(). I've already cleaned the data ahead of time, and it's available in house_reps_data.csv.
df=pd.read_csv('house_reps_data\\house_reps_data.csv')
Next, we'll inspect the data using df.head():
df.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | republican | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
| 1 | republican | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 2 | democrat | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 3 | democrat | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 4 | democrat | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
We see that each row corresponds to a member of Congress. The first column is party, and the rest of the columns record their vote: 1 for yes, 0 for no or abstain.
Our goal is to predict, using several classification models, the party affiliation of a member of Congress based on their voting record. To do so, we'll first need to separate our features X (the predictor variables) from our target variable y. We'll also need to convert the target variable, the party affiliation, to numerical data: 0 for republican and 1 for democrat. These steps are taken below.
# Features: the 16 vote columns
X = df.iloc[:, 1:]
# Target: the party column
y = df.iloc[:, 0]
# Encode the party labels numerically: 0 for republican, 1 for democrat
y = y.apply(lambda x: {'republican': 0, 'democrat': 1}[x])
When creating a model, it's useful to split your data into training and testing sets. This lets us check how well the model generalizes to data it hasn't seen. Therefore we use train_test_split() below, using only 60% of our data to train and reserving 40% for testing.
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
Classification Models:
1) KNN
We proceed by training our first classification model on our voting data: K-Nearest Neighbors, or KNN. KNN classifies a new observation by finding its K "closest" training points (closest in Euclidean distance, by default) and taking a majority vote of their classes.
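To make that idea concrete, here is a minimal sketch of how a single KNN prediction could be made by hand. This is purely illustrative (the function and variable names are mine, not part of the model we train below); we'll rely on scikit-learn's implementation for the actual analysis.
# Illustrative sketch of a single KNN prediction; not the model trained below.
def knn_predict_one(x_new, X_train_arr, y_train_arr, k=5):
    # Euclidean distance from the new point to every training point
    distances = np.sqrt(((X_train_arr - x_new) ** 2).sum(axis=1))
    # Indices of the k closest training points
    nearest = np.argsort(distances)[:k]
    # Majority vote among the k nearest labels (labels are 0 or 1 here)
    return np.bincount(y_train_arr[nearest]).argmax()
# e.g. knn_predict_one(X_test.values[0], X_train.values, y_train.values, k=5)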
How should we pick K? That is, how do we choose the number of neighbors to consult? Instead of picking values of K one at a time, we'll have GridSearchCV choose the best value for us. This is called hyperparameter tuning. Additionally, GridSearchCV will cross-validate; that is, it will split the data into training and validation sets multiple times and test each parameter value on every split. In this way, GridSearchCV better protects against overfitting while also choosing optimal parameters. Here, we'll have GridSearchCV determine the best value of K by trying each value from 1 to 49, and then we'll report the best model parameter and best model accuracy. Take note that this parameter is denoted by n_neighbors instead of K.
# Create the hyperparameter grid
n_space = np.arange(1, 50)
param_grid = {'n_neighbors': n_space}
# Instantiate the KNN classifier
knn = KNeighborsClassifier()
# Instantiate the GridSearchCV object
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
# Fit it to the training data
knn_cv.fit(X_train, y_train)
# Print the optimal parameters and best score
print("Tuned KNN Parameter: {}".format(knn_cv.best_params_))
print("Tuned KNN Accuracy: {}".format(knn_cv.best_score_))
Tuned KNN Parameter: {'n_neighbors': 7}
Tuned KNN Accuracy: 0.9157088122605364
KNN, Model evaluation:
What we see is that the optimal number of neighbors is 7, with a cross-validated accuracy of approximately 0.92.
Below, we make model predictions using the test data, and we grade our model using several popular model scores: precision, recall, and the f1-score. These scores take values between zero and one. If precision is close to one, the model rarely made a "false positive"; if recall is close to one, the model correctly identified nearly all of the actual positives. The f1-score combines precision and recall (it is their harmonic mean); if the f1-score is close to one, the model had a high rate of correct classification. Support here is just the number of occurrences of each class. Recall that we previously set 0 to Republican and 1 to Democrat.
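As a reference for reading the output below, here is a small sketch of how those scores relate to the entries of a 2x2 confusion matrix. The helper function is illustrative only; scikit-learn's classification_report computes the same quantities for us.
# Illustrative: deriving precision, recall, and f1 for the positive class from a
# 2x2 confusion matrix. Rows are true classes, columns are predicted classes
# (scikit-learn's convention), so tp = cm[1, 1], fp = cm[0, 1], fn = cm[1, 0].
def scores_for_positive_class(cm):
    tp, fp, fn = float(cm[1, 1]), float(cm[0, 1]), float(cm[1, 0])
    precision = tp / (tp + fp)          # few false positives -> close to 1
    recall = tp / (tp + fn)             # few missed positives -> close to 1
    f1 = 2 * precision * recall / (precision + recall)  # harmonic mean
    return precision, recall, f1
# e.g. scores_for_positive_class(confusion_matrix(y_test, knn_cv.predict(X_test)))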
y_pred_knn=knn_cv.predict(X_test)
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))
[[ 55   4]
 [  7 108]]
             precision    recall  f1-score   support

          0       0.89      0.93      0.91        59
          1       0.96      0.94      0.95       115

avg / total       0.94      0.94      0.94       174
The final model evaluation tool we'll use is the ROC curve (the Receiver Operating Characteristic curve mentioned earlier). The ROC curve plots the true positive rate against the false positive rate as the classification probability threshold used by the model is varied. A desirable ROC curve has high true positive rates at low false positive rates; visually, this corresponds to a large area under the curve. Hence, we have another heuristic model score called AUC, or area under the curve.
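To make the "varying threshold" idea concrete, here is a sketch of how the true and false positive rates could be computed by hand at one particular threshold. The function name and the threshold value 0.5 are illustrative choices; roc_curve below sweeps over many thresholds for us.
# Illustrative: true/false positive rates at a single probability threshold.
def rates_at_threshold(y_true, y_scores, threshold=0.5):
    y_true = np.asarray(y_true)
    predicted_positive = np.asarray(y_scores) >= threshold
    tp = np.sum(predicted_positive & (y_true == 1))
    fp = np.sum(predicted_positive & (y_true == 0))
    tpr = tp / float(np.sum(y_true == 1))   # true positive rate
    fpr = fp / float(np.sum(y_true == 0))   # false positive rate
    return tpr, fpr
# e.g. rates_at_threshold(y_test, knn_cv.predict_proba(X_test)[:, 1], 0.5)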
# Compute predicted probabilities: y_pred_prob
y_pred_prob = knn_cv.predict_proba(X_test)[:,1]
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC Curve')
plt.show()
We calculate the AUC score for this model below:
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))
AUC: 0.9766396462785557
The AUC score is about 0.98, which is very good!
2) Random Forest Classifier:
Next, we'll train a Random Forest classifier on our data. A Random Forest is an ensemble of Decision Tree classifiers. A single decision tree classifies data by recursively drawing decision boundaries, and it can fit its training data extremely closely. In fact, decision trees are so flexible that they typically overfit: although a decision tree may obtain a high accuracy score on its training data, there is no reason to expect this high accuracy will translate to new data. This overfitting can be combatted by training many decision trees on different bootstrap samples of the training data and aggregating their predictions. This is what is called a Random Forest, and the procedure that generates it is frequently referred to as "bagging", or bootstrap aggregating.
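As a rough sketch of the bagging idea (not the tuned model we fit below), one could train a handful of decision trees on bootstrap resamples and take a majority vote. RandomForestClassifier does this for us, and additionally considers a random subset of features at each split; the tree count of 10 here is an arbitrary illustrative choice.
# Illustrative sketch of bagging: several trees on bootstrap resamples, majority vote.
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

trees = []
for i in range(10):
    # Draw a bootstrap sample (sampling with replacement) of the training data
    X_boot, y_boot = resample(X_train, y_train, random_state=i)
    trees.append(DecisionTreeClassifier(random_state=i).fit(X_boot, y_boot))

# Aggregate: majority vote across the individual trees' predictions
tree_preds = np.array([tree.predict(X_test) for tree in trees])
bagged_pred = (tree_preds.mean(axis=0) >= 0.5).astype(int)
print("Bagged accuracy: {:.3f}".format(np.mean(bagged_pred == y_test)))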
We'll implement the random forest below, along with cross-validation and some parameter tuning. Here the model parameters are the number of estimators (trees) and the maximum number of features considered at each split.
num_cols = len(X_train.columns)
# Candidate numbers of trees: 20 log-spaced values between 10 and about 32, rounded to integers
n_est_space = np.array([int(round(x)) for x in np.logspace(1, 1.5, num = 20)])
# Candidate values of max_features: roughly half to twice the square root of the number of columns
max_feat_space = np.arange(int(round(np.sqrt(num_cols) / 2)), int(round(np.sqrt(num_cols) * 2)) + 1)
param_grid_rf = {'n_estimators': n_est_space, 'max_features': max_feat_space}
rand_forest = RandomForestClassifier()
rand_forest_cv = GridSearchCV(rand_forest, param_grid_rf, cv = 5)
rand_forest_cv.fit(X_train, y_train)
print("Tuned Random Forest Parameters: {}".format(rand_forest_cv.best_params_))
print("Tuned Random Forest Accuracy: {}".format(rand_forest_cv.best_score_))
Tuned Random Forest Parameters: {'max_features': 3, 'n_estimators': 14}
Tuned Random Forest Accuracy: 0.9578544061302682
Random Forest, Model evaluation:
It looks like the best model parameters are 14 estimators and a maximum of 3 features per split. The cross-validated accuracy is about 0.96, which is better than KNN's 0.92.
We again use our model to predict on the test data, and report precision, recall, and f1-scores. This time, however, we'll leave out the ROC curve and AUC score.
y_pred_rf = rand_forest_cv.predict(X_test)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
[[ 57   2]
 [  4 111]]
             precision    recall  f1-score   support

          0       0.93      0.97      0.95        59
          1       0.98      0.97      0.97       115

avg / total       0.97      0.97      0.97       174
On the test data, the Random Forest again outperforms KNN, with an f1-score of 0.97.
3) SVC Classifier
The next model we'll train is the Support Vector Machine Classifier, or SVC for short. SVC classifies the data by finding a maximum-margin separating boundary: a hyperplane (in the kernel's feature space) that separates the classes as cleanly as possible. The model has a regularization parameter, C, which controls the trade-off between a wide margin and misclassified training points: higher values of C make the model fit the training data more tightly, and a value of C that is too high will lead to overfitting. Hence, we again use cross-validation and parameter tuning to identify the best value of this parameter.
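To get a rough feel for what C does before tuning it, here is a quick illustrative comparison; the values 0.01 and 100 are arbitrary extremes chosen for contrast and are not part of the grid search below.
# Illustrative only: a heavily regularized vs. a lightly regularized SVC.
for C_val in [0.01, 100]:
    clf = SVC(C=C_val).fit(X_train, y_train)
    print("C = {:>6}: train accuracy = {:.3f}, test accuracy = {:.3f}".format(
        C_val, clf.score(X_train, y_train), clf.score(X_test, y_test)))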
C_space = np.linspace(.2, 5, num=50)
param_grid_svc = {'C': C_space}
svc = SVC()
svc_cv = GridSearchCV(svc, param_grid_svc, cv = 5)
svc_cv.fit(X_train, y_train)
# Print the optimal parameters and best score
print("Tuned SVC Parameter: {}".format(svc_cv.best_params_))
print("Tuned SVC Accuracy: {}".format(svc_cv.best_score_))
Tuned SVC Parameter: {'C': 3.726530612244898}
Tuned SVC Accuracy: 0.9540229885057471
y_pred_svc = svc_cv.predict(X_test)
print(confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))
[[ 58   1]
 [  3 112]]
             precision    recall  f1-score   support

          0       0.95      0.98      0.97        59
          1       0.99      0.97      0.98       115

avg / total       0.98      0.98      0.98       174
SVC Classifier, Model Evaluation:
Here we see that SVC has a high cross-validated accuracy (0.95) and a high f1-score on the test data as well (0.98).
4) Logistic Regression Classifier
Our final classifier is Logistic Regression, which models the probability of class membership by applying the sigmoid (logistic) function to a linear combination of the features. We again use cross-validation with parameter tuning. Here the parameter C is the inverse of the regularization strength: smaller values shrink the model's coefficients more aggressively, which gives us a way to balance flexibility against overfitting.
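For reference, here is a minimal illustration of the sigmoid itself (the function below is illustrative, not part of scikit-learn's model): it maps any real-valued linear score to a probability between 0 and 1.
# Illustrative: the logistic (sigmoid) function underlying logistic regression.
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# A linear score of 0 maps to a probability of 0.5; large positive or negative
# scores map to probabilities close to 1 or 0, respectively.
print(sigmoid(0.0), sigmoid(4.0), sigmoid(-4.0))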
C_space_logreg = np.linspace(.2, 5, num=50)
param_grid_logreg = {'C': C_space_logreg}
# Instantiate the logistic regression classifier: logreg
logreg = LogisticRegression()
# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid_logreg, cv = 5)
logreg_cv.fit(X_train, y_train)
# Print the optimal parameters and best score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))
Tuned Logistic Regression Parameter: {'C': 0.5918367346938775}
Tuned Logistic Regression Accuracy: 0.9501915708812261
y_pred_logreg = logreg_cv.predict(X_test)
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))
[[ 57   2]
 [  2 113]]
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        59
          1       0.98      0.98      0.98       115

avg / total       0.98      0.98      0.98       174
Logistic Regression, Model evaluation:
We see that Logistic Regression also achieves a high cross-validated accuracy (0.95) and a high f1-score on the test data (0.98).
Summary
In this notebook we compared several supervised Machine Learning classifiers: KNN, Random Forest, the Support Vector Machine classifier (SVC), and Logistic Regression. We trained each of them on the Congressional Voting Records dataset, with the goal of predicting the party affiliation of a member of Congress based on their voting record. We used cross-validation and parameter tuning, and evaluated each model using several popular model evaluation scores such as AUC and the f1-score, going beyond simple prediction accuracy. While each model performed reasonably well, the Random Forest, SVC, and Logistic Regression all achieved stronger accuracy and f1-scores than KNN during testing.