import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
Since we observed some interesting trends between the reported time/location and the offense in the summary statistics, it might be interesting to use all the details recorded for each incident to predict the type of offense.
# Load dc-crime data
df = pd.read_csv(r'F:\Data Science\Data Visualization\DATS 6401 - Final Project\Data\dc_crime_clean.csv',
                 low_memory=False)
# Learn more about data
print(df.head())
print(df.columns)
print(df.shape) # dimension
# Get the target vector
y = df['OFFENSE']
# Get the feature vector
X = df[['SHIFT', 'METHOD', 'DISTRICT', 'WARD', 'PSA', 'NEIGHBORHOOD_CLUSTER', 'Year', 'Month', 'Day']]
# Encode the categorical features using one-hot encoding
X = pd.get_dummies(X)
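To see what one-hot encoding produces, here is a toy illustration (a made-up three-row frame, not the project data): each category value becomes its own 0/1 indicator column.
# Toy example: one-hot encoding a single categorical column
demo = pd.DataFrame({'SHIFT': ['DAY', 'EVENING', 'MIDNIGHT']})
print(pd.get_dummies(demo))  # columns SHIFT_DAY, SHIFT_EVENING, SHIFT_MIDNIGHT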
from sklearn.preprocessing import LabelEncoder
# Declare the LabelEncoder
class_le = LabelEncoder()
# Encode the target
y = class_le.fit_transform(y)
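A quick sanity check on the encodings can be printed at this point (the exact dimensions and category names depend on the cleaned file):
# Inspect the one-hot feature matrix and the label mapping
print(X.shape)
print(class_le.classes_)  # the offense categories, in the order of their integer codes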
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Hold out a test set (a 70/30 split here; the ratio is a modeling choice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

def train_test_evaluate(classifier):
    """
    Train, test, and evaluate the classifier
    :param classifier: a classifier class
    """
    # Declare the model
    clf = classifier(random_state=0)
    # Train the model
    clf.fit(X_train, y_train)
    # Test the model on the held-out data
    y_pred = clf.predict(X_test)
    # Evaluate precision, recall, and F1, weighted across the offense classes
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print(f'{classifier.__name__}: precision={precision:.3f}, recall={recall:.3f}, f1={fscore:.3f}')
    if classifier is DecisionTreeClassifier:
        global tree
        # Keep the fitted tree
        tree = clf
    elif classifier is RandomForestClassifier:
        global importances
        # Keep the feature importances
        importances = clf.feature_importances_
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# The list of classifiers
clfs = [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier]
# The fitted decision tree (set inside train_test_evaluate)
tree = None
# The random forest feature importances (set inside train_test_evaluate)
importances = []
# For each classifier
for classifier in clfs:
    # Call function train_test_evaluate (defined above)
    train_test_evaluate(classifier)
import matplotlib.pyplot as plt
# Convert the importances into a pandas Series indexed by the corresponding feature (dummy column) names
f_importances = pd.Series(importances, index=X.columns)
# Sort in descending order of importance and keep the top 12 features
f_importances = f_importances.sort_values(ascending=False).head(12)
# Draw the bar plot from f_importances
f_importances.plot(kind='bar', figsize=(16, 9), rot=40, fontsize=15)
plt.xlabel("Features")
plt.ylabel("Importance")
plt.title("Feature Importance to Predict Offense")
plt.tight_layout()
# Show the plot
plt.show()
The above tries three different classifiers: LogisticRegression, DecisionTreeClassifier, and RandomForestClassifier. Based on these models, the features most significant to the type of offense are the time, followed by the method by which the crime incident was carried out.
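Since the fitted decision tree was kept above, its top splits can also be inspected directly. A minimal sketch (the depth limit and figure size here are arbitrary choices, not part of the original analysis):
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
# Show only the first few levels of the fitted tree for readability
plot_tree(tree, feature_names=list(X.columns),
          class_names=[str(c) for c in class_le.classes_],
          max_depth=2, filled=True, fontsize=10)
plt.show()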
Fewer crimes involving the use of a gun or knife may therefore significantly decrease the chance of a homicide incident. This may help explain why the MPDC focuses on crime incidents that involve those methods.
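As a rough check on this relationship, one could cross-tabulate method against offense in the original dataframe (a sketch; it assumes the METHOD and OFFENSE columns of the cleaned file carry values such as GUN, KNIFE, and HOMICIDE):
# Count how often each method co-occurs with each offense type
method_offense = pd.crosstab(df['METHOD'], df['OFFENSE'])
print(method_offense)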