In [1]:

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import datetime

from src.data.get_data import get_url_data

from src.features.df_functions import str_pad, convert_time

Introduction¶

An exploration of reported Chicago Crime data. This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days. Data is extracted from the Chicago Police Department’s CLEAR (Citizen Law Enforcement Analysis and Reporting) system.

This report visually explores the data for trends. It also looks for trends using PCA and Gaussian Mixture Models. Through the latter, we see the crime reporting behaves in two distinct patterns: Weekday and Weekend. Weekday crime reporting tends to peak around noon. For Weekends, the flux of crime reports is steady throughout the day.

However, we do see some weekdays behave like weekends, and conversely, some weekends behave like weekdays. Specifically, there are Tuesdays that behave like weekends, i.e. more crime is reported. Some of those happen to be Christmas, New Years, and July 4th—American holidays. And the two Sundays that behaved like weekdays coincide with playoff games, which might say more about Chicagoans than it’s reported crime rates do.

In summary, more crimes are reported (or perhaps committed?) during hours of leisure.

Load data¶

In [2]:

%%time
data = get_url_data()

...loading csv
CPU times: user 10.7 s, sys: 1.09 s, total: 11.8 s
Wall time: 13.1 s

In [4]:

print('Number of observations: {:,.0f}'.format(len(data)))

Number of observations: 1,827,766

In [29]:

# converts time column to formatted timestamps
data = convert_time(data)

In [31]:

# generate counts for timestamps
data_time = data['Time Occurred'].groupby(data.index.floor('1min')).agg([ 'count'])
data_time.head()

Out[31]:

	count
Date
2010-01-01 00:01:00	3
2010-01-01 00:02:00	1
2010-01-01 00:05:00	2
2010-01-01 00:10:00	6
2010-01-01 00:15:00	4

Visually Explore Data¶

Monthly crime reports over all collected data¶

In [8]:

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
title = 'Monthly crime reports'
data_time[data_time.index <=
          '2018-09-01'].resample('M').sum().plot(ax=ax, legend=None, title=title)
plt.ylim(0, None)
plt.xlabel('')

Out[8]:

<matplotlib.text.Text at 0x11ff56278>

_images/0.1-cristian-Chicago_Crime_9_1.png

Crime reports for every day of data as a function of time of day¶

In [9]:

pivoted = data_time.pivot_table('count', index=data_time.index.floor('1min').time, columns=data_time.index.date, fill_value=0)
pivoted.head()

Out[9]:

	2010-01-01	2010-01-02	2010-01-03	2010-01-04	2010-01-05	2010-01-06	2010-01-07	2010-01-08	2010-01-09	2010-01-10	...	2018-09-13	2018-09-14	2018-09-15	2018-09-16	2018-09-17	2018-09-18	2018-09-19	2018-09-20	2018-09-21	2018-09-22
00:01:00	3	8	4	2	5	3	5	3	6	10	...	7	6	10	10	6	13	9	4	6	3
00:02:00	1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
00:03:00	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
00:04:00	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
00:05:00	2	2	2	3	2	3	3	2	0	3	...	3	1	4	1	1	2	2	5	5	2

5 rows × 3187 columns

In [34]:

pivoted.plot(legend=False, alpha = 0.1, figsize=(16,9), title = 'Traces for {} days of reports'.format(len(pivoted.columns)));
plt.ylabel('Crime Reports')
plt.xlabel('Time of day')

Out[34]:

<matplotlib.text.Text at 0x11e594518>

_images/0.1-cristian-Chicago_Crime_12_1.png

Modeling¶

PCA analysis¶

In [11]:

# create PCA object
pca= PCA()

In [12]:

# observations in rows

# calculate loading scores and variation each principle compenent acount for
scaled_data = pivoted.T
pca.fit(scaled_data)

Out[12]:

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [13]:

# generate coordinates based on loading scores and scaled data
pca_data = pca.transform(scaled_data)

In [14]:

# scree plot
# generate percentage that each PCA accounts for
per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)
# generate labels for scree plot
labels = ['PC' + str(num) for num in range(1, len(per_var) + 1)]

In [15]:

fig, ax = plt.subplots(1, 1, figsize=(12,6))
plt.bar(left=range(1, 20), height = per_var[0:19], tick_label=labels[0:19])
plt.show()

_images/0.1-cristian-Chicago_Crime_19_0.png

In [16]:

# generate df with pca coordinates, variables are presented as rows, thus the index should be variable names, the columns represent the different PCA axis
pca_df = pd.DataFrame(pca_data, index = pivoted.T.index.values,columns=labels)

In [17]:

pca_df.head()

Out[17]:

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC1429	PC1430	PC1431	PC1432	PC1433	PC1434	PC1435	PC1436	PC1437	PC1438
2010-01-01	-47.152188	16.946524	3.404645	0.198449	-4.633534	-1.115146	1.572401	-0.510697	-0.577448	-3.542706	...	0.028720	-0.020969	0.026601	-0.011574	0.005671	-0.019609	-0.009661	-0.005698	0.014349	0.011784
2010-01-02	-35.122274	1.091027	4.436384	4.385195	6.024288	-4.267457	3.185112	2.607650	-2.695581	-2.240907	...	-0.039363	-0.029664	0.016520	-0.027974	-0.015012	-0.006912	0.042668	0.007663	-0.024588	0.012324
2010-01-03	-35.027476	2.779998	-3.652319	2.432474	-0.912626	2.656788	-0.230950	-0.096006	-2.876035	1.910059	...	0.046643	-0.023867	0.023256	-0.048993	0.020619	0.043542	0.002398	0.002390	-0.009306	-0.024460
2010-01-04	-25.848661	-1.949269	1.086502	-1.338002	-2.086197	-1.060601	1.763883	5.046089	-2.654646	-1.901781	...	0.005797	0.025494	-0.014706	-0.080834	0.004578	-0.040916	-0.014588	0.037911	-0.008122	-0.002751
2010-01-05	-17.676446	-3.366142	2.402381	3.131339	0.537546	0.227890	-8.078550	9.418479	-3.550084	0.184492	...	0.019269	0.004990	0.023384	-0.021063	0.013738	0.017831	0.073808	0.010084	0.002871	0.028127

5 rows × 1438 columns

In [18]:

day_of_week = pd.DatetimeIndex(pivoted.columns).dayofweek

In [19]:

fig, ax = plt.subplots(1, 1, figsize=(12,6))
plt.scatter(pca_df['PC1'],pca_df['PC2'], c = day_of_week, cmap='rainbow');
plt.colorbar();

_images/0.1-cristian-Chicago_Crime_23_0.png

Gaussian Mixture Models¶

In [20]:

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(2)
gmm.fit(scaled_data)
labels = gmm.predict(scaled_data)
labels

Out[20]:

array([0, 0, 0, ..., 0, 0, 0])

Weekday behaviors¶

In [21]:

# filters the columns with labels of array of 0s and 1s
pivoted.T[labels==1].T.plot(legend=False, alpha = 0.1,  figsize=(16,9));
plt.xlim(datetime.time(3, 0),datetime.time(23, 59));

_images/0.1-cristian-Chicago_Crime_27_0.png

In [22]:

# Isolates weekdays
pd.Series(pd.DatetimeIndex(pivoted.T[labels==1].index).strftime('%a')).value_counts()[::-1].plot(kind='barh');

_images/0.1-cristian-Chicago_Crime_28_0.png

Weekend behaviors¶

In [23]:

# filters the columns with labels of array of 0s and 1s
pivoted.T[labels==0].T.plot(legend=False, alpha = 0.1,  figsize=(16,9));
plt.xlim(datetime.time(3, 0),datetime.time(23, 59));

_images/0.1-cristian-Chicago_Crime_30_0.png

In [24]:

# Isolates weekends
pd.Series(pd.DatetimeIndex(pivoted.T[labels==0].index).strftime('%a')).value_counts()[::-1].plot(kind='barh');

_images/0.1-cristian-Chicago_Crime_31_0.png

Some days weekends behave like weekdays, and some weekdays behave like weekends¶

In [38]:

# Solve for Tue
Tue_index  = pd.DatetimeIndex(pivoted.T[labels==0].index).strftime('%a')=='Tue'

In [39]:

# All Tuesdays that behave like Weekends
pd.DatetimeIndex(pivoted.T[labels==0].index)[Tue_index]

Out[39]:

DatetimeIndex(['2010-01-05', '2010-01-12', '2010-01-19', '2010-03-09',
               '2011-02-22', '2012-12-25', '2013-01-01', '2013-12-24',
               '2017-07-04', '2018-09-18'],
              dtype='datetime64[ns]', freq=None)

In [43]:

# Solve for Sun
Sun_index  = pd.DatetimeIndex(pivoted.T[labels==1].index).strftime('%a')=='Sun'

In [44]:

# All Sundays that behave like Weekdays
pd.DatetimeIndex(pivoted.T[labels==1].index)[Sun_index]

Out[44]:

DatetimeIndex(['2017-04-30', '2017-10-22'], dtype='datetime64[ns]', freq=None)