{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading Datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook shows an example of how to load a dataset. \n",
"It assumes you found the dataset using techniques shown in `finding_datasets.ipynb`\n",
"The basic steps it demonstrates to load data is:\n",
"1. Find available datasets with `opd.datasets.query`\n",
"2. Create a data source using `opd.Source` and information from the previous step.\n",
"3. Find available data types for given years using `get_tables_types` and `get_years`\n",
"4. Load the data type for a given year using `load`"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import openpolicedata as opd #This import should be last in the try block because the expect block will only try to load it\n",
"except:\n",
" import sys\n",
" sys.path.append('../openpolicedata')\n",
" import openpolicedata as opd"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" State | \n",
" SourceName | \n",
" Agency | \n",
" TableType | \n",
" Year | \n",
" Description | \n",
" DataType | \n",
" URL | \n",
" date_field | \n",
" dataset_id | \n",
" agency_field | \n",
" min_version | \n",
" readme | \n",
"
\n",
" \n",
" \n",
" \n",
" | 5 | \n",
" Maryland | \n",
" Montgomery County | \n",
" Montgomery County | \n",
" TRAFFIC STOPS | \n",
" MULTI | \n",
" This dataset contains traffic violation inform... | \n",
" Socrata | \n",
" data.montgomerycountymd.gov | \n",
" date_of_stop | \n",
" 4mse-ku6q | \n",
" <NA> | \n",
" <NA> | \n",
" https://data.montgomerycountymd.gov/Public-Saf... | \n",
"
\n",
" \n",
" | 56 | \n",
" Maryland | \n",
" Maryland | \n",
" MULTI | \n",
" TRAFFIC STOPS | \n",
" MULTI | \n",
" Standardized stop data from the Stanford Open ... | \n",
" CSV | \n",
" https://stacks.stanford.edu/file/druid:yg821jf... | \n",
" date | \n",
" <NA> | \n",
" department_name | \n",
" <NA> | \n",
" https://github.com/stanford-policylab/opp/blob... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" State SourceName Agency TableType Year \\\n",
"5 Maryland Montgomery County Montgomery County TRAFFIC STOPS MULTI \n",
"56 Maryland Maryland MULTI TRAFFIC STOPS MULTI \n",
"\n",
" Description DataType \\\n",
"5 This dataset contains traffic violation inform... Socrata \n",
"56 Standardized stop data from the Stanford Open ... CSV \n",
"\n",
" URL date_field \\\n",
"5 data.montgomerycountymd.gov date_of_stop \n",
"56 https://stacks.stanford.edu/file/druid:yg821jf... date \n",
"\n",
" dataset_id agency_field min_version \\\n",
"5 4mse-ku6q \n",
"56 department_name \n",
"\n",
" readme \n",
"5 https://data.montgomerycountymd.gov/Public-Saf... \n",
"56 https://github.com/stanford-policylab/opp/blob... "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We will load Montgormery County, Maryland traffic stop data. First show our dataset options.\n",
"df = opd.datasets.query(table_type='TRAFFIC STOPS', state=\"Maryland\")\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" State | \n",
" SourceName | \n",
" Agency | \n",
" TableType | \n",
" Year | \n",
" Description | \n",
" DataType | \n",
" URL | \n",
" date_field | \n",
" dataset_id | \n",
" agency_field | \n",
" min_version | \n",
" readme | \n",
"
\n",
" \n",
" \n",
" \n",
" | 5 | \n",
" Maryland | \n",
" Montgomery County | \n",
" Montgomery County | \n",
" TRAFFIC STOPS | \n",
" MULTI | \n",
" This dataset contains traffic violation inform... | \n",
" Socrata | \n",
" data.montgomerycountymd.gov | \n",
" date_of_stop | \n",
" 4mse-ku6q | \n",
" <NA> | \n",
" <NA> | \n",
" https://data.montgomerycountymd.gov/Public-Saf... | \n",
"
\n",
" \n",
" | 6 | \n",
" Maryland | \n",
" Montgomery County | \n",
" Montgomery County | \n",
" COMPLAINTS | \n",
" MULTI | \n",
" This dataset contains allegations brought to t... | \n",
" Socrata | \n",
" data.montgomerycountymd.gov | \n",
" created_dt | \n",
" usip-62e2 | \n",
" <NA> | \n",
" <NA> | \n",
" https://data.montgomerycountymd.gov/Public-Saf... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" State SourceName Agency TableType Year \\\n",
"5 Maryland Montgomery County Montgomery County TRAFFIC STOPS MULTI \n",
"6 Maryland Montgomery County Montgomery County COMPLAINTS MULTI \n",
"\n",
" Description DataType \\\n",
"5 This dataset contains traffic violation inform... Socrata \n",
"6 This dataset contains allegations brought to t... Socrata \n",
"\n",
" URL date_field dataset_id agency_field \\\n",
"5 data.montgomerycountymd.gov date_of_stop 4mse-ku6q \n",
"6 data.montgomerycountymd.gov created_dt usip-62e2 \n",
"\n",
" min_version readme \n",
"5 https://data.montgomerycountymd.gov/Public-Saf... \n",
"6 https://data.montgomerycountymd.gov/Public-Saf... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To access the data, create a source using a Source Name (usually a police department name). There is an optional state input to clarify ambiguities.\n",
"# We will use the above cell's information for Maryland to choose the agency \"Montgomery County\" which we select for the source_name\n",
"\n",
"src = opd.Source(source_name=\"Montgomery County\", state=\"Maryland\")\n",
"src.datasets.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['TRAFFIC STOPS', 'COMPLAINTS']\n"
]
}
],
"source": [
"# Find out what types of data are available from this source\n",
"types = src.get_tables_types()\n",
"\n",
"print(types)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]\n"
]
}
],
"source": [
"# Find out what years are available from the stops table\n",
"# IF you do not have a key setup you may see the message: \"WARNING:root:Requests made without an app_token will be subject to strict throttling limits.\" This is normal.\n",
"years = src.get_years(table_type=types[0])\n",
"print(years)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Load traffic stop data for 2021\n",
"t = src.load(year=2021, table_type='TRAFFIC STOPS')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" geometry | \n",
" seq_id | \n",
" date_of_stop | \n",
" time_of_stop | \n",
" agency | \n",
" subagency | \n",
" description | \n",
" location | \n",
" latitude | \n",
" longitude | \n",
" ... | \n",
" driver_state | \n",
" dl_state | \n",
" arrest_type | \n",
" search_conducted | \n",
" search_outcome | \n",
" search_reason_for_stop | \n",
" search_disposition | \n",
" search_reason | \n",
" search_type | \n",
" search_arrest_reason | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" POINT (-77.27504 39.14653) | \n",
" 123add05-d3d2-428d-9932-66bc30831388 | \n",
" 2021-01-01 | \n",
" 23:03:00 | \n",
" MCP | \n",
" 5th District, Germantown | \n",
" DISPLAYING EXPIRED REGISTRATION PLATE ISSUED B... | \n",
" GREAT SENECA @ WSSC ENTRANCE | \n",
" 39.1465333333333 | \n",
" -77.2750433333333 | \n",
" ... | \n",
" MD | \n",
" MD | \n",
" Q - Marked Laser | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" POINT (-77.27504 39.14653) | \n",
" 123add05-d3d2-428d-9932-66bc30831388 | \n",
" 2021-01-01 | \n",
" 23:03:00 | \n",
" MCP | \n",
" 5th District, Germantown | \n",
" EXCEEDING POSTED MAXIMUM SPEED LIMIT: 64 MPH I... | \n",
" GREAT SENECA @ WSSC ENTRANCE | \n",
" 39.1465333333333 | \n",
" -77.2750433333333 | \n",
" ... | \n",
" MD | \n",
" MD | \n",
" Q - Marked Laser | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" POINT (-77.27504 39.14653) | \n",
" 123add05-d3d2-428d-9932-66bc30831388 | \n",
" 2021-01-01 | \n",
" 23:03:00 | \n",
" MCP | \n",
" 5th District, Germantown | \n",
" KNOWINGLY DRIVING UNINSURED VEHICLE | \n",
" GREAT SENECA @ WSSC ENTRANCE | \n",
" 39.1465333333333 | \n",
" -77.2750433333333 | \n",
" ... | \n",
" MD | \n",
" MD | \n",
" Q - Marked Laser | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 3 | \n",
" POINT (-77.27285 39.14366) | \n",
" 1b7c9229-d80f-4ed2-9692-d24a6fbda5c7 | \n",
" 2021-01-01 | \n",
" 22:43:00 | \n",
" MCP | \n",
" 5th District, Germantown | \n",
" DRIVING VEHICLE IN EXCESS OF REASONABLE AND PR... | \n",
" GREAT SENECA @ HORN POINT | \n",
" 39.1436583333333 | \n",
" -77.2728533333333 | \n",
" ... | \n",
" MD | \n",
" MD | \n",
" A - Marked Patrol | \n",
" No | \n",
" Warning | \n",
" 21-801(a) | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 4 | \n",
" POINT (-77.27405 39.17419) | \n",
" 0c6f50ae-d462-4356-8319-e1f035dc00fc | \n",
" 2021-01-01 | \n",
" 22:20:00 | \n",
" MCP | \n",
" 5th District, Germantown | \n",
" DRIVER CHANGING LANES WHEN UNSAFE | \n",
" 118 @ WALTERJOHNSON | \n",
" 39.174195 | \n",
" -77.274045 | \n",
" ... | \n",
" MD | \n",
" MD | \n",
" A - Marked Patrol | \n",
" No | \n",
" Warning | \n",
" 21-309(b) | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 43 columns
\n",
"
"
],
"text/plain": [
" geometry seq_id \\\n",
"0 POINT (-77.27504 39.14653) 123add05-d3d2-428d-9932-66bc30831388 \n",
"1 POINT (-77.27504 39.14653) 123add05-d3d2-428d-9932-66bc30831388 \n",
"2 POINT (-77.27504 39.14653) 123add05-d3d2-428d-9932-66bc30831388 \n",
"3 POINT (-77.27285 39.14366) 1b7c9229-d80f-4ed2-9692-d24a6fbda5c7 \n",
"4 POINT (-77.27405 39.17419) 0c6f50ae-d462-4356-8319-e1f035dc00fc \n",
"\n",
" date_of_stop time_of_stop agency subagency \\\n",
"0 2021-01-01 23:03:00 MCP 5th District, Germantown \n",
"1 2021-01-01 23:03:00 MCP 5th District, Germantown \n",
"2 2021-01-01 23:03:00 MCP 5th District, Germantown \n",
"3 2021-01-01 22:43:00 MCP 5th District, Germantown \n",
"4 2021-01-01 22:20:00 MCP 5th District, Germantown \n",
"\n",
" description \\\n",
"0 DISPLAYING EXPIRED REGISTRATION PLATE ISSUED B... \n",
"1 EXCEEDING POSTED MAXIMUM SPEED LIMIT: 64 MPH I... \n",
"2 KNOWINGLY DRIVING UNINSURED VEHICLE \n",
"3 DRIVING VEHICLE IN EXCESS OF REASONABLE AND PR... \n",
"4 DRIVER CHANGING LANES WHEN UNSAFE \n",
"\n",
" location latitude longitude ... \\\n",
"0 GREAT SENECA @ WSSC ENTRANCE 39.1465333333333 -77.2750433333333 ... \n",
"1 GREAT SENECA @ WSSC ENTRANCE 39.1465333333333 -77.2750433333333 ... \n",
"2 GREAT SENECA @ WSSC ENTRANCE 39.1465333333333 -77.2750433333333 ... \n",
"3 GREAT SENECA @ HORN POINT 39.1436583333333 -77.2728533333333 ... \n",
"4 118 @ WALTERJOHNSON 39.174195 -77.274045 ... \n",
"\n",
" driver_state dl_state arrest_type search_conducted search_outcome \\\n",
"0 MD MD Q - Marked Laser NaN NaN \n",
"1 MD MD Q - Marked Laser NaN NaN \n",
"2 MD MD Q - Marked Laser NaN NaN \n",
"3 MD MD A - Marked Patrol No Warning \n",
"4 MD MD A - Marked Patrol No Warning \n",
"\n",
" search_reason_for_stop search_disposition search_reason search_type \\\n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 21-801(a) NaN NaN NaN \n",
"4 21-309(b) NaN NaN NaN \n",
"\n",
" search_arrest_reason \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
"[5 rows x 43 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The loaded table is stored in the table parameter as a pandas DataFrame (https://pandas.pydata.org/docs/user_guide/10min.html#min)\n",
"# Show the first 5 rows of the table\n",
"t.table.head(n=5)\n",
"# Now you are ready for analyzing the data in the table t."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('opd')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}