Child Homicide Project

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup as bs
import requests
import re

# Scrape the listing page: open it in Chrome, scroll until no more teasers
# are lazy-loaded, then parse the fully expanded page once.
chrome_options = Options()
chrome_options.add_argument('disable_infobars')

# Seconds to wait after each scroll so newly requested teasers can render
# before the page height is measured again.
SCROLL_PAUSE_SECONDS = 5

# Start the WebDriver and load the page
wd = webdriver.Chrome(options=chrome_options)
try:
    wd.delete_all_cookies()
    wd.set_page_load_timeout(30)

    wd.get('https://homicide.latimes.com/age/0/year/all')
    time.sleep(10)

    # Snapshot of the page before any scrolling.
    html_page = wd.page_source
    soup = bs(html_page, 'lxml')
    # NOTE(review): the sample teaser markup in this file carries the
    # "post-teaser row" class on a <div>, so this <article> search may
    # return nothing -- confirm against the live page.
    result1 = soup.find_all('article', class_='post-teaser row')

    last_height = 0

    while True:
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(SCROLL_PAUSE_SECONDS)

        # break condition: the page stopped growing, so nothing new loaded
        new_height = wd.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse once, after scrolling has finished; only the final, fully
    # expanded page is needed.
    html_page = wd.page_source
    soup2 = bs(html_page, 'lxml')
finally:
    # Always release the browser, even if loading or scrolling failed.
    wd.quit()

results = soup2.find_all('div', class_='post-teaser row')
print(len(results))

results[0]

450

<div class="post-teaser row">
<aside class="span2 offset1 hidden-phone">
<img class="post-list-thumb" 
src="http://dev.virtualearth.net/REST/v1/Imagery/Map/Road/33.9704218958,-118.298415884/12?mapSize=135%2C135&amp;key=AqEQ_HqipMZe_KnMeHEJ_CEtkNG9Y34_aXaGeIya4fBtc4hTIA9KYzMfFaK5nbK5&amp;pushpin=33.9704218958%2C-118.298415884%3B88&amp;format=jpeg"/>
<div class="post-list-badge">
                Died on
                <div class="death-date">Feb. 23</div>
<ul class="badge-location">
<li>
<a href="/neighborhood/vermont-knolls/year/all">
                            Vermont Knolls
                        </a>
</li>
<li>1307 W. 77th St.</li>
</ul>
</div>
</aside>
<article class="span8 post-list-content">
<hgroup>
<h2><a href="/post/baby-girl-greeves/">Baby Girl Reeves
</a></h2>
<p class="post-date">Posted Feb. 23, 2017</p>
</hgroup>
<div class="body">
<p> Baby Girl Reeves, a <a href="/race/black/year/all">black</a> 
<a href="/gender/female/year/all">female</a>, died Thursday, 
Feb. 23, after being <a href="/cause/gunshot/year/all">shot</a> in 
<a href="/neighborhood/vermont-knolls/year/all">Vermont Knolls</a>, 
according to Los Angeles County Medical Examiner-Coroner's records. </p>
</div>
</article>
</div>

import urllib
import numpy as np
from urllib.parse import urljoin
BASE_URL = 'https://homicide.latimes.com/'

# Build one record per teaser: fields scraped from the listing teaser itself
# plus the article text and the "aspects" list from the linked post page.

# Compiled once; the same patterns are applied to every teaser.
_RACE_HREF = re.compile("race")
_GENDER_HREF = re.compile("gender")
_CAUSE_HREF = re.compile("cause")
_POST_HREF = re.compile("post")

# The post page's aspects list is read positionally as:
# hood, address, age, sex, causes, brace, agency.
_ASPECT_COUNT = 7


def _stripped_text(tag):
    """Return ``tag.text`` without surrounding whitespace, or '' if ``tag`` is None."""
    return tag.text.strip() if tag is not None else ''


def _teaser_location(teaser):
    """Return the first ``badge-location`` entry of a teaser, split on commas.

    Each part has newlines removed and surrounding spaces stripped. An empty
    list is returned when the teaser has no location badge or the badge has
    no ``<li>``.
    """
    badge = teaser.find(
        "ul", {"class": lambda c: c and c.startswith('badge-location')})
    if badge is None:
        return []
    first_item = badge.find("li")
    if first_item is None:
        return []
    return [part.replace("\n", "").strip(' ')
            for part in first_item.text.split(',')]


def _teaser_coords(teaser):
    """Return path segment 8 of the teaser thumbnail URL, or '' if unavailable.

    In the sample markup that segment is the ``lat,lon`` pair of the map image.
    """
    img = teaser.img
    if img is None:
        return ''
    segments = img.get('src', '').split("/")
    return segments[8] if len(segments) > 8 else ''


def _fetch_post_details(post_url):
    """Download a post page and return ``(article_text, aspects)``.

    ``article_text`` is the text of every ``<p>`` inside ``section.body``
    concatenated with newlines removed. ``aspects`` is a list of
    ``_ASPECT_COUNT`` strings taken from ``ul.aspects``; if the page lists
    fewer items than that, every entry is '' so that values never end up in
    the wrong column and never carry over from a previous post.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    with urllib.request.urlopen(post_url, timeout=30) as response:
        page = bs(response.read(), 'lxml')

    body = page.find('section', class_="body")
    paragraphs = body.findAll('p') if body is not None else []
    article_text = ''.join(
        ''.join(paragraph.findAll(text=True)) for paragraph in paragraphs
    ).replace("\n", "")

    aspect_list = page.find('ul', class_="aspects")
    items = aspect_list.findAll('li') if aspect_list is not None else []
    if len(items) < _ASPECT_COUNT:
        aspects = [''] * _ASPECT_COUNT
    else:
        aspects = [item.text for item in items[:_ASPECT_COUNT]]
    return article_text, aspects


clist = []

for r in results + result1:
    cname = _teaser_location(r)
    death_date = _stripped_text(r.find('div', 'death-date'))

    heading = r.find("h2")
    name = heading.text.replace("\n", "") if heading is not None else ''

    coords = _teaser_coords(r)

    post_date = _stripped_text(r.find('p', 'post-date'))
    race = _stripped_text(r.find("a", href=_RACE_HREF))
    gender = _stripped_text(r.find("a", href=_GENDER_HREF))
    cause = _stripped_text(r.find("a", href=_CAUSE_HREF))
    s_descript = _stripped_text(r.find('div', 'body'))

    # Follow the teaser's link to the full post for the article text and
    # the structured aspects list.
    link = r.find('a', href=_POST_HREF)
    if link is None:
        article_text, aspects = '', [''] * _ASPECT_COUNT
    else:
        article_text, aspects = _fetch_post_details(
            urljoin(BASE_URL, link.attrs['href']))
    hood, address, age, sex, causes, brace, agency = aspects

    clist.append((cname, death_date, post_date, name, race, gender, cause,
                  coords, s_descript, article_text, age, sex, causes, brace,
                  agency, hood, address))

import pandas as pd

# One row per scraped record; column order matches the tuples in `clist`.
df = pd.DataFrame(clist, columns=['c_name', 'death_date', 'post_date', 'name', 'race', 'gender', 'cause',
                                  'coords', 's_descript', 'article_text', 'age', 'sex', 'causes', 'brace',
                                  'agency', 'hood', 'address'])

# Remove the literal field labels that precede each scraped value.
label_by_column = {
    'post_date': 'Posted',
    'age': 'Age:',
    'causes': 'Cause:',
    'brace': 'Race/Ethnicity:',
    'sex': 'Gender:',
    'agency': 'Agency:',
}
for column, label in label_by_column.items():
    df[column] = df[column].str.replace(label, '', regex=False)

# Normalise `sex`: trim whitespace, fall back to the teaser's `gender` when
# the post page gave nothing, then lower-case. Results are assigned back
# rather than using inplace=True on a column selection, and `.str.lower()`
# leaves missing values alone instead of raising.
df['sex'] = df['sex'].str.strip().replace('', np.nan).fillna(df['gender'])
df['hood'] = df['hood'].str.replace('href', '', regex=False)
df['sex'] = df['sex'].str.lower()

# Parse the posting date and break it into calendar parts.
df['post_date'] = pd.to_datetime(df['post_date'])
df['year'] = df['post_date'].dt.year
df['month'] = df['post_date'].dt.month
df['day'] = df['post_date'].dt.day

# Headline is split at the first comma into `names` and `age`; note that this
# overwrites the `age` column scraped from the post page.
# reindex guarantees both halves exist even when no row contains a comma.
name_parts = df['name'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
df['names'] = name_parts[0]
df['age'] = name_parts[1]

# `coords` is "first,second"; the first value is stored as y, the second as x.
coord_parts = df['coords'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
df['y'] = coord_parts[0]
df['x'] = coord_parts[1]

from pandas import ExcelWriter
df.to_csv('child_deaths_all.csv', sep=',')

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates

# Line chart of the number of records per posting year.
f, ax = plt.subplots(1, figsize=(6, 6))

df_plot = df.groupby('year').size()
df_plot.plot(kind="line", ax = ax)

# Keep only rows that have both coordinates. The explicit copy makes
# df_complete an independent frame, so later column assignments on it are
# unambiguous and do not trigger SettingWithCopyWarning.
df_complete = df.dropna(subset=['x', 'y']).copy()

# Frequency tables for inspection (displayed when run as notebook cells).
df_complete["sex"].value_counts()
df_complete["brace"].value_counts()
df_complete["age"].value_counts()

 2                             41
 1                             40
 3                             24
 4                             19
 5                             11
 unborn                         5
 newborn                        3
 2 [Updated]                    3
 six months                     2
 two months                     2
 1 [Update]                     2
 6 months                       1
 5 [Update]                     1
 3 [Update]                     1
 3 [Updated]                    1
 0                              1
 1 [Updated]                    1
 four months                    1
 three months                   1
 9-month-old baby [Updated]     1
 2 [Update]                     1
 1-month old baby [Update]      1
 fetus                          1
 infant                         1
 baby                           1
 newborn [Updated]              1
Name: age, dtype: int64

# Convert the free-text age (the part of the headline after the comma, e.g.
# " 2 [Updated]", " six months", " 1-month old baby") to a number of years.
raw_age = df_complete["age"]
age_missing = raw_age.isna()

# Keep only the digits. regex=True must be explicit: recent pandas treats the
# pattern as a literal string by default and would strip nothing.
digits = raw_age.str.replace(r'\D+', '', regex=True)

# Text without any digit ("unborn", "newborn", "six months", ...) counts as 0.
age_years = pd.to_numeric(digits, errors='coerce').fillna(0)

# Values greater than five and null are age 0
age_years = age_years.mask(age_years > 5, 0)
# Any age expressed in months is under one year, including small counts
# such as "1-month old baby" that the digit extraction would read as 1.
age_years = age_years.mask(raw_age.str.contains('month', case=False, na=False), 0)
# Rows that had no age text at all stay missing.
age_years = age_years.mask(age_missing)

df_complete["age"] = age_years

df_complete["age"].value_counts()

# Summary charts drawn with plotnine (ggplot-style API).
# NOTE: `%matplotlib inline` is an IPython magic, not Python syntax, so this
# section only runs inside a notebook/IPython session. Each parenthesised
# ggplot(...) below is a bare expression; presumably it is shown through the
# notebook's display of an expression value -- confirm if run elsewhere.
from plotnine import *
%matplotlib inline


# Histogram of the cleaned `age` column in df_complete, one unit per bin.
(ggplot(df_complete) + 
    aes(x='age') +
    geom_histogram(binwidth=1))

%matplotlib inline

# Bar chart of record counts per `brace` value, axes flipped by coord_flip().
(ggplot(df) + 
   aes(x='brace') +
   geom_bar(size=20) + 
   coord_flip() +
   ggtitle('Race of Children Who Died')
)

# Bar chart of record counts per `causes` value, axes flipped.
(ggplot(df) + 
   aes(x='causes') +
   geom_bar(size=20) + 
   coord_flip() +
   ggtitle('Cause of Death')
)

# Bar chart of record counts per `sex` value, axes flipped.
(ggplot(df) + 
   aes(x='sex') +
   geom_bar(size=20) + 
   coord_flip() +
   ggtitle('Gender of Child')
)

import numpy as np # linear algebra
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

stop_words = ["died", "old", "year"] + list(STOPWORDS)
mpl.rcParams['figure.figsize']=(8.0,6.0)    #(6.0,4.0)
mpl.rcParams['font.size']=15                #10 
mpl.rcParams['savefig.dpi']=100             #72 
mpl.rcParams['figure.subplot.bottom']=.1 


stopwords = set(STOPWORDS)

wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stop_words,
                          max_words=300,
                          max_font_size=40, 
                          random_state=42
                         ).generate(str(df['article_text']))

print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#fig.savefig("word1.png", dpi=900)

import os
import sys
os.path.dirname(sys.executable)

# The coordinate columns were produced by string splitting; convert both to
# numbers, turning anything unparseable into NaN.
for coord_column in ('y', 'x'):
    df_complete[coord_column] = pd.to_numeric(
        df_complete[coord_column], errors='coerce')

# Quick scatter of the raw coordinates as a sanity check.
df_complete.plot(kind='scatter', x='x', y='y', color='red')
plt.show()

import geopandas as gpd

# Load the L.A. County neighbourhood boundaries. The path is written as two
# adjacent literals inside the parentheses so it forms one string; a single
# quoted string cannot continue across a physical line break.
shape = gpd.read_file(
    '/home/barbozag/Downloads/la-county-neighborhoods-v5/'
    'l.a. county neighborhood (v5).shp'
)

# Show the coordinate reference system the shapefile declares.
print(shape.crs)

{'init': 'epsg:4269'}

import geopandas as gpd
from shapely.geometry import Point

# Turn each (x, y) pair into a point geometry and attach the points to the
# records, declaring them as EPSG:4326 coordinates.
crs = {'init': 'epsg:4326'}
geometry = list(map(Point, zip(df_complete.x, df_complete.y)))
gdf = gpd.GeoDataFrame(df_complete, crs=crs, geometry=geometry)

# Plot the points on their own.
f, ax = plt.subplots(1, figsize=(12, 12))
ax = gdf.plot(ax=ax)

img7a

# Draw the neighbourhood outlines, then the death locations on the same axes.
f, ax = plt.subplots(1, figsize=(12, 12))
shape.plot(ax=ax, edgecolor='black', linewidth=1)
gdf.plot(ax=ax, color='yellow')

ax.set_axis_off()
f.suptitle('Location of Child Deaths in LA County')
plt.show()

# Count records per neighbourhood (names compared case-insensitively).
hood_lower = gdf['hood'].astype(str).str.lower()
gdf['hood'] = hood_lower
hood_sizes = gdf.groupby('hood').size()
gdf_t = hood_sizes.to_frame()

import pysal as ps
import seaborn as sns

# Unfilled 2-D kernel density contours of the point coordinates.
kde_style = dict(n_levels=10, cmap="Blues", shade=False, shade_lowest=True)
sns.kdeplot(gdf['x'], gdf['y'], **kde_style)

img8a

# Density contours of the death locations drawn over the neighbourhood map.
f, ax = plt.subplots(1, figsize=(12, 12))
f.patch.set_facecolor('xkcd:white')

# Base layer: white polygons with thin black borders.
shape.plot(ax=ax, color='white', edgecolor='black', linewidth=.1)

# Contour layer on the same axes.
contour_style = dict(
    n_levels=35,
    cmap="Reds",
    shade=False,
    shade_lowest=True,
    linewidth=12,
    ax=ax,
)
sns.kdeplot(gdf['x'], gdf['y'], **contour_style)

ax.set_axis_off()
plt.axis('equal')
plt.show()

Homicide LA Times web scraping

Gia Elise Barboza-Salerno

Assistant Professor in the School of Criminal Justice and Public Administration

My research interests include applied spatial policy and analysis, child welfare and criminal justice system reform, victimization by bullying, and domestic abuse.