from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup as bs
import requests
import re
# Load the LA Times homicide listing in Chrome, scroll until the page stops
# growing, and collect every teaser block from the fully loaded page.
chrome_options = Options()
chrome_options.add_argument('disable_infobars')
# Start the WebDriver and load the page
wd = webdriver.Chrome(options=chrome_options)
try:
    wd.delete_all_cookies()
    wd.set_page_load_timeout(30)
    wd.get('https://homicide.latimes.com/age/0/year/all')
    time.sleep(10)  # give the first batch of results time to render
    html_page = wd.page_source
    soup = bs(html_page, 'lxml')
    # NOTE(review): the teaser markup shown in the cell output is a <div>, so
    # this <article> search presumably matches nothing; it is kept unchanged
    # because `result1` is concatenated with `results` further down.
    result1 = soup.find_all('article', class_='post-teaser row')

    last_height = 0
    while True:
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(5)  # wait for the next batch to load before measuring
        # Break condition: the document stopped growing after a scroll.
        new_height = wd.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse once, after scrolling has finished, instead of on every pass.
    html_page = wd.page_source
    soup2 = bs(html_page, 'lxml')
finally:
    # Always release the browser, even if loading or scrolling raised.
    wd.quit()

results = soup2.find_all('div', class_='post-teaser row')
print(len(results))
results[0]
450
<div class="post-teaser row">
<aside class="span2 offset1 hidden-phone">
<img class="post-list-thumb"
src="http://dev.virtualearth.net/REST/v1/Imagery/Map/Road/33.9704218958,-118.298415884/12?mapSize=135%2C135&key=AqEQ_HqipMZe_KnMeHEJ_CEtkNG9Y34_aXaGeIya4fBtc4hTIA9KYzMfFaK5nbK5&pushpin=33.9704218958%2C-118.298415884%3B88&format=jpeg"/>
<div class="post-list-badge">
Died on
<div class="death-date">Feb. 23</div>
<ul class="badge-location">
<li>
<a href="/neighborhood/vermont-knolls/year/all">
Vermont Knolls
</a>
</li>
<li>1307 W. 77th St.</li>
</ul>
</div>
</aside>
<article class="span8 post-list-content">
<hgroup>
<h2><a href="/post/baby-girl-greeves/">Baby Girl Reeves
</a></h2>
<p class="post-date">Posted Feb. 23, 2017</p>
</hgroup>
<div class="body">
<p> Baby Girl Reeves, a <a href="/race/black/year/all">black</a>
<a href="/gender/female/year/all">female</a>, died Thursday,
Feb. 23, after being <a href="/cause/gunshot/year/all">shot</a> in
<a href="/neighborhood/vermont-knolls/year/all">Vermont Knolls</a>,
according to Los Angeles County Medical Examiner-Coroner's records. </p>
</div>
</article>
</div>
import urllib
import numpy as np
from urllib.parse import urljoin
# `import urllib` alone does not guarantee the `request` submodule is loaded.
import urllib.request

BASE_URL = 'https://homicide.latimes.com/'
url = []  # retained from the original cell; not read in this section
uls = []  # retained from the original cell; not read in this section
clist = []

# Number of <li> entries read, by position, from a post page's "aspects"
# list: hood, address, age, sex, cause, race, agency.
_ASPECT_COUNT = 7

# Compiled once instead of once per teaser.
_HREF_RACE = re.compile("race")
_HREF_GENDER = re.compile("gender")
_HREF_CAUSE = re.compile("cause")
_HREF_POST = re.compile("post")


def _text_of(tag, strip=True):
    """Return the text of a BeautifulSoup tag, or '' when the tag is None."""
    if tag is None:
        return ''
    return tag.text.strip() if strip else tag.text


def _location_parts(teaser):
    """Return the first badge-location <li> of a teaser as cleaned parts.

    The <li> text is split on commas; newlines are removed and surrounding
    spaces stripped from each part. Returns [] when no such <li> exists.
    """
    parts = []
    badge_lists = teaser.find_all(
        "ul",
        {"class": lambda value: value and value.startswith('badge-location')},
    )
    for badge in badge_lists:
        items = badge.find_all("li")
        if items:
            parts = [
                piece.replace("\n", "").strip(' ')
                for piece in items[0].text.split(',')
            ]
    return parts


def _coords_of(teaser):
    """Return the path segment at index 8 of the teaser image URL, or ''."""
    try:
        return teaser.img['src'].split("/")[8]
    except (IndexError, TypeError, KeyError):
        # No <img>, no src attribute, or a URL with fewer path segments.
        return ''


def _fetch_post_details(post_url):
    """Download one post page and return (article_text, aspect_fields).

    article_text is the text of every <p> inside <section class="body">
    with newlines removed. aspect_fields always has _ASPECT_COUNT entries,
    padded with '' so a short list never leaks values from a previous record.
    """
    with urllib.request.urlopen(post_url) as response:
        page = bs(response.read(), 'lxml')

    body = page.find('section', class_="body")
    if body is None:
        article_text = ''
    else:
        article_text = ''.join(
            ''.join(paragraph.find_all(string=True))
            for paragraph in body.find_all('p')
        ).replace("\n", "")

    aspects = page.find('ul', class_="aspects")
    items = aspects.find_all('li') if aspects is not None else []
    fields = [item.text for item in items[:_ASPECT_COUNT]]
    fields += [''] * (_ASPECT_COUNT - len(fields))
    return article_text, fields


# Build one record per teaser: listing-page fields plus the linked post page.
for r in results + result1:
    cname = _location_parts(r)
    death_date = _text_of(r.find('div', 'death-date'))
    name = _text_of(r.find("h2"), strip=False).replace("\n", "")
    coords = _coords_of(r)
    post_date = _text_of(r.find('p', 'post-date'))
    race = _text_of(r.find("a", href=_HREF_RACE))
    gender = _text_of(r.find("a", href=_HREF_GENDER))
    cause = _text_of(r.find("a", href=_HREF_CAUSE))
    s_descript = _text_of(r.find('div', 'body'))

    post_link = r.find('a', href=_HREF_POST)
    if post_link is None:
        # No detail page to follow; keep the record with empty detail fields.
        article_text, aspect_fields = '', [''] * _ASPECT_COUNT
    else:
        article_text, aspect_fields = _fetch_post_details(
            urljoin(BASE_URL, post_link.attrs['href']))
    hood, address, age, sex, causes, brace, agency = aspect_fields

    clist.append((cname, death_date, post_date, name, race, gender, cause,
                  coords, s_descript, article_text, age, sex, causes, brace,
                  agency, hood, address))
import pandas as pd
# Turn the scraped tuples into a DataFrame, normalise its text columns,
# derive date and coordinate columns, and save the result as CSV.
df = pd.DataFrame(clist, columns=[
    'c_name', 'death_date', 'post_date', 'name', 'race', 'gender', 'cause',
    'coords', 's_descript', 'article_text', 'age', 'sex', 'causes', 'brace',
    'agency', 'hood', 'address',
])

# Remove the literal labels that precede each scraped value.
_label_by_column = {
    'post_date': 'Posted',
    'age': 'Age:',
    'causes': 'Cause:',
    'brace': 'Race/Ethnicity:',
    'sex': 'Gender:',
    'agency': 'Agency:',
    'hood': 'href',
}
for _column, _label in _label_by_column.items():
    df[_column] = df[_column].str.replace(_label, '', regex=False)

# `.str.replace` removes newlines inside values; plain `Series.replace`
# would only match cells that are exactly "\n".
df['sex'] = df['sex'].str.replace("\n", "", regex=False).str.strip()
# Assign results back instead of using inplace=True on a column selection,
# which is not guaranteed to update the parent frame.
df['sex'] = df['sex'].replace('', np.nan)
df['sex'] = df['sex'].fillna(df['gender'])
df['sex'] = df['sex'].str.lower()  # NaN-safe, unlike map(lambda x: x.lower())

df['post_date'] = pd.to_datetime(df['post_date'])
df['year'] = df['post_date'].dt.year
df['month'] = df['post_date'].dt.month
df['day'] = df['post_date'].dt.day

# reindex guarantees two columns even when no row contains a comma;
# rows without a comma get NaN in the second column.
_name_parts = df['name'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
df['names'] = _name_parts[0]
df['age'] = _name_parts[1]  # overwrites the detail-page age with the one from the headline

_coord_parts = df['coords'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
df['y'] = _coord_parts[0]
df['x'] = _coord_parts[1]

from pandas import ExcelWriter
df.to_csv('child_deaths_all.csv', sep=',')
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
# Line chart of the number of records per posting year.
f, ax = plt.subplots(1, figsize=(6, 6))
df_plot = df.groupby('year').size()
df_plot.plot(kind="line", ax=ax)

# Keep only rows that have both coordinates. The explicit copy makes
# df_complete independent of df, so the column assignments performed on it
# later neither warn nor write through to df.
df_complete = df.dropna(subset=['x', 'y']).copy()

# Frequency tables (shown only when evaluated interactively).
df_complete["sex"].value_counts()
df_complete["brace"].value_counts()
df_complete["age"].value_counts()
2 41
1 40
3 24
4 19
5 11
unborn 5
newborn 3
2 [Updated] 3
six months 2
two months 2
1 [Update] 2
6 months 1
5 [Update] 1
3 [Update] 1
3 [Updated] 1
0 1
1 [Updated] 1
four months 1
three months 1
9-month-old baby [Updated] 1
2 [Update] 1
1-month old baby [Update] 1
fetus 1
infant 1
baby 1
newborn [Updated] 1
Name: age, dtype: int64
# Normalise the free-text "age" column to whole years (NaN when missing).
_raw_age = df_complete["age"]
# Ages given in months (e.g. "1-month old baby") are under one year old, so
# they must not be read as that many years.
_in_months = _raw_age.str.contains('month', case=False, na=False)
# regex=True is required: newer pandas treats the pattern literally by default.
_age_digits = _raw_age.str.replace(r'\D+', '', regex=True)
_age_digits = _age_digits.mask(_in_months, '0')
# Values greater than five and null are age 0
_age_digits = _age_digits.replace({'9': '0', '6': '0', '': '0'})
# -999 is a temporary stand-in for missing values so the column can be cast to int.
df_complete["age"] = _age_digits.fillna(-999).astype(int)
df_complete["age"].value_counts()
df_complete["age"] = df_complete["age"].replace(to_replace=-999, value=np.nan)
from plotnine import *
%matplotlib inline
(ggplot(df_complete) +
aes(x='age') +
geom_histogram(binwidth=1))
%matplotlib inline
(ggplot(df) +
aes(x='brace') +
geom_bar(size=20) +
coord_flip() +
ggtitle('Race of Children Who Died')
)
(ggplot(df) +
aes(x='causes') +
geom_bar(size=20) +
coord_flip() +
ggtitle('Cause of Death')
)
(ggplot(df) +
aes(x='sex') +
geom_bar(size=20) +
coord_flip() +
ggtitle('Gender of Child')
)
import numpy as np # linear algebra
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
# Draw a word cloud of the article texts, ignoring a few very common words.
stop_words = ["died", "old", "year"] + list(STOPWORDS)

mpl.rcParams['figure.figsize'] = (8.0, 6.0)  # (6.0,4.0)
mpl.rcParams['font.size'] = 15  # 10
mpl.rcParams['savefig.dpi'] = 100  # 72
mpl.rcParams['figure.subplot.bottom'] = .1

stopwords = set(STOPWORDS)

# Join the full text of every article. str(Series) would only yield the
# truncated printed preview of the column (index numbers, "..." and the
# dtype footer) rather than the articles themselves.
article_corpus = ' '.join(df['article_text'].dropna().astype(str))

wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=300,
    max_font_size=40,
    random_state=42,  # fixed seed so the layout is reproducible
).generate(article_corpus)

print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#fig.savefig("word1.png", dpi=900)
import os
import sys
# Directory of the running Python interpreter (displayed only when
# evaluated interactively).
os.path.dirname(sys.executable)

# Convert the coordinate strings to numbers; values that cannot be parsed
# become NaN.
for _axis_column in ('y', 'x'):
    df_complete[_axis_column] = pd.to_numeric(
        df_complete[_axis_column], errors='coerce')

# Quick look at the raw point pattern.
df_complete.plot.scatter(x='x', y='y', color='red')
plt.show()
import geopandas as gpd
# Load the LA County neighbourhood polygons and report their coordinate
# reference system. The path is written as two adjacent literals: the
# original single literal was broken across lines, which is a syntax error.
shape = gpd.read_file(
    '/home/barbozag/Downloads/la-county-neighborhoods-v5/'
    'l.a. county neighborhood (v5).shp'
)
print(shape.crs)
{‘init’: ‘epsg:4269’}
import geopandas as gpd
from shapely.geometry import Point
# Turn each (x, y) pair into a point geometry and wrap the records in a
# GeoDataFrame.
geometry = [Point(xy) for xy in zip(df_complete.x, df_complete.y)]
#df_complete = df_complete.drop(['x', 'y'], axis=1)
# Authority string instead of the deprecated {'init': 'epsg:4326'} mapping.
crs = 'EPSG:4326'
gdf = gpd.GeoDataFrame(df_complete, crs=crs, geometry=geometry)

# Points on their own.
f, ax = plt.subplots(1, figsize=(12, 12))
ax = gdf.plot(ax=ax)

# Points drawn over the neighbourhood outlines.
# NOTE(review): the printed CRS of `shape` (epsg:4269) differs from the one
# assigned to the points here; consider gdf.to_crs(shape.crs) before
# overlaying if exact alignment matters - confirm.
f, ax = plt.subplots(1, figsize=(12, 12))
ax = shape.plot(ax=ax, edgecolor='black', linewidth=1)
ax = gdf.plot(ax=ax, color='yellow')
ax.set_axis_off()
f.suptitle('Location of Child Deaths in LA County')
plt.show()
# Count records per neighbourhood after normalising the names to
# lowercase strings.
_hood_lower = gdf['hood'].astype(str).str.lower()
gdf['hood'] = _hood_lower
_per_hood = gdf.groupby('hood').size()
gdf_t = _per_hood.to_frame()
import pysal as ps
import seaborn as sns
# Keyword settings shared by both density plots.
_kde_common = dict(shade=False, shade_lowest=True)
_xs, _ys = gdf['x'], gdf['y']

# Stand-alone density contours of the point coordinates.
# NOTE(review): these kdeplot arguments (positional x/y, n_levels, shade,
# shade_lowest) look like the older seaborn interface - confirm against the
# installed version.
sns.kdeplot(_xs, _ys, n_levels=10, cmap="Blues", **_kde_common)

# Density contours drawn on top of the neighbourhood outlines.
f, ax = plt.subplots(1, figsize=(12, 12))
f.patch.set_facecolor('xkcd:white')
shape.plot(ax=ax, color='white', edgecolor='black', linewidth=.1)
sns.kdeplot(_xs, _ys, n_levels=35, cmap="Reds", linewidth=12, ax=ax,
            **_kde_common)
ax.set_axis_off()
plt.axis('equal')
plt.show()