from datetime import datetime
from urllib.request import urlopen

from bs4 import BeautifulSoup as BS

# Timestamp of this run; also used further down to name the CSV export.
currenttime = datetime.now()
print("As of:", currenttime)  # naive local time of the host; may be in UTC

# Download the page. The context manager closes the HTTP response even if
# the read fails, and the timeout keeps a stalled server from hanging the run.
with urlopen("https://www.city.bunkyo.lg.jp/kyoiku/kosodate/okosan/nicchu/ninka/coronaoshirase.html", timeout=30) as response:
    src = response.read()

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# is installed, so the resulting tree can differ between environments.
soup = BS(src, "html.parser")

# Every <table> element on the page.
ts = soup.find_all("table")
def parse_table(t):
    """Return a ``{header text: cell text}`` dict for one table element.

    Each ``<th>`` found in *t* is paired with the first ``<td>`` that follows
    it (via ``find_next``); both texts are stripped of surrounding whitespace.
    A header with no following ``<td>`` is skipped instead of failing with
    ``AttributeError`` on ``None``. If two headers share the same text, the
    later one wins.
    """
    data = {}
    for th in t.find_all("th"):
        td = th.find_next("td")
        if td is None:
            # No cell after this header; nothing to record for it.
            continue
        data[th.text.strip()] = td.text.strip()
    return data
# One {header: cell} record per table found on the page.
x = list(map(parse_table, ts))
x[:5]  # notebook preview of the first few records

# Verify that every record carries exactly the expected field names.
expected = ['いつ', 'どこで', 'だれが', '内容']
for a in x:
    keys = list(a)
    same_count = len(keys) == len(expected)
    assert same_count, a
    same_names = sorted(keys) == sorted(expected)
    assert same_names, a
import re
import pandas as pd
# Reiwa-era date such as "令和3年1月5日"; "元" is the customary way to write year 1.
_REIWA_DATE = re.compile(r"令和(元|\d+)年(\d+)月(\d+)日")


def clean_item(item):
    """Normalise one scraped record into a flat dict with English keys.

    Parameters:
        item: mapping with the keys ``いつ``, ``どこで``, ``だれが`` and ``内容``.

    Returns:
        A dict with ``date`` (``YYYY-MM-DD`` string converted from the first
        Reiwa date in ``いつ``), ``place``, ``patient``, ``patient_worker``
        (whether ``だれが`` mentions 職員), ``patient_child`` (whether it
        mentions 園児) and ``detail``.

    Raises:
        ValueError: if ``いつ`` contains no Reiwa date. An explicit raise is
            used so the check is not stripped when Python runs with ``-O``.
        KeyError: if one of the expected keys is missing.
    """
    # Only the first date is used; a second date, if any, is ignored.
    m = _REIWA_DATE.search(item["いつ"])
    if m is None:
        raise ValueError("no Reiwa date found in record: %r" % (item,))
    era_year = 1 if m.group(1) == "元" else int(m.group(1))
    # Reiwa 1 is 2019, hence the 2018 offset.
    date = "%04d-%02d-%02d" % (era_year + 2018, int(m.group(2)), int(m.group(3)))
    place = item["どこで"]
    patient = item["だれが"]
    detail = item["内容"]
    return {
        "date": date,
        "place": place,
        "patient": patient,
        "patient_worker": "職員" in patient,
        "patient_child": "園児" in patient,
        "detail": detail,
    }
# Build the cleaned table: one row per record, then turn the ISO date
# strings into real datetime values.
records = [clean_item(a) for a in x]
y = pd.DataFrame(records)
parsed_dates = pd.to_datetime(y.date, format="%Y-%m-%d")
y.date = parsed_dates
y  # notebook preview of the resulting frame
# (Residue of the notebook's rendered DataFrame output: CSS, not Python.)
# .dataframe tbody tr th {
#     vertical-align: top;
# }
# .dataframe thead th {
#     text-align: right;
# }
import matplotlib.pyplot as plt

# Two stacked histograms of case dates (workers on top, children below)
# sharing both axes so the panels are directly comparable.
fig, ax = plt.subplots(2, 1, figsize=(12, 6), sharex=True, sharey=True)
# Roughly one bin per 14 days. Clamp to at least one bin: a date span shorter
# than two weeks would otherwise give bins=0, which hist() rejects.
bins = max(1, (y.date.max() - y.date.min()).days // 14)
y.date[y.patient_worker].hist(ax=ax[0], bins=bins, edgecolor="grey")
ax[0].set_title("Workers")
y.date[y.patient_child].hist(ax=ax[1], bins=bins, edgecolor="grey")
ax[1].set_title("Children")
fig.tight_layout()
None  # notebook idiom: keeps the cell from echoing the last expression

# Export the cleaned table; the file name carries the run date (YYYYMMDD).
y.to_csv("coronavirus-cases-in-bunkyoku-daycares_{}.csv".format(currenttime.strftime("%Y%m%d")), index=False)