Analysis of COVID-19 Infections and Death Data (Brazil) | |
Project creation date: | May 6, 2020 | Dataset from Ministry of Health: | https://covid.saude.gov.br |
Project author: | Ricardo Szczerbacki (ricardo@rj1.info) |
Project on Github: | https://github.com/ricardocopa/Covid19 |
License: | MIT License |
This project performs an Exploratory Data Analysis (and some simple predictions) of the dataset made available by the Brazilian Ministry of Health, which contains daily numbers of infections and deaths for Brazilian states. The idea was to gain more insight into the COVID-19 status in Brazil and Rio de Janeiro, where the author lives.
Simple plots for comparison with other countries were also made, using data available from Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv).
This website was updated daily until March 29, 2022.
import math
import numpy as np
import pandas as pd
from IPython.display import HTML, Markdown
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
#from zipfile import ZipFile
#from rarfile import RarFile
from pyunpack import Archive
import glob
import os
import sys
import time
import subprocess
import locale
locale.setlocale(locale.LC_ALL, 'en_us.utf-8')
import warnings
warnings.filterwarnings('ignore')
# Page-level HTML/JS: loads the Jupyter stylesheet and defines code_toggle(),
# which is run once when the document is ready (hiding the 'div.input' cells)
# and again each time the reader clicks the link.
codeToggleHtml = '''
<link rel="stylesheet" href="https://cdn.jupyter.org/notebook/5.1.0/style/style.min.css">
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
Python source code is hidden by default for better visualization.
To toggle source code visualization click <a href="javascript:code_toggle()">here</a>.'''
display(HTML(codeToggleHtml))
File HIST_PAINEL_COVIDBR.zip or HIST_PAINEL_COVIDBR.rar (a compressed CSV file) downloaded from the Brazilian Ministry of Health
# Locate the compressed dataset (HIST_PAINEL_COVIDBR.zip or .rar) in the working directory.
# glob returns matches in arbitrary order, so sort them to make the choice
# deterministic when more than one archive is present.
files = sorted(glob.glob("HIST_PAINEL_COVIDBR.*"))
if not files:
    # Fail with an explicit message instead of an opaque IndexError on files[0].
    raise FileNotFoundError('No dataset archive matching "HIST_PAINEL_COVIDBR.*" was found in the working directory')
datasetFileName = files[0]

# Link to the copy of the dataset file published alongside the page.
html_code = '''
You can download the dataset file originally used by clicking <a href="http://covid.rj1.info/{}">here</a>
'''.format(datasetFileName)
display(HTML(html_code))
File contents (first 5 rows):
# History of how the dataset was made available:
#   - until May 10, 2020, as a single CSV file:
#       covid = pd.read_csv('arquivo_geral.csv', delimiter = ';')
#   - until October (or until the Excel files exceeded the maximum row limit):
#       covidFull = pd.read_excel(datasetFileName)
#       covidFull = pd.read_csv(datasetFileName, parse_dates = True, delimiter = ';')
# The code below extracts every file from the compressed archive and
# concatenates them into a single dataframe.

# The extraction target must exist before extractall() is called
# (it is not created automatically with the default arguments).
os.makedirs('extractedFiles/', exist_ok=True)

# Remove the files left by a previous run so stale data is never loaded.
for f in glob.glob('extractedFiles/*'):
    os.remove(f)
Archive(datasetFileName).extractall('extractedFiles/')

# glob returns files in arbitrary order; sorting makes the row order of the
# concatenated dataframe deterministic (it follows the file names).
files = sorted(glob.glob("extractedFiles/*"))
dataframes = [pd.read_csv(filename, parse_dates = True, delimiter = ';') for filename in files]
covidFull = pd.concat(dataframes, axis=0, ignore_index=True)

print('Columns: Region, State, City, State Code, City Code, Health Region Code, Health Region Name, Date, Week, Population 2019, Total Infections, New Infections, Total Deaths, New Deaths, Total Recovered, New Patients')
display(covidFull.head())
# Columns taken from the current file layout, and the names the same columns
# had in the old file format (only the two cumulative columns differ).
sourceColumns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado']
oldFormatColumns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados']

# Converting to the old file format: keep the rows that have a state but no
# city code (codmun), i.e. one row per state and date.
isStateRow = covidFull.estado.notnull() & covidFull.codmun.isnull()
covid = covidFull[isStateRow][sourceColumns]
covid.columns = list(oldFormatColumns)

# Same layout, restricted to the city of Rio de Janeiro.
covidRioDeJaneiro = covidFull.loc[covidFull['municipio'] == 'Rio de Janeiro', sourceColumns]
covidRioDeJaneiro.columns = list(oldFormatColumns)

# Same layout, restricted to the city of Mangaratiba.
covidMangaratiba = covidFull.loc[covidFull['municipio'] == 'Mangaratiba', sourceColumns]
covidMangaratiba.columns = list(oldFormatColumns)
def _toDayMonthYear(isoDate):
    """Convert a 'YYYY-MM-DD' string into the 'DD/MM/YYYY' format used on the page."""
    return datetime.strptime(isoDate, '%Y-%m-%d').strftime('%d/%m/%Y')


# Date range covered by the state-level data.
firstDateData = _toDayMonthYear(covid['data'].min())
lastDayData = _toDayMonthYear(covid['data'].max())
numberOfDaysData = covid['data'].nunique()

# State/day row with the highest number of new deaths.
covidMaxDeaths = covid.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsValue = covidMaxDeaths['obitosNovos']
maxDeathsState = covidMaxDeaths['estado']
maxDeathsDay = _toDayMonthYear(covidMaxDeaths['data'])

# Same, restricted to the RJ state.
covidRJ = covid.loc[covid['estado'] == 'RJ']
covidMaxDeathsRJ = covidRJ.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsRJValue = covidMaxDeathsRJ['obitosNovos']
maxDeathsRJDay = _toDayMonthYear(covidMaxDeathsRJ['data'])

# Same, for the whole country (new deaths of all states summed per day).
covidMaxDeathsBR = covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
covidMaxDeathsBR = covidMaxDeathsBR.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsValueBR = covidMaxDeathsBR['obitosNovos']
maxDeathsDayBR = _toDayMonthYear(covidMaxDeathsBR['data'])

# Values in the order in which the placeholders appear in the template below.
generalInfoValues = (
    firstDateData, lastDayData,
    maxDeathsState, maxDeathsValue, maxDeathsDay,
    maxDeathsRJDay, maxDeathsRJValue,
    maxDeathsDayBR, maxDeathsValueBR,
)
html_code = '''
<div style="
background-color:LightGoldenRodYellow;
border-style: solid;
padding-top: 10px;
padding-right: 10px;
padding-bottom: 20px;
padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png">
<h2> Some General Information About the Dataset </h2>
<p style="font-size:18px"> The file have data from <b>{}</b> to <b>{}</b></p>
<p style="font-size:18px"> <b>{}</b> was the state with the greater number of deaths in one day. There were <b>{:n}</b> deaths on <b>{}</b>. </p>
<p style="font-size:18px"> The day with the greater number of deaths in <b>RJ</b> happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> The day with the greater number of deaths all over Brazil happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> <b>PS</b>: All dates in this page are displayed in the format: DD/MM/YYYY</p>
</div>
'''.format(*generalInfoValues)
display(HTML(html_code))
# Normalized columns: cumulative infections and deaths per million inhabitants
# (value / state population * 1,000,000).
# Source for the Brazilian population data: IBGE population estimate for 2019
# https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_popula%C3%A7%C3%A3o
data = {'state': ['AC','AL','AP','AM','BA','CE','DF',
                  'ES','GO','MA','MT','MS','MG','PA',
                  'PB','PR','PE','PI','RJ','RN','RS',
                  'RO','RR','SC','SP','SE','TO'],
        'population': [881935, 3337357, 845731, 4144597, 14873064, 9132078, 3015268,
                       4018650, 7018354, 7075181, 3484466, 2778986, 21168791, 8602865,
                       4018127, 11433957, 9557071, 3273227, 17264943, 3506853, 11377239,
                       1777225, 605761, 7164788, 45919049, 2298696, 1572866]
        }
populationByState = pd.DataFrame (data, columns = ['state','population'])
covid = covid.merge(populationByState, left_on='estado', right_on='state')
covid['normTotalInfections'] = covid['casosAcumulados']/covid['population'] * 1000000.
covid['normTotalDeaths'] = covid['obitosAcumulados']/covid['population'] * 1000000.

# Rows of the last day available in the dataset (one per state).
lastDay = covid['data'].max()
lastDayFormatted = datetime.strptime(lastDay, '%Y-%m-%d').strftime('%d/%m/%Y')
covidLastDay = covid.loc[covid['data'] == lastDay]

# Country totals: sum only the numeric column that is needed, instead of
# summing every column of the frame (including the text ones) and picking one.
infectionsBR = covidLastDay['casosAcumulados'].sum()
deathsBR = covidLastDay['obitosAcumulados'].sum()
infectionsLastDayBR = covidLastDay['casosNovos'].sum()
deathsLastDayBR = covidLastDay['obitosNovos'].sum()

# RJ totals: the mask is built from covidLastDay itself, so it has exactly the
# same index as the frame being filtered.
covidRJ = covidLastDay.loc[covidLastDay['estado'] == 'RJ']
infectionsLastDayRJ = covidRJ['casosNovos'].sum()
deathsLastDayRJ = covidRJ['obitosNovos'].sum()
infectionsRJ = covidRJ['casosAcumulados'].sum()
deathsRJ = covidRJ['obitosAcumulados'].sum()

html_code = '''
<div style="
background-color:LightGoldenRodYellow;
border-style: solid;
padding-top: 10px;
padding-right: 10px;
padding-bottom: 20px;
padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png">
<h2> Numbers for the Last 24 Hours (last day of the dataset) </h2>
<p style="font-size:18px"> <b>{}</b> is the last day in the dataset.</p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
</div>
'''.format(lastDayFormatted, infectionsLastDayBR, infectionsBR, deathsLastDayBR, deathsBR,
           infectionsLastDayRJ, infectionsRJ, deathsLastDayRJ, deathsRJ)
display(HTML(html_code))
Using the latest information downloaded from Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv), I list below the 3 countries in the worst situation (deaths per million people), the 3 countries in the best situation and 3 other countries of interest (Brazil, France and Israel), showing their position in the world ranking in the first column. Note: only countries with death rates greater than zero are included in the 2 rankings below.
worldCovid = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv', parse_dates = True, delimiter = ',')

# Drop the aggregate rows so that only individual locations are ranked.
# Each name must be its own list element: without the comma after
# 'European Union' Python concatenates it with 'International' into a single
# string and neither aggregate is filtered out.
aggregateLocations = ['Africa', 'Asia', 'Europe', 'European Union',
                      'International', 'North America', 'Oceania',
                      'South America', 'World']
worldCovid = worldCovid.loc[~worldCovid['location'].isin(aggregateLocations)]

lastDate = worldCovid['date'].max()
# Ignore the last day with data (usually faulty)
lastDate = (datetime.strptime(lastDate, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
formattedLastDate = datetime.strptime(lastDate, '%Y-%m-%d').strftime('%B %d, %Y').replace(' 0', ' ')
worldCovid = worldCovid.loc[worldCovid['date'] == lastDate]

# Ranking by cumulative deaths per million. An explicit copy is taken so the
# column renaming and sorting never operate on a slice of worldCovid.
worldCovidTotal = worldCovid[['location', 'total_deaths_per_million']].copy()
worldCovidTotal.columns = ['Country', 'Deaths/MM inhab.']
worldCovidTotal = worldCovidTotal.sort_values('Deaths/MM inhab.', ascending=False).reset_index()
# After the sort + reset_index the new index is the position in the ranking.
worldCovidTotal['Ranking'] = worldCovidTotal.index+1
worldCovidTotal = worldCovidTotal.fillna(0)
worldCovidTotal = worldCovidTotal.loc[worldCovidTotal['Deaths/MM inhab.'] > 0]
worldCovidTotal = worldCovidTotal[['Ranking', 'Country', 'Deaths/MM inhab.']]

# Get the 3 best and the 3 worst status countries
worst1StatusCountry = worldCovidTotal.iloc[0]['Country']
worst2StatusCountry = worldCovidTotal.iloc[1]['Country']
worst3StatusCountry = worldCovidTotal.iloc[2]['Country']
best1StatusCountry = worldCovidTotal.iloc[-1]['Country']
best2StatusCountry = worldCovidTotal.iloc[-2]['Country']
best3StatusCountry = worldCovidTotal.iloc[-3]['Country']

# Filter only the desired countries (3 best, 3 worst, Brazil, France and Israel)
covidByCountrySelected = worldCovidTotal.loc[worldCovidTotal['Country'].isin([worst1StatusCountry,
                                                                              worst2StatusCountry,
                                                                              worst3StatusCountry,
                                                                              best1StatusCountry,
                                                                              best2StatusCountry,
                                                                              best3StatusCountry,
                                                                              'Brazil', 'France', 'Israel'])]
display(Markdown('**Cumulative death rates (registered deaths from the beginning)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))
display(Markdown('*Information updated on ' + formattedLastDate))

# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidTotal = worldCovidTotal[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Brazil']
covidByCountryFrance = worldCovidTotal.loc[worldCovidTotal['Country'] == 'France']
covidByCountryIsrael = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Israel']
axis = worldCovidTotal.plot()
axis.axes.get_xaxis().set_visible(False)
covidByCountryBrazil.plot(ax=axis, linestyle='',marker='o', markersize=12, color='green')
covidByCountryFrance.plot(ax=axis, linestyle='',marker='o', markersize=12, color='red')
covidByCountryIsrael.plot(ax=axis, linestyle='',marker='o', markersize=12, color='blue')
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()
# Ranking by the smoothed new-deaths-per-million column: rename, sort from the
# highest to the lowest value and renumber the rows.
worldCovidCurrent = (
    worldCovid[['location', 'new_deaths_smoothed_per_million']]
    .rename(columns={'location': 'Country',
                     'new_deaths_smoothed_per_million': 'Deaths/MM inhab.'})
    .sort_values('Deaths/MM inhab.', ascending=False)
    .reset_index()
)
# The fresh index is the position in the sorted list.
worldCovidCurrent['Ranking'] = worldCovidCurrent.index + 1
worldCovidCurrent = worldCovidCurrent.fillna(0)
hasDeaths = worldCovidCurrent['Deaths/MM inhab.'] > 0
worldCovidCurrent = worldCovidCurrent.loc[hasDeaths][['Ranking', 'Country', 'Deaths/MM inhab.']]

# Get the 3 worst (top of the ranking) and the 3 best (bottom) status countries
worst1StatusCountry = worldCovidCurrent.iloc[0]['Country']
worst2StatusCountry = worldCovidCurrent.iloc[1]['Country']
worst3StatusCountry = worldCovidCurrent.iloc[2]['Country']
best1StatusCountry = worldCovidCurrent.iloc[-1]['Country']
best2StatusCountry = worldCovidCurrent.iloc[-2]['Country']
best3StatusCountry = worldCovidCurrent.iloc[-3]['Country']

# Filter only the desired countries (3 best, 3 worst, Brazil, France and Israel)
selectedCountries = [
    worst1StatusCountry, worst2StatusCountry, worst3StatusCountry,
    best1StatusCountry, best2StatusCountry, best3StatusCountry,
    'Brazil', 'France', 'Israel',
]
covidByCountrySelected = worldCovidCurrent.loc[worldCovidCurrent['Country'].isin(selectedCountries)]
display(Markdown('**Current death rates (average from the last 7 days)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))
display(Markdown('*Information updated on ' + formattedLastDate))

# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidCurrent = worldCovidCurrent[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Brazil']
covidByCountryFrance = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'France']
covidByCountryIsrael = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Israel']
axis = worldCovidCurrent.plot()
axis.axes.get_xaxis().set_visible(False)
# One large marker per country of interest, drawn over the ranking curve.
for countryFrame, markerColor in ((covidByCountryBrazil, 'green'),
                                  (covidByCountryFrance, 'red'),
                                  (covidByCountryIsrael, 'blue')):
    countryFrame.plot(ax=axis, linestyle='', marker='o', markersize=12, color=markerColor)
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()
# Last-day table: one row per state (used as the index), sorted from the
# highest to the lowest number of deaths per million inhabitants.
covidLastDay = (
    covidLastDay[['estado','casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
    .sort_values('normTotalDeaths', ascending=False)
    .set_index('estado')
)

# Copy of the table with English column names.
covidLastDay2 = covidLastDay.copy()
covidLastDay2.columns = ['Infections', 'Deaths', 'Infections/MM inh.', 'Deaths/MM inh.']

# Header box for the "by state" section.
html_code = '''
<div style="
background-color:LightGoldenRodYellow;
border-style: solid;
padding-top: 10px;
padding-right: 10px;
padding-bottom: 40px;
padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png">
<h2> Infections and Deaths by State </h2>
<p style="font-size:18px"> Sorted by the severity (deaths per million people). States in worse situation are near the top. </p>
<p style="font-size:18px"> The k-means method was used for clustering the states in 3 classes of severity. </p>
</div>
'''
display(HTML(html_code))
from sklearn.cluster import KMeans

# Cluster the states in 3 classes using only the deaths per million column.
kmeans = KMeans(n_clusters=3, random_state=42)
stateClusterLabels = kmeans.fit_predict(covidLastDay[['normTotalDeaths']])
covidLastDay['myClass'] = stateClusterLabels

# The table is sorted by deaths per million (descending), so the first row
# carries the label of the worst cluster and the last row the best one.
worstClass = covidLastDay.iloc[0]['myClass']
bestClass = covidLastDay.iloc[-1]['myClass']

# Bring the state back as a regular column and build the display copy with
# English column names ('myClass' keeps its name).
covidLastDay = covidLastDay.reset_index()
covidLastDay2 = covidLastDay.rename(columns={
    'estado': 'State',
    'casosAcumulados': 'Infections',
    'obitosAcumulados': 'Deaths',
    'normTotalInfections': 'Infections/MM inhab.',
    'normTotalDeaths': 'Deaths/MM inhab.',
})
def highlight(s):
    """Return one CSS background declaration per cell of the row *s*.

    The row's ``myClass`` value is compared with the module-level
    ``worstClass`` and ``bestClass``: the worst class is painted orange,
    the best class white and any other class yellow.
    """
    if s.myClass == worstClass:
        cellStyle = 'background-color: orange'
    elif s.myClass != bestClass:
        cellStyle = 'background-color: yellow'
    else:
        cellStyle = 'background-color: white'
    # The same style is repeated for every cell of the row.
    return [cellStyle] * s.size
def formatNumber(str_value):
    """Format a count for display.

    The value is converted with ``int()``. Values above one million are shown
    in millions with two decimals and the ``Mi`` suffix; everything else is
    shown with thousands separators.
    """
    number = int(str_value)
    return f'{number:,}' if number <= 1000000 else f'{number/1000000:.2f} Mi'
# Per-column formatters: absolute counts go through formatNumber, the
# per-million columns are shown with two decimals.
stateTableFormats = {
    "Infections": formatNumber,
    "Deaths": formatNumber,
    "Infections/MM inhab.": "{:,.2f}",
    "Deaths/MM inhab.": "{:,.2f}",
}
# Rows are colored by highlight() and then the formatters are applied.
styledStateTable = covidLastDay2.style.apply(highlight, axis=1).format(stateTableFormats)
display(styledStateTable)
Using absolute values
# Options shared by the two pairs of pie charts (one pie per column, side by side).
pieOptions = dict(fontsize=24, subplots=True, layout=(1, 2), figsize=(30, 15), legend=None)

# Absolute values: cumulative infections and deaths by state.
covidLastDayPie1 = covidLastDay[['estado','casosAcumulados', 'obitosAcumulados']].set_index('estado')
plot = covidLastDayPie1.plot.pie(**pieOptions)

# Normalized values: infections and deaths per million inhabitants by state.
covidLastDayPie2 = covidLastDay[['estado','normTotalInfections', 'normTotalDeaths']].set_index('estado')
plot = covidLastDayPie2.plot.pie(**pieOptions)
# From here on the date column holds datetimes instead of 'YYYY-MM-DD' strings.
covid['data'] = pd.to_datetime(covid['data'], format='%Y-%m-%d')

# Country totals per day (cumulative columns of all states summed), indexed by date.
covidByDayBR = (
    covid[['data', 'casosAcumulados', 'obitosAcumulados']]
    .groupby('data').sum()
    .reset_index()
    .set_index(['data'])
)

# RJ rows, indexed by date (the previous row labels stay in an 'index' column).
covidByDayRJ = (
    covid.loc[covid['estado'] == 'RJ'][['data', 'casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
    .reset_index()
    .set_index(['data'])
)

# One figure per (frame, column) pair, all with the same look.
cumulativePlots = (
    (covidByDayBR, 'casosAcumulados', 'Infections in Brazil'),
    (covidByDayBR, 'obitosAcumulados', 'Deaths in Brazil'),
    (covidByDayRJ, 'casosAcumulados', 'Infections in RJ'),
    (covidByDayRJ, 'obitosAcumulados', 'Deaths in RJ'),
)
for plotFrame, plotColumn, plotTitle in cumulativePlots:
    axis = plotFrame.plot(legend=False, style='o-', figsize=(18, 9), y=plotColumn, title=plotTitle)
    # Hide the x axis label (the column name 'data').
    axis.xaxis.get_label().set_visible(False)
Rolling mean of the last 7 days for the new deaths by day.
# Brazil: new deaths of all states summed per day, plus a 7-row rolling mean.
newCovidByDayBR = (
    covid[['data', 'obitosNovos']]
    .groupby('data').sum()
    .reset_index()
    .set_index(['data'])
)
newCovidByDayBR['rollingNewDeaths'] = newCovidByDayBR['obitosNovos'].rolling(window=7).mean()
axis = newCovidByDayBR.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Brazil by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# RJ state: same rolling mean over the state's rows.
newCovidByDayRJ = (
    covid.loc[covid['estado'] == 'RJ'][['data', 'obitosNovos']]
    .reset_index()
    .set_index(['data'])
)
newCovidByDayRJ['rollingNewDeaths'] = newCovidByDayRJ['obitosNovos'].rolling(window=7).mean()
axis = newCovidByDayRJ.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in RJ state by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# Rio de Janeiro city: the city frame still has string dates, so they are
# converted here before being used as the index.
newCovidByDayRio = (
    covidRioDeJaneiro[['data', 'obitosNovos']]
    .assign(data=lambda frame: pd.to_datetime(frame['data'], format='%Y-%m-%d'))
    .reset_index()
    .set_index(['data'])
)
newCovidByDayRio['rollingNewDeaths'] = newCovidByDayRio['obitosNovos'].rolling(window=7).mean()
axis = newCovidByDayRio.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Rio de Janeiro City by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# Mangaratiba city: absolute daily values (no rolling mean); rows with a
# negative number of new deaths are left out.
newCovidByDayMangaratiba = (
    covidMangaratiba[['data', 'obitosNovos']]
    .assign(data=lambda frame: pd.to_datetime(frame['data'], format='%Y-%m-%d'))
    .loc[lambda frame: frame['obitosNovos'] >= 0]
    .reset_index()
    .set_index(['data'])
)
axis = newCovidByDayMangaratiba.plot(legend=False, style='o-', markersize=5, drawstyle="steps-mid", figsize=(18, 9), y='obitosNovos', title='New deaths in Mangaratiba City by day (absolute values)')
axis.xaxis.get_label().set_visible(False)
plt.grid(axis='y', linestyle='-')
For the log of the deaths data of the last 10 days for all states, I use linear regressions to estimate and compare how the situation is evolving. Higher coefficients mean that the situation is getting worse.
# Deaths per million inhabitants: one row per date, one column per state.
deathsByDayStates = covid.pivot(index='data', columns='estado', values = 'normTotalDeaths')
# Explicit copy, so the 'day' column added below is never written into a
# slice of deathsByDayStates.
last10Days = deathsByDayStates.tail(10).copy()
axis = last10Days.plot(logy=True, figsize=(15,10), title='Last 10 Days Deaths per MM inhab., Log Scale').legend(bbox_to_anchor=(1.1, 1.01))
plt.gca().xaxis.get_label().set_visible(False)
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
plt.show()

# Regression input: the days numbered 1..10, as a column vector.
last10Days['day'] = range(1,11)
X_train = last10Days['day'].values.reshape(-1, 1)

from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()

# Evaluating coefficients: for each state, fit a line to the log of the
# deaths per million and keep its slope. The rows are collected in a list and
# the frame is built once; DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
coefficientRows = []
for col in deathsByDayStates.columns:
    logDeaths = np.log(last10Days[col])
    regression_model.fit(X_train, logDeaths)
    coefficientRows.append({'state': col, 'coefficient': regression_model.coef_[0]})
coef10days = pd.DataFrame(coefficientRows, columns = ['state','coefficient'])
# Header box for the predictions section.
html_code = '''
<div style="
background-color:LightGoldenRodYellow;
border-style: solid;
padding-top: 10px;
padding-right: 10px;
padding-bottom: 40px;
padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png">
<h2> Predictions for the Near Future </h2>
<p style="font-size:18px"> Using the linear regression over the log data to estimate the number of deaths in 7, 14 e 30 days. </p>
</div>
'''
display(HTML(html_code))

# The regression is fitted on days numbered 1..10 (X_train), so 7, 14 and 30
# days after the last day correspond to x = 17, 24 and 40.
more7Days = 17
more14Days = 24
more30Days = 40
lastDayDate = datetime.strptime(lastDay, '%Y-%m-%d')
dateMore7Days = (lastDayDate + timedelta(days=7)).strftime("%d/%m/%Y")
dateMore14Days = (lastDayDate + timedelta(days=14)).strftime("%d/%m/%Y")
dateMore30Days = (lastDayDate + timedelta(days=30)).strftime("%d/%m/%Y")
predictionHorizons = [(more7Days, dateMore7Days),
                      (more14Days, dateMore14Days),
                      (more30Days, dateMore30Days)]

# Cumulative deaths: one row per date, one column per state.
deathsByState = covid.pivot(index='data', columns='estado', values = 'obitosAcumulados')
last10Days = deathsByState.tail(10).copy()
last10Days['day'] = range(1,11)

# For each state: fit a line to the log of the last 10 days of cumulative
# deaths and extrapolate it to each horizon. Rows are collected in a list and
# the frame is built once (DataFrame.append was removed in pandas 2.0).
predictionRows = []
for col in deathsByState.columns:
    regression_model.fit(X_train, np.log(last10Days[col]))
    for dayNumber, dateLabel in predictionHorizons:
        prediction = regression_model.predict(np.array([[dayNumber]]))
        predictionRows.append({'date': dateLabel,
                               'state': col,
                               # back from log scale, truncated to an integer
                               'predictedDeaths': int(np.exp(prediction[0]))})
predictedDeaths = pd.DataFrame(predictionRows, columns = ['date','state', 'predictedDeaths'])

# One row per state, one column per predicted date (in chronological order).
predictedDeathsOut = predictedDeaths.pivot(index='state', columns='date', values = 'predictedDeaths')
predictedDeathsOut = predictedDeathsOut[[dateMore7Days, dateMore14Days, dateMore30Days]]

# Country-level table: the state predictions summed per date, plus the actual
# total of the last day of the dataset.
predictedDeathsBrazil = predictedDeathsOut.sum().reset_index(name='predictedDeaths')
lastDayRow = pd.DataFrame([[lastDayFormatted, deathsBR]], columns=predictedDeathsBrazil.columns)
predictedDeathsBrazil = pd.concat([predictedDeathsBrazil, lastDayRow], ignore_index=True)
predictedDeathsBrazil.set_index('date', inplace=True)
predictedDeathsBrazilOut = predictedDeathsBrazil.transpose()
predictedDeathsBrazilOut = predictedDeathsBrazilOut[[lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]]

# State-level table: the actual deaths of the last day next to the predictions.
predictedDeathsOut = predictedDeathsOut.reset_index()
covidLastDayOut = covidLastDay[['estado', 'obitosAcumulados']]
predictedDeathsOut = predictedDeathsOut.merge(covidLastDayOut, left_on='state', right_on='estado')
predictedDeathsOut = predictedDeathsOut[['estado', 'obitosAcumulados', dateMore7Days, dateMore14Days, dateMore30Days]]
predictedDeathsOut.columns = ['State', lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]

# NOTE: this changes how floats are displayed for the rest of the session.
pd.options.display.float_format = '{0:g}'.format
display(predictedDeathsOut)

# Plain column labels and a plain 0..n-1 index for the country table.
predictedDeathsBrazilOut.columns = [lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]
predictedDeathsBrazilOut = predictedDeathsBrazilOut.reset_index(drop=True)
display(predictedDeathsBrazilOut)
# Cluster the states by their regression coefficient and show the result as a
# colored table and as a pie chart.

# Highest coefficient first; the state becomes the index.
coef10days.sort_values('coefficient',ascending=False,inplace=True)
coef10days.set_index('state', inplace=True)
# Header box for this section.
html_code = '''
<div style="
background-color:LightGoldenRodYellow;
border-style: solid;
padding-top: 10px;
padding-right: 10px;
padding-bottom: 40px;
padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png">
<h2> Clustering the states in 3 classes according to the calculated coefficients </h2>
<p style="font-size:18px"> A higher coefficient means a worse tendency for the deaths. </p>
</div>
'''
display(HTML(html_code))
# Build a table with 'state' as a regular column and a fresh 0..n-1 index.
# NOTE(review): pd.DataFrame(coef10days) may share data with coef10days
# instead of copying it - confirm before reordering these statements.
coef10daysData = pd.DataFrame(coef10days)
coef10daysData['state'] = coef10daysData.index
coef10daysData = coef10daysData.reset_index(level=0, drop=True)
coef10daysData = coef10daysData[['state', 'coefficient']]
# Re-fit the k-means model created earlier, now on the coefficients.
coef10daysData['myClass'] = kmeans.fit_predict(coef10daysData[['coefficient']])
# Rows are sorted by coefficient (descending): the first row carries the label
# of the worst cluster and the last row the best one. These globals are read
# by highlight(), so they are rebound here before the table is styled.
worstClass = coef10daysData.iloc[0]['myClass']
bestClass = coef10daysData.iloc[-1]['myClass']
display(coef10daysData.style.apply(highlight, axis=1))
# Min-max scaling of the coefficients to the 0..1 range for the pie chart.
coef10days['Normalized Coefficient'] = (coef10days['coefficient']-coef10days['coefficient'].min())/(coef10days['coefficient'].max()-coef10days['coefficient'].min())
_ = coef10days.plot.pie(y='Normalized Coefficient', figsize=(10, 10), legend=False)
Estimating R, which represents the basic reproduction rate of the disease. It roughly estimates how many people are infected by one infected person after 14 days, assuming a constant death rate for the infected.
It is a very simple and imprecise approach. In particular, the values at the beginning of the plot should be disregarded, because the first notifications of the infections are not representative or reliable.
The plot is more of a trend indicator than a real value estimator.
# Generation Time
gt = 14

# New deaths in the whole country per day, with a 7-row rolling mean.
covidDeathsBR = covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
covidDeathsBR['rollingMean'] = covidDeathsBR['obitosNovos'].rolling(window=7).mean()

# R for a day = rolling mean of that day / rolling mean `gt` days earlier.
# shift() lines each day up with the value of `gt` rows before; the first
# `gt` rows get NaN and are dropped by the mask. This replaces the row-by-row
# DataFrame.append loop (append was removed in pandas 2.0).
previousMean = covidDeathsBR['rollingMean'].shift(gt)
ratio = covidDeathsBR['rollingMean'] / previousMean
# Keep only days with a positive denominator and an R value of at most 4.
validDays = (previousMean > 0) & (ratio <= 4)
R_Brazil = pd.DataFrame({'date': covidDeathsBR.loc[validDays, 'data'],
                         'value': ratio[validDays]}).reset_index(drop=True)
R_Brazil['date'] = pd.to_datetime(R_Brazil['date'], format='%Y-%m-%d')
R_Brazil.set_index('date', inplace=True)

axis = R_Brazil.plot(style='o-', figsize=(18, 9), legend=False, y='value', title='R for Brazil')
# Reference lines at R = 1, 2, 3 (gray) and 4 (red).
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
# Shade the band between 0 and 1.
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
axis.xaxis.get_label().set_visible(False)
plt.show()
# R per state: rolling mean of new deaths of a day divided by the rolling mean
# `gt` days earlier. One frame is built per state and all of them are
# concatenated once, instead of appending row by row (DataFrame.append was
# removed in pandas 2.0).
stateFrames = []
for state in covid.estado.unique():
    covidState = covid.loc[covid['estado'] == state]
    covidDeathsState = covidState.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
    covidDeathsState['rollingMean'] = covidDeathsState['obitosNovos'].rolling(window=7).mean()
    # Value of `gt` rows before; NaN for the first `gt` rows.
    previousMean = covidDeathsState['rollingMean'].shift(gt)
    # Only days with a positive denominator produce a value.
    validDays = previousMean > 0
    stateFrames.append(pd.DataFrame({
        'date': covidDeathsState.loc[validDays, 'data'],
        'state': state,
        'value': covidDeathsState.loc[validDays, 'rollingMean'] / previousMean[validDays],
    }))
if stateFrames:
    R_States = pd.concat(stateFrames, ignore_index=True)
else:
    # No states at all: keep the expected (empty) layout.
    R_States = pd.DataFrame(columns = ['date','state', 'value'])
R_States['date'] = pd.to_datetime(R_States['date'], format='%Y-%m-%d')

# Plot RJ and SP side by side: one column per state, indexed by date.
R_StatesGroup = R_States.loc[R_States['state'].isin(['RJ', 'SP'])]
R_StatesGroup = R_StatesGroup.pivot(index='date', columns='state', values = 'value')
axis = R_StatesGroup.plot(style='o-', figsize=(18, 9), title='R for RJ and SP')
# Reference lines at R = 1, 2, 3 (gray) and 4 (red).
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
# Shade the band between 0 and 1.
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
# Values above 4 are not capped here; they are only cut off by the y limits.
plt.ylim((0,4))
axis.xaxis.get_label().set_visible(False)
_ = plt.show()