Analysis of COVID-19 Infections and Death Data (Brazil)
Project creation date:	May 6, 2020
Dataset from Ministry of Health:	https://covid.saude.gov.br
Project author:	Ricardo Szczerbacki (ricardo@rj1.info)
Project on Github:	https://github.com/ricardocopa/Covid19
License:	MIT License

Project Objectives¶

This project performs an Exploratory Data Analysis (and some simple predictions) on the dataset made available by the Brazilian Ministry of Health, which contains daily numbers of infections and deaths for the Brazilian states. The idea was to gain more insight into the COVID-19 situation in Brazil and in Rio de Janeiro, where the author lives.

Simple plots for comparison with other countries were also made, using data available from Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv)

This website was daily updated until March 29, 2022.

import math
import numpy as np
import pandas as pd
from IPython.display import HTML, Markdown
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
#from  zipfile import ZipFile
#from  rarfile import RarFile
from pyunpack import Archive
import glob
import os
import sys
import time
import subprocess

import locale
locale.setlocale(locale.LC_ALL, 'en_us.utf-8')

import warnings
warnings.filterwarnings('ignore')

    # Emit an HTML fragment that loads the Jupyter notebook stylesheet and defines a
    # code_toggle() script. The script runs once when the document is ready (hiding the
    # 'div.input' elements) and again whenever the reader clicks the link below.
    display(HTML('''
    <link rel="stylesheet" href="https://cdn.jupyter.org/notebook/5.1.0/style/style.min.css">
    <script>
        code_show=true; 
        
        function code_toggle() {
            if (code_show){
                $('div.input').hide();
            } else {
                $('div.input').show();
            }
            code_show = !code_show
        } 
        $( document ).ready(code_toggle);
    </script>
    
    Python source code is hidden by default for better visualization.
    To toggle source code visualization click <a href="javascript:code_toggle()">here</a>.'''))

Dataset¶

File HIST_PAINEL_COVIDBR.zip or HIST_PAINEL_COVIDBR.rar (a compressed CSV file) downloaded from the Brazilian Ministry of Health

# Locate the compressed dataset (HIST_PAINEL_COVIDBR.zip or .rar) in the working directory.
# glob returns matches in arbitrary order, so they are sorted to make the chosen archive
# deterministic when more than one file matches.
files = sorted(glob.glob("HIST_PAINEL_COVIDBR.*"))
if not files:
    # Fail with an explicit message instead of an opaque IndexError on files[0].
    raise FileNotFoundError("No HIST_PAINEL_COVIDBR.* archive found in the current directory")
datasetFileName = files[0]

# Publish a link to the exact archive this page was generated from.
html_code = '''
You can download the dataset file originally used by clicking <a href="http://covid.rj1.info/{}">here</a>
'''.format(datasetFileName)

display(HTML(html_code))

File contents (first 5 rows):

# Historical note on how the dataset was read in earlier versions of this page:
#   * until May 10, 2020, a single CSV file:
#       covid = pd.read_csv('arquivo_geral.csv', delimiter = ';')
#   * until October (or until the Excel files exceeded the maximum row limit):
#       covidFull = pd.read_excel(datasetFileName)
#   * a single CSV file read directly:
#       covidFull = pd.read_csv(datasetFileName, parse_dates = True, delimiter = ';')
# The code below instead unpacks the archive and concatenates every file found inside it.

# Make sure the extraction directory exists so a first run on a clean checkout works.
os.makedirs('extractedFiles', exist_ok=True)

# Remove whatever a previous run left behind, so stale files are not concatenated again.
for f in glob.glob('extractedFiles/*'):
    os.remove(f)

Archive(datasetFileName).extractall('extractedFiles/')

# Read the extracted files in name order: glob order is arbitrary, and later cells compute
# rolling means in row order.
# NOTE(review): presumably the part files are named so that name order is chronological - confirm.
files = sorted(glob.glob("extractedFiles/*"))

dataframes = [pd.read_csv(filename, parse_dates = True, delimiter = ';') for filename in files]

covidFull = pd.concat(dataframes, axis=0, ignore_index=True)


print('Columns: Region, State, City, State Code, City Code, Health Region Code, Health Region Name,  Date, Week, Population 2019, Total Infections, New Infections, Total Deaths, New Deaths, Total Recovered, New Patients')
display(covidFull.head())


def _to_legacy_format(frame):
    """Return a copy of ``frame`` reduced to the seven columns of the old file format.

    The cumulative columns are renamed from 'casosAcumulado' / 'obitosAcumulado' to the
    plural names 'casosAcumulados' / 'obitosAcumulados' used by the rest of the page.
    """
    legacy = frame[['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado']].copy()
    legacy.columns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados']
    return legacy


# converting to the old file format: keep the rows that have a state but no city code
covid = _to_legacy_format(covidFull[covidFull.estado.notnull() & covidFull.codmun.isnull()])

# preparing a dataframe for the Rio de Janeiro city
covidRioDeJaneiro = _to_legacy_format(covidFull.loc[covidFull['municipio'] == 'Rio de Janeiro'])

# preparing a dataframe for the Mangaratiba city
covidMangaratiba = _to_legacy_format(covidFull.loc[covidFull['municipio'] == 'Mangaratiba'])

Columns: Region, State, City, State Code, City Code, Health Region Code, Health Region Name,  Date, Week, Population 2019, Total Infections, New Infections, Total Deaths, New Deaths, Total Recovered, New Patients

def _to_br_date(iso_day):
    """Convert a 'YYYY-MM-DD' date string into the 'DD/MM/YYYY' form shown on the page."""
    return datetime.strptime(iso_day, '%Y-%m-%d').strftime('%d/%m/%Y')


# Period covered by the dataset.
dataDays = covid['data']
firstDateData = _to_br_date(dataDays.min())
lastDayData = _to_br_date(dataDays.max())
numberOfDaysData = dataDays.nunique()

# State/day row with the highest number of new deaths.
covidMaxDeaths = covid.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsState = covidMaxDeaths['estado']
maxDeathsValue = covidMaxDeaths['obitosNovos']
maxDeathsDay = _to_br_date(covidMaxDeaths['data'])

# Same record, restricted to the state of RJ.
covidRJ = covid.loc[covid['estado'] == 'RJ']
covidMaxDeathsRJ = covidRJ.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsRJValue = covidMaxDeathsRJ['obitosNovos']
maxDeathsRJDay = _to_br_date(covidMaxDeathsRJ['data'])

# Same record for the whole country: new deaths of all states summed per day.
covidMaxDeathsBR = (
    covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
    .sort_values('obitosNovos', ascending = False)
    .iloc[0]
)
maxDeathsValueBR = covidMaxDeathsBR['obitosNovos']
maxDeathsDayBR = _to_br_date(covidMaxDeathsBR['data'])

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 20px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Some General Information About the Dataset </h2>
<p style="font-size:18px"> The file have data from <b>{}</b> to <b>{}</b></p>
<p style="font-size:18px"> <b>{}</b> was the state with the greater number of deaths in one day. There were <b>{:n}</b> deaths on <b>{}</b>. </p>
<p style="font-size:18px"> The day with the greater number of deaths in <b>RJ</b> happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> The day with the greater number of deaths all over Brazil happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> <b>PS</b>: All dates in this page are displayed in the format: DD/MM/YYYY</p>
</div>

'''.format(firstDateData, lastDayData, maxDeathsState, maxDeathsValue, maxDeathsDay, maxDeathsRJDay, maxDeathsRJValue, maxDeathsDayBR, maxDeathsValueBR)

display(HTML(html_code))

# Creating normalized columns for infections and deaths by dividing them by the population and multiplying by a million (cases per million people). Source for brazilian population data: IBGE population estimation for 2019
# https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_popula%C3%A7%C3%A3o

data = {'state': ['AC','AL','AP','AM','BA','CE','DF',
                   'ES','GO','MA','MT','MS','MG','PA',
                   'PB','PR','PE','PI','RJ','RN','RS',
                   'RO','RR','SC','SP','SE','TO'],
        'population': [881935, 3337357, 845731, 4144597, 14873064, 9132078, 3015268,
                       4018650, 7018354, 7075181, 3484466, 2778986, 21168791, 8602865,
                       4018127, 11433957, 9557071, 3273227, 17264943, 3506853, 11377239,
                       1777225, 605761, 7164788, 45919049, 2298696, 1572866]
       }

populationByState = pd.DataFrame (data, columns = ['state','population'])

# Attach each state's population to its rows (rows whose state is not in the table are dropped).
covid = covid.merge(populationByState, left_on='estado', right_on='state')

covid['normTotalInfections'] = covid['casosAcumulados']/covid['population'] * 1000000.
covid['normTotalDeaths'] = covid['obitosAcumulados']/covid['population'] * 1000000.

# 'data' still holds 'YYYY-MM-DD' strings at this point, so max() is the most recent day.
lastDay = covid['data'].max()
lastDayFormatted = datetime.strptime(lastDay, '%Y-%m-%d').strftime('%d/%m/%Y')

covidLastDay = covid.loc[covid['data'] == lastDay]

# Sum only the four count columns, once per frame, instead of summing every column
# (string columns included) for each individual figure.
countColumns = ['casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados']

totalsBR = covidLastDay[countColumns].sum()
infectionsBR = totalsBR['casosAcumulados']
deathsBR = totalsBR['obitosAcumulados']
infectionsLastDayBR = totalsBR['casosNovos']
deathsLastDayBR = totalsBR['obitosNovos']

# The mask is built from covidLastDay itself, so it always has the same index as the
# frame being filtered.
covidRJ = covidLastDay.loc[covidLastDay['estado'] == 'RJ']

totalsRJ = covidRJ[countColumns].sum()
infectionsLastDayRJ = totalsRJ['casosNovos']
deathsLastDayRJ = totalsRJ['obitosNovos']
infectionsRJ = totalsRJ['casosAcumulados']
deathsRJ = totalsRJ['obitosAcumulados']

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 20px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Numbers for the Last 24 Hours (last day of the dataset) </h2>
<p style="font-size:18px"> <b>{}</b> is the last day in the dataset.</p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
</div>

'''.format(lastDayFormatted, infectionsLastDayBR, infectionsBR, deathsLastDayBR, deathsBR,
          infectionsLastDayRJ, infectionsRJ, deathsLastDayRJ, deathsRJ)

display(HTML(html_code))

Last Known Status for Some Countries¶

Using the latest information downloaded from Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv), I list below the 3 countries in the worst situation (deaths per million people), the 3 countries in the best situation and 3 other countries of interest (Brazil, France and Israel), showing their position in the world ranking in the first column. Note: only countries with death rates greater than zero are included in the 2 rankings below.

worldCovid = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv', parse_dates = True, delimiter = ',')

# Drop the aggregate rows (continents, blocs, world) so that only countries are ranked.
# Every name needs its own comma: adjacent string literals are concatenated by Python,
# which previously produced 'European UnionInternational' and therefore excluded
# neither 'European Union' nor 'International'.
aggregateLocations = ['Africa', 'Asia', 'Europe', 'European Union',
                      'International', 'North America', 'Oceania',
                      'South America', 'World']
worldCovid = worldCovid.loc[~worldCovid['location'].isin(aggregateLocations)]

# Ignore the last day with data (usually faulty)
lastDateValue = datetime.strptime(worldCovid['date'].max(), '%Y-%m-%d') - timedelta(days=1)
lastDate = lastDateValue.strftime('%Y-%m-%d')

# e.g. 'March 5, 2022': the replace() strips the zero padding of the day number.
formattedLastDate = lastDateValue.strftime('%B %d, %Y').replace(' 0', ' ')
worldCovid = worldCovid.loc[worldCovid['date'] == lastDate]

# Work on an independent copy: the frame is renamed, sorted and re-indexed in place below.
worldCovidTotal = worldCovid[['location', 'total_deaths_per_million']].copy()
worldCovidTotal.columns = ['Country', 'Deaths/MM inhab.']

# Rank from the highest to the lowest cumulative death rate, then keep only the
# countries whose rate is greater than zero (missing rates count as zero).
worldCovidTotal.sort_values('Deaths/MM inhab.', ascending=False, inplace=True)
worldCovidTotal.reset_index(inplace = True)
worldCovidTotal['Ranking'] = worldCovidTotal.index+1
worldCovidTotal = worldCovidTotal.fillna(0)
worldCovidTotal = worldCovidTotal.loc[worldCovidTotal['Deaths/MM inhab.'] > 0]
worldCovidTotal = worldCovidTotal[['Ranking', 'Country', 'Deaths/MM inhab.']]

# Filter only the desired countries (3 worst, 3 best, Brazil, France and Israel).
# head()/tail() also cope with a ranking that has fewer than 3 rows.
countriesToShow = (list(worldCovidTotal['Country'].head(3))
                   + list(worldCovidTotal['Country'].tail(3))
                   + ['Brazil', 'France', 'Israel'])
covidByCountrySelected = worldCovidTotal.loc[worldCovidTotal['Country'].isin(countriesToShow)]


display(Markdown('**Cumulative death rates (registered deaths from the beginning)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))

display(Markdown('*Information updated on ' + formattedLastDate))

# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidTotal = worldCovidTotal[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Brazil']
covidByCountryFrance = worldCovidTotal.loc[worldCovidTotal['Country'] == 'France']
covidByCountryIsrael = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Israel']

axis = worldCovidTotal.plot()
axis.axes.get_xaxis().set_visible(False)
covidByCountryBrazil.plot(ax=axis, linestyle='',marker='o', markersize=12, color='green')
covidByCountryFrance.plot(ax=axis, linestyle='',marker='o', markersize=12, color='red')
covidByCountryIsrael.plot(ax=axis, linestyle='',marker='o', markersize=12, color='blue')
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()

# Current death rate per country, taken from the 'new_deaths_smoothed_per_million' column.
# The copy is taken from the column selection itself (previously a copy was made and then
# immediately replaced by a plain slice), because the frame is modified in place below.
worldCovidCurrent = worldCovid[['location', 'new_deaths_smoothed_per_million']].copy()
worldCovidCurrent.columns = ['Country', 'Deaths/MM inhab.']

# Rank from the highest to the lowest rate, then keep only the countries whose rate is
# greater than zero (missing rates count as zero).
worldCovidCurrent.sort_values('Deaths/MM inhab.', ascending=False, inplace=True)
worldCovidCurrent.reset_index(inplace = True)
worldCovidCurrent['Ranking'] = worldCovidCurrent.index+1
worldCovidCurrent = worldCovidCurrent.fillna(0)
worldCovidCurrent = worldCovidCurrent.loc[worldCovidCurrent['Deaths/MM inhab.'] > 0]
worldCovidCurrent = worldCovidCurrent[['Ranking', 'Country', 'Deaths/MM inhab.']]

# Filter only the desired countries (3 worst, 3 best, Brazil, France and Israel).
# head()/tail() also cope with a ranking that has fewer than 3 rows.
currentCountriesToShow = (list(worldCovidCurrent['Country'].head(3))
                          + list(worldCovidCurrent['Country'].tail(3))
                          + ['Brazil', 'France', 'Israel'])
covidByCountrySelected = worldCovidCurrent.loc[worldCovidCurrent['Country'].isin(currentCountriesToShow)]


display(Markdown('**Current death rates (average from the last 7 days)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))

display(Markdown('*Information updated on ' + formattedLastDate))

# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidCurrent = worldCovidCurrent[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Brazil']
covidByCountryFrance = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'France']
covidByCountryIsrael = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Israel']

axis = worldCovidCurrent.plot()
axis.axes.get_xaxis().set_visible(False)
covidByCountryBrazil.plot(ax=axis, linestyle='',marker='o', markersize=12, color='green')
covidByCountryFrance.plot(ax=axis, linestyle='',marker='o', markersize=12, color='red')
covidByCountryIsrael.plot(ax=axis, linestyle='',marker='o', markersize=12, color='blue')
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()

# Reduce the last-day frame to the per-state totals, ordered from the highest to the
# lowest number of deaths per million inhabitants and indexed by state.
covidLastDay = (
    covidLastDay[['estado','casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
    .sort_values('normTotalDeaths', ascending=False)
    .set_index('estado')
)

# Copy of the same table with English column titles.
covidLastDay2 = covidLastDay.copy()
covidLastDay2.columns = ['Infections', 'Deaths', 'Infections/MM inh.', 'Deaths/MM inh.']

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Infections and Deaths by State </h2>
<p style="font-size:18px"> Sorted by the severity (deaths per million people). States in worse situation are near the top. </p>
<p style="font-size:18px"> The k-means method was used for clustering the states in 3 classes of severity. </p>
</div>
'''
display(HTML(html_code))

from sklearn.cluster import KMeans

# Cluster the states into 3 classes using only their deaths per million inhabitants.
kmeans = KMeans(random_state=42, n_clusters=3)
deathRates = covidLastDay[['normTotalDeaths']]
covidLastDay['myClass'] = kmeans.fit_predict(deathRates)

# The rows were sorted by normTotalDeaths in descending order earlier in the file, so the
# first row carries the label of the worst class and the last row that of the best one.
firstRow = covidLastDay.iloc[0]
lastRow = covidLastDay.iloc[-1]
worstClass = firstRow['myClass']
bestClass = lastRow['myClass']

# Turn the state index back into a regular 'estado' column.
covidLastDay = covidLastDay.reset_index()

# Display copy with English column titles.
covidLastDay2 = covidLastDay.copy()
covidLastDay2.columns = ['State', 'Infections', 'Deaths', 'Infections/MM inhab.', 'Deaths/MM inhab.', 'myClass']

def highlight(s):
    """Return one CSS background declaration per cell of the row ``s``.

    The colour depends on the row's ``myClass`` value compared with the module-level
    ``worstClass`` and ``bestClass``: orange for the worst class, white for the best
    class and yellow for anything else.
    """
    if s.myClass == worstClass:
        css = 'background-color: orange'
    elif s.myClass == bestClass:
        css = 'background-color: white'
    else:
        css = 'background-color: yellow'
    # One identical declaration for every cell of the row.
    return [css] * s.size

def formatNumber(str_value):
    """Format a count for display.

    The argument is converted with ``int()``. Values above one million are shown in
    millions with two decimals and the suffix ' Mi'; all other values are shown with
    thousands separators.
    """
    value = int(str_value)
    return f'{value:,}' if value <= 1000000 else f'{value/1000000:.2f} Mi'

# Per-column formatters: absolute counts go through formatNumber, the per-million
# figures are shown with thousands separators and two decimals.
columnFormats = {
    "Infections": formatNumber,
    "Deaths": formatNumber,
    "Infections/MM inhab.": "{:,.2f}",
    "Deaths/MM inhab.": "{:,.2f}",
}

# Colour every row according to its severity class, then apply the formatters.
styledStates = covidLastDay2.style.apply(highlight, axis=1)
display(styledStates.format(columnFormats))

Comparing Infections and Deaths by State¶

Using absolute values

# One pie per column (total infections, total deaths), with the states as slice labels.
covidLastDayPie1 = covidLastDay.set_index('estado')[['casosAcumulados', 'obitosAcumulados']]
plot = covidLastDayPie1.plot.pie(subplots=True, layout=(1, 2), figsize=(30, 15), fontsize=24, legend=None)

Comparing Infections and Deaths per Million Inhabitants, per State¶

# One pie per column (infections and deaths per million inhabitants), with the states as slice labels.
covidLastDayPie2 = covidLastDay.set_index('estado')[['normTotalInfections', 'normTotalDeaths']]
plot = covidLastDayPie2.plot.pie(subplots=True, layout=(1, 2), figsize=(30, 15), fontsize=24, legend=None)

Plots of cumulative infections and deaths¶

# From here on the 'data' column holds real timestamps instead of 'YYYY-MM-DD' strings.
covid['data'] = pd.to_datetime(covid['data'], format='%Y-%m-%d')

# Country totals: the cumulative counts of all states summed per day, indexed by date.
covidByDayBR = covid.groupby('data')[['casosAcumulados', 'obitosAcumulados']].sum()

# RJ rows indexed by date (the previous row index is kept as an 'index' column).
covidByDayRJ = (
    covid.loc[covid['estado'] == 'RJ', ['data', 'casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
    .reset_index()
    .set_index(['data'])
)

# One figure per (frame, column, title), each without a label on the x axis.
for frame, column, title in (
        (covidByDayBR, 'casosAcumulados', 'Infections in Brazil'),
        (covidByDayBR, 'obitosAcumulados', 'Deaths in Brazil'),
        (covidByDayRJ, 'casosAcumulados', 'Infections in RJ'),
        (covidByDayRJ, 'obitosAcumulados', 'Deaths in RJ')):
    axis = frame.plot(legend=False, style='o-', figsize=(18, 9), y=column, title=title)
    axis.xaxis.get_label().set_visible(False)

Plots of new deaths for Brazil, RJ state and Rio de Janeiro city¶

Rolling mean of the last 7 days for the new deaths by day.

def _daily_new_deaths(frame):
    """Return the 'obitosNovos' column of ``frame`` indexed by date, in chronological order.

    The work is done on an independent copy, so the date conversion never writes into a
    slice of the caller's dataframe, and the rows are sorted by date because the rolling
    means below are computed in row order.
    """
    daily = frame[['data', 'obitosNovos']].copy()
    daily['data'] = pd.to_datetime(daily['data'], format='%Y-%m-%d')
    return daily.set_index('data').sort_index()


# Brazil: new deaths of all states summed per day (groupby already orders by date).
newCovidByDayBR = covid.groupby('data')[['obitosNovos']].sum()
newCovidByDayBR['rollingNewDeaths'] = newCovidByDayBR['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayBR.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Brazil by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# RJ state
newCovidByDayRJ = _daily_new_deaths(covid.loc[covid['estado'] == 'RJ'])
newCovidByDayRJ['rollingNewDeaths'] = newCovidByDayRJ['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayRJ.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in RJ state by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# Rio de Janeiro city
newCovidByDayRio = _daily_new_deaths(covidRioDeJaneiro)
newCovidByDayRio['rollingNewDeaths'] = newCovidByDayRio['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayRio.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Rio de Janeiro City by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)

# Mangaratiba city: absolute daily values, with the days that report a negative number
# of new deaths left out.
newCovidByDayMangaratiba = _daily_new_deaths(covidMangaratiba)
newCovidByDayMangaratiba = newCovidByDayMangaratiba[(newCovidByDayMangaratiba['obitosNovos'] >= 0)]

axis = newCovidByDayMangaratiba.plot(legend=False, style='o-', markersize=5, drawstyle="steps-mid", figsize=(18, 9), y='obitosNovos', title='New deaths in Mangaratiba City by day (absolute values)')
axis.xaxis.get_label().set_visible(False)
plt.grid(axis='y', linestyle='-')

Comparing tendencies of all states in the last 10 days¶

For the log of the deaths data of the last 10 days for all states, I use linear regressions to estimate and compare how the situation is evolving. A higher coefficient means that the situation is getting worse.

# One column per state with its cumulative deaths per million inhabitants, one row per day.
deathsByDayStates = covid.pivot(index='data', columns='estado', values = 'normTotalDeaths')

# Independent copy of the most recent (up to 10) days.
last10Days = deathsByDayStates.tail(10).copy()

axis = last10Days.plot(logy=True, figsize=(15,10), title='Last 10 Days Deaths per MM inhab., Log Scale').legend(bbox_to_anchor=(1.1, 1.01))
plt.gca().xaxis.get_label().set_visible(False)
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
plt.show()

# Regressor: the day number 1..N as a single-feature column vector. N follows the number
# of rows actually available instead of being hard-coded to 10.
X_train = np.arange(1, len(last10Days) + 1).reshape(-1, 1)

from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()

# Evaluating coefficients: for each state, the slope of a straight line fitted to the
# log of its deaths per million. The rows are collected in a list and turned into a
# dataframe once, because DataFrame.append was removed in pandas 2.0.
coefficientRows = []
for col in last10Days.columns:
    regression_model.fit(X_train, np.log(last10Days[col]))
    coefficientRows.append((col, regression_model.coef_[0]))

coef10days = pd.DataFrame(coefficientRows, columns = ['state','coefficient'])

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Predictions for the Near Future </h2>
<p style="font-size:18px"> Using the linear regression over the log data to estimate the number of deaths in 7, 14 e 30 days. </p>
</div>
'''
display(HTML(html_code))

All States¶

from datetime import timedelta

# Calendar dates of the three prediction horizons, counted from the last day of the
# dataset (lastDay is a 'YYYY-MM-DD' string).
lastDayValue = datetime.strptime(lastDay, '%Y-%m-%d')
dateMore7Days = (lastDayValue + timedelta(days=7)).strftime("%d/%m/%Y")
dateMore14Days = (lastDayValue + timedelta(days=14)).strftime("%d/%m/%Y")
dateMore30Days = (lastDayValue + timedelta(days=30)).strftime("%d/%m/%Y")

# Cumulative deaths per state for the most recent (up to 10) days, as an independent copy.
deathsByState = covid.pivot(index='data', columns='estado', values = 'obitosAcumulados')
last10Days = deathsByState.tail(10).copy()

# Training days are numbered 1..N, so a horizon of k days is day number N + k
# (17, 24 and 40 when all 10 days are available).
trainingDays = np.arange(1, len(last10Days) + 1).reshape(-1, 1)
more7Days = len(last10Days) + 7
more14Days = len(last10Days) + 14
more30Days = len(last10Days) + 30

horizonDays = np.array([more7Days, more14Days, more30Days]).reshape(-1, 1)
horizonDates = [dateMore7Days, dateMore14Days, dateMore30Days]

# For each state, fit a straight line to the log of its cumulative deaths and
# extrapolate it to the three horizons. The rows are collected in a list and turned
# into a dataframe once, because DataFrame.append was removed in pandas 2.0.
predictedRows = []
for col in last10Days.columns:
    regression_model.fit(trainingDays, np.log(last10Days[col]))
    for dateLabel, logDeaths in zip(horizonDates, regression_model.predict(horizonDays)):
        predictedRows.append((dateLabel, col, int(np.exp(logDeaths))))

predictedDeaths = pd.DataFrame(predictedRows, columns = ['date','state', 'predictedDeaths'])

# One row per state, one column per horizon, in chronological column order.
predictedDeathsOut = predictedDeaths.pivot(index='state', columns='date', values = 'predictedDeaths')
predictedDeathsOut = predictedDeathsOut[[dateMore7Days, dateMore14Days, dateMore30Days]]

# Country-level table: the per-state predictions summed per horizon, plus the current
# total of deaths as the first column.
predictedDeathsBrazil = predictedDeathsOut.sum().reset_index(name='predictedDeaths')
currentTotalRow = pd.DataFrame([[lastDayFormatted, deathsBR]], columns=predictedDeathsBrazil.columns)
predictedDeathsBrazil = pd.concat([predictedDeathsBrazil, currentTotalRow], ignore_index=True)
predictedDeathsBrazil.set_index('date', inplace=True)
predictedDeathsBrazilOut = predictedDeathsBrazil.transpose()
predictedDeathsBrazilOut = predictedDeathsBrazilOut[[lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]]

# Per-state table: put the current cumulative deaths next to the predictions.
predictedDeathsOut['state'] = predictedDeathsOut.index
covidLastDayOut = covidLastDay[['estado', 'obitosAcumulados']]
covidLastDayOut.index.name = None
# The index must not also be called 'state', otherwise the merge key below is ambiguous.
predictedDeathsOut.index.name = None
predictedDeathsOut = predictedDeathsOut.merge(covidLastDayOut, left_on='state', right_on='estado')
predictedDeathsOut = predictedDeathsOut[['estado', 'obitosAcumulados', dateMore7Days, dateMore14Days, dateMore30Days]]
predictedDeathsOut.columns = ['State', lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]

pd.options.display.float_format = '{0:g}'.format
display(predictedDeathsOut)

Brazil¶

# Plain column titles (the current day first, then the three horizons) and a fresh
# 0-based row index in place of the previous row label.
predictedDeathsBrazilOut.columns = [lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]
predictedDeathsBrazilOut = predictedDeathsBrazilOut.reset_index(drop=True)
display(predictedDeathsBrazilOut)

# Order the states from the highest to the lowest coefficient and index them by state.
coef10days.sort_values('coefficient',ascending=False,inplace=True)
coef10days.set_index('state', inplace=True)

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Clustering the states in 3 classes according to the calculated coefficients </h2>
<p style="font-size:18px">  A higher coefficient means a worse tendency for the deaths. </p>
</div>
'''
display(HTML(html_code))

# Table for display: 'state' as a regular column and a fresh 0-based index.
# reset_index() returns a new frame, so adding 'myClass' below cannot leak into
# coef10days (wrapping it in pd.DataFrame(...) may share data with the original,
# depending on the pandas version).
coef10daysData = coef10days.reset_index()[['state', 'coefficient']]

# Cluster the coefficients into 3 classes. The rows are sorted from the highest to the
# lowest coefficient, so the first row gives the worst class and the last row the best.
coef10daysData['myClass'] = kmeans.fit_predict(coef10daysData[['coefficient']])
worstClass = coef10daysData.iloc[0]['myClass']
bestClass = coef10daysData.iloc[-1]['myClass']

display(coef10daysData.style.apply(highlight, axis=1))

# Min-max scaling of the coefficients to the range [0, 1] for the pie chart.
lowestCoefficient = coef10days['coefficient'].min()
highestCoefficient = coef10days['coefficient'].max()
coef10days['Normalized Coefficient'] = (coef10days['coefficient']-lowestCoefficient)/(highestCoefficient-lowestCoefficient)

_ = coef10days.plot.pie(y='Normalized Coefficient', figsize=(10, 10), legend=False)

R curve¶

Estimating R, that represents the basic reproduction rate of the disease. It roughly estimates how many people are infected by one infected person after 14 days, assuming a constant death rate for the infected.

It is a very simple and imprecise approach. In particular, the values at the beginning of the plot should be disregarded, because the first notifications of infections are neither representative nor reliable.

The plot is more of a tendency indicator, than a real value estimator.

# Generation Time
gt = 14

# New deaths of all states summed per day, smoothed with a 7-day rolling mean.
covidDeathsBR = covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
covidDeathsBR['rollingMean'] =covidDeathsBR['obitosNovos'].rolling(window=7).mean()

# R for a day is its rolling mean divided by the rolling mean gt days earlier.
# shift(gt) lines the earlier value up with each day, which replaces the row-by-row
# loop built on DataFrame.append (removed in pandas 2.0).
earlierMean = covidDeathsBR['rollingMean'].shift(gt)
ratio = covidDeathsBR['rollingMean'] / earlierMean

# Keep only the days whose earlier mean is positive and whose ratio is at most 4;
# comparisons with NaN are False, so days without a full window drop out as well.
validDays = (earlierMean > 0) & (ratio <= 4)
R_Brazil = pd.DataFrame({'date': covidDeathsBR.loc[validDays, 'data'],
                         'value': ratio[validDays]}).reset_index(drop=True)

R_Brazil['date'] = pd.to_datetime(R_Brazil['date'], format='%Y-%m-%d')

R_Brazil.set_index('date', inplace=True)
axis = R_Brazil.plot(style='o-', figsize=(18, 9), legend=False, y='value', title='R for Brazil')
# Reference lines at R = 1, 2 and 3, a red one at the cut-off of 4, and a green band
# between 0 and 1.
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
axis.xaxis.get_label().set_visible(False)
plt.show()

# Same ratio as for the whole country, computed separately for every state: the 7-day
# rolling mean of new deaths divided by its value gt days earlier. One small frame is
# built per state and all of them are concatenated once, instead of growing a dataframe
# row by row with DataFrame.append (removed in pandas 2.0).
stateFrames = []

for state in covid.estado.unique():
    covidState = covid.loc[covid['estado'] == state]

    covidDeathsState = covidState.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
    covidDeathsState['rollingMean'] =covidDeathsState['obitosNovos'].rolling(window=7).mean()

    earlierMean = covidDeathsState['rollingMean'].shift(gt)
    # Only the days whose earlier mean is positive yield a value (NaN > 0 is False).
    validDays = earlierMean > 0
    stateFrames.append(pd.DataFrame({
        'date': covidDeathsState.loc[validDays, 'data'],
        'state': state,
        'value': (covidDeathsState['rollingMean'] / earlierMean)[validDays],
    }))

if stateFrames:
    R_States = pd.concat(stateFrames, ignore_index=True)
else:
    # No state at all: keep the expected columns so the code below still finds them.
    R_States = pd.DataFrame(columns = ['date','state', 'value'])


R_States['date'] = pd.to_datetime(R_States['date'], format='%Y-%m-%d')

# Only RJ and SP are plotted, one line per state.
R_StatesGroup = R_States.loc[R_States['state'].isin(['RJ', 'SP'])]
R_StatesGroup = R_StatesGroup.pivot(index='date', columns='state', values = 'value')

axis = R_StatesGroup.plot(style='o-', figsize=(18, 9), title='R for RJ and SP')
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
# Values above 4 are cut off by the axis limit rather than filtered out of the data.
plt.ylim((0,4))
axis.xaxis.get_label().set_visible(False)
_ = plt.show()

	regiao	estado	municipio	coduf	codmun	codRegiaoSaude	nomeRegiaoSaude	data	semanaEpi	populacaoTCU2019	casosAcumulado	casosNovos	Recuperadosnovos	emAcompanhamentoNovos	interior/metropolitana
0	Brasil	NaN	NaN	76	NaN	NaN	NaN	2020-02-25	9	210147125.0	0.0	0	0.0	0.0	NaN
1	Brasil	NaN	NaN	76	NaN	NaN	NaN	2020-02-26	9	210147125.0	1.0	1	1.0	0.0	NaN
2	Brasil	NaN	NaN	76	NaN	NaN	NaN	2020-02-27	9	210147125.0	1.0	0	1.0	0.0	NaN
3	Brasil	NaN	NaN	76	NaN	NaN	NaN	2020-02-28	9	210147125.0	1.0	0	0.0	1.0	NaN
4	Brasil	NaN	NaN	76	NaN	NaN	NaN	2020-02-29	9	210147125.0	2.0	1	1.0	1.0	NaN

Ranking	Country	Deaths/MM inhab.
1	Peru	6358.865
2	Bulgaria	5283.286
3	Bosnia and Herzegovina	4811.459
15	Brazil	3080.277
42	France	2102.296
82	Israel	1128.511
210	China	3.211
211	Vanuatu	3.180
212	Burundi	3.101

Ranking	Country	Deaths/MM inhab.
1	Hong Kong	25.648
2	Liechtenstein	7.469
3	South Korea	6.763
43	France	1.566
55	Brazil	1.111
63	Israel	0.830
125	Nepal	0.005
126	Ethiopia	0.004
127	Bangladesh	0.001

	State	Infections	Deaths	Infections/MM inhab.	Deaths/MM inhab.	myClass
0	RJ	2.08 Mi	72,756	120,638.68	4,214.09	0
1	MT	719,859	14,587	206,590.91	4,186.29	0
2	RO	392,811	7,176	221,024.91	4,037.76	0
3	DF	691,980	11,579	229,492.04	3,840.12	0
4	MS	523,860	10,493	188,507.61	3,775.84	0
5	PR	2.41 Mi	42,912	210,851.94	3,753.03	0
6	GO	1.27 Mi	26,221	181,390.82	3,736.06	0
7	SP	5.24 Mi	167,211	114,145.53	3,641.43	0
8	ES	1.04 Mi	14,333	258,399.21	3,566.62	0
9	RR	155,076	2,144	256,001.95	3,539.35	0
10	RS	2.27 Mi	39,015	199,298.27	3,429.22	0
11	AM	581,177	14,153	140,225.21	3,414.81	0
12	SC	1.67 Mi	21,656	233,524.98	3,022.56	1
13	CE	1.24 Mi	26,725	135,797.46	2,926.50	1
14	MG	3.32 Mi	60,770	156,861.11	2,870.74	1
15	SE	325,611	6,313	141,650.31	2,746.34	1
16	TO	302,606	4,142	192,391.47	2,633.41	1
17	PB	595,347	10,191	148,165.30	2,536.26	1
18	AP	160,336	2,124	189,582.74	2,511.44	1
19	PI	367,585	7,726	112,300.49	2,360.36	2
20	RN	495,822	8,120	141,386.59	2,315.47	2
21	AC	123,808	1,992	140,382.23	2,258.67	2
22	PE	894,462	21,386	93,591.65	2,237.71	2
23	PA	752,167	18,081	87,432.15	2,101.74	2
24	AL	296,066	6,876	88,712.71	2,060.31	2
25	BA	1.53 Mi	29,688	102,934.20	1,996.09	2
26	MA	424,956	10,871	60,062.92	1,536.50	2

	State	29/03/2022	05/04/2022	12/04/2022	28/04/2022
0	AC	1992	1994	1996	2001
1	AL	6876	6900	6923	6976
2	AM	14153	14157	14162	14173
3	AP	2124	2126	2129	2135
4	BA	29688	29758	29830	29993
5	CE	26725	26763	26808	26909
6	DF	11579	11599	11619	11664
7	ES	14333	14355	14380	14436
8	GO	26221	26369	26492	26774
9	MA	10871	10882	10893	10917
10	MG	60770	60995	61187	61628
11	MS	10493	10510	10530	10576
12	MT	14587	14598	14612	14644
13	PA	18081	18128	18169	18263
14	PB	10191	10207	10222	10256
15	PE	21386	21469	21550	21738
16	PI	7726	7731	7738	7755
17	PR	42912	43010	43100	43305
18	RJ	72756	72948	73155	73632
19	RN	8120	8125	8131	8143
20	RO	7176	7210	7239	7307
21	RR	2144	2144	2144	2144
22	RS	39015	39121	39225	39465
23	SC	21656	21685	21713	21776
24	SE	6313	6324	6336	6363
25	SP	167211	167706	168170	169235
26	TO	4142	4144	4145	4149

	state	coefficient	myClass
0	GO	0.000661496	2
1	RO	0.000582403	2
2	PE	0.000540853	2
3	AL	0.000474341	2
4	MG	0.00044892	2
5	RJ	0.000405715	0
6	SP	0.000394732	0
7	RS	0.000380636	0
8	BA	0.000341299	0
9	PA	0.000323197	0
10	PR	0.000296688	0
11	SE	0.000270108	0
12	MS	0.000269524	0
13	ES	0.000243866	0
14	DF	0.000239991	0
15	CE	0.000236363	0
16	PB	0.00020892	1
17	AP	0.000182789	1
18	SC	0.000182342	1
19	AC	0.0001522	1
20	MA	0.000140015	1
21	MT	0.000138472	1
22	PI	0.000137394	1
23	RN	9.48276e-05	1
24	TO	5.8541e-05	1
25	AM	4.92568e-05	1
26	RR	0	1