Analysis of COVID-19 Infections and Death Data (Brazil)
Project creation date: May 6, 2020
Dataset from Ministry of Health: https://covid.saude.gov.br
Project author: Ricardo Szczerbacki (ricardo@rj1.info)
Project on Github: https://github.com/ricardocopa/Covid19
License: MIT License

Project Objectives

This projet does an Exploratory Data Analysis (and some simple predictions) of the dataset made available by the brazilian Ministry of Health, that contains daily numbers on infections and death for brazilian states. The idea was to have more insights for the COVID-19 status in Brazil and Rio de Janeiro, where the author lives.

Also simple plots for comparison with other countrie were made, using data available in Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv)

This website was daily updated until March 29, 2022.

In [1]:
import math
import numpy as np
import pandas as pd
from IPython.display import HTML, Markdown
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
#from  zipfile import ZipFile
#from  rarfile import RarFile
from pyunpack import Archive
import glob
import os
import sys
import time
import subprocess

import locale
locale.setlocale(locale.LC_ALL, 'en_us.utf-8')

import warnings
warnings.filterwarnings('ignore')
In [2]:
    display(HTML('''
    <link rel="stylesheet" href="https://cdn.jupyter.org/notebook/5.1.0/style/style.min.css">
    <script>
        code_show=true; 
        
        function code_toggle() {
            if (code_show){
                $('div.input').hide();
            } else {
                $('div.input').show();
            }
            code_show = !code_show
        } 
        $( document ).ready(code_toggle);
    </script>
    
    Python source code is hidden by default for better visualization.
    To toggle source code visualization click <a href="javascript:code_toggle()">here</a>.'''))
Python source code is hidden by default for better visualization. To toggle source code visualization click here.

Dataset

File HIST_PAINEL_COVIDBR.zip or HIST_PAINEL_COVIDBR.rar (a compressed CSV file) downloaded from brazilian Ministry of Health

In [3]:
files = glob.glob("HIST_PAINEL_COVIDBR.*")
datasetFileName = files[0]
In [4]:
html_code = '''
You can download the dataset file originally used by clicking <a href="http://covid.rj1.info/{}">here</a>
'''.format(datasetFileName)

display(HTML(html_code))
You can download the dataset file originally used by clicking here

File contents (first 5 rows):

In [5]:
## how files where made available until May 10 2020 
# covid = pd.read_csv('arquivo_geral.csv', delimiter = ';') 

# how files where mande available until october (or until the excel files exceed the max rows limit)
#covidFull = pd.read_excel(datasetFileName)

#covidFull = pd.read_csv(datasetFileName, parse_dates = True, delimiter = ';')

dataframes = []

for f in glob.glob('extractedFiles/*'):
    os.remove(f)

Archive(datasetFileName).extractall('extractedFiles/')
files = glob.glob("extractedFiles/*")

for filename in files:
    df = pd.read_csv(filename, parse_dates = True, delimiter = ';')
    dataframes.append(df)

covidFull = pd.concat(dataframes, axis=0, ignore_index=True)


print('Columns: Region, State, City, State Code, City Code, Health Region Code, Health Region Name,  Date, Week, Population 2019, Total Infections, New Infections, Total Deaths, New Deaths, Total Recovered, New Patients')
display(covidFull.head())

# converting to the old file format
covid = covidFull[covidFull.estado.notnull()]
covid = covid[covid.codmun.isnull()]

covid = covid[['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado']]
covid.columns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados'] 

# preparing a dataframe for the Rio de Janeiro city
covidRioDeJaneiro = covidFull.loc[covidFull['municipio'] == 'Rio de Janeiro']
covidRioDeJaneiro = covidRioDeJaneiro[['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado']]
covidRioDeJaneiro.columns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados'] 

# preparing a dataframe for the Mangaratiba city
covidMangaratiba = covidFull.loc[covidFull['municipio'] == 'Mangaratiba']
covidMangaratiba = covidMangaratiba[['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado']]
covidMangaratiba.columns = ['regiao', 'estado', 'data', 'casosNovos', 'casosAcumulados', 'obitosNovos', 'obitosAcumulados'] 
Columns: Region, State, City, State Code, City Code, Health Region Code, Health Region Name,  Date, Week, Population 2019, Total Infections, New Infections, Total Deaths, New Deaths, Total Recovered, New Patients
regiao estado municipio coduf codmun codRegiaoSaude nomeRegiaoSaude data semanaEpi populacaoTCU2019 casosAcumulado casosNovos obitosAcumulado obitosNovos Recuperadosnovos emAcompanhamentoNovos interior/metropolitana
0 Brasil NaN NaN 76 NaN NaN NaN 2020-02-25 9 210147125.0 0.0 0 0 0 0.0 0.0 NaN
1 Brasil NaN NaN 76 NaN NaN NaN 2020-02-26 9 210147125.0 1.0 1 0 0 1.0 0.0 NaN
2 Brasil NaN NaN 76 NaN NaN NaN 2020-02-27 9 210147125.0 1.0 0 0 0 1.0 0.0 NaN
3 Brasil NaN NaN 76 NaN NaN NaN 2020-02-28 9 210147125.0 1.0 0 0 0 0.0 1.0 NaN
4 Brasil NaN NaN 76 NaN NaN NaN 2020-02-29 9 210147125.0 2.0 1 0 0 1.0 1.0 NaN
In [6]:
firstDateData = datetime.strptime(covid['data'].min(), '%Y-%m-%d').strftime('%d/%m/%Y')  
lastDayData = datetime.strptime(covid['data'].max(), '%Y-%m-%d').strftime('%d/%m/%Y')  
numberOfDaysData = covid['data'].nunique()


covidMaxDeaths = covid.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsValue = covidMaxDeaths['obitosNovos']
maxDeathsState = covidMaxDeaths['estado']
maxDeathsDay = datetime.strptime(covidMaxDeaths['data'], '%Y-%m-%d').strftime('%d/%m/%Y') 

covidRJ = covid.loc[covid['estado'] == 'RJ']
covidMaxDeathsRJ = covidRJ.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsRJValue = covidMaxDeathsRJ['obitosNovos']
maxDeathsRJDay = datetime.strptime(covidMaxDeathsRJ['data'], '%Y-%m-%d').strftime('%d/%m/%Y') 

covidMaxDeathsBR = covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
covidMaxDeathsBR = covidMaxDeathsBR.sort_values('obitosNovos', ascending = False).iloc[0]
maxDeathsValueBR = covidMaxDeathsBR['obitosNovos']
maxDeathsDayBR = datetime.strptime(covidMaxDeathsBR['data'], '%Y-%m-%d').strftime('%d/%m/%Y') 

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 20px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Some General Information About the Dataset </h2>
<p style="font-size:18px"> The file have data from <b>{}</b> to <b>{}</b></p>
<p style="font-size:18px"> <b>{}</b> was the state with the greater number of deaths in one day. There were <b>{:n}</b> deaths on <b>{}</b>. </p>
<p style="font-size:18px"> The day with the greater number of deaths in <b>RJ</b> happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> The day with the greater number of deaths all over Brazil happend on <b>{}</b>, with a total of <b>{:n}</b> deaths registered. </p>
<p style="font-size:18px"> <b>PS</b>: All dates in this page are displayed in the format: DD/MM/YYYY</p>
</div>

'''.format(firstDateData, lastDayData, maxDeathsState, maxDeathsValue, maxDeathsDay, maxDeathsRJDay, maxDeathsRJValue, maxDeathsDayBR, maxDeathsValueBR)

display(HTML(html_code))

Some General Information About the Dataset

The file have data from 25/02/2020 to 29/03/2022

SP was the state with the greater number of deaths in one day. There were 1,389 deaths on 06/04/2021.

The day with the greater number of deaths in RJ happend on 17/04/2021, with a total of 446 deaths registered.

The day with the greater number of deaths all over Brazil happend on 08/04/2021, with a total of 4,249 deaths registered.

PS: All dates in this page are displayed in the format: DD/MM/YYYY

In [7]:
# Creating normalized columns for infections and deaths by dividing it by the population multiplied by a million (cases per million people). Source for brazilian population data: IBGE population estimation for 2019
# https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_popula%C3%A7%C3%A3o

data = {'state': ['AC','AL','AP','AM','BA','CE','DF',
                   'ES','GO','MA','MT','MS','MG','PA',
                   'PB','PR','PE','PI','RJ','RN','RS',
                   'RO','RR','SC','SP','SE','TO'],
        'population': [881935, 3337357, 845731, 4144597, 14873064, 9132078, 3015268, 
                       4018650, 7018354, 7075181, 3484466, 2778986, 21168791, 8602865,
                       4018127, 11433957, 9557071, 3273227, 17264943, 3506853, 11377239,
                       1777225, 605761, 7164788, 45919049, 2298696, 1572866]
       }

populationByState = pd.DataFrame (data, columns = ['state','population'])

covid = covid.merge(populationByState, left_on='estado', right_on='state')

covid['normTotalInfections'] = covid['casosAcumulados']/covid['population'] * 1000000.
covid['normTotalDeaths'] = covid['obitosAcumulados']/covid['population'] * 1000000.
In [8]:
lastDay = covid['data'].max()
lastDayFormatted = datetime.strptime(lastDay, '%Y-%m-%d').strftime('%d/%m/%Y')

covidLastDay = covid.loc[covid['data'] == lastDay]

infectionsBR = covidLastDay.sum()['casosAcumulados']
deathsBR = covidLastDay.sum()['obitosAcumulados']
infectionsLastDayBR = covidLastDay.sum()['casosNovos']
deathsLastDayBR = covidLastDay.sum()['obitosNovos']

covidRJ = covidLastDay.loc[covid['estado'] == 'RJ']

infectionsLastDayRJ = covidRJ.sum()['casosNovos']
deathsLastDayRJ = covidRJ.sum()['obitosNovos']
infectionsRJ = covidRJ.sum()['casosAcumulados']
deathsRJ = covidRJ.sum()['obitosAcumulados']

html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 20px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Numbers for the Last 24 Hours (last day of the dataset) </h2>
<p style="font-size:18px"> <b>{}</b> is the last day in the dataset.</p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>Brazil</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
<p>
<p style="font-size:18px"> <b>{:,.0f} infections</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total infections</b>. </p>
<p style="font-size:18px"> <b>{:,.0f} deaths</b> were confirmed in <b>RJ</b> on this day, of <b>{:,.0f} total deaths</b>. </p>
</div>

'''.format(lastDayFormatted, infectionsLastDayBR, infectionsBR, deathsLastDayBR, deathsBR,
          infectionsLastDayRJ, infectionsRJ, deathsLastDayRJ, deathsRJ)

display(HTML(html_code))

Numbers for the Last 24 Hours (last day of the dataset)

29/03/2022 is the last day in the dataset.

30,056 infections were confirmed in Brazil on this day, of 29,882,397 total infections.

285 deaths were confirmed in Brazil on this day, of 659,241 total deaths.

3,019 infections were confirmed in RJ on this day, of 2,082,820 total infections.

59 deaths were confirmed in RJ on this day, of 72,756 total deaths.

Last Known Status for Some Countries

Using the last information dowloaded from Our World in Data (https://covid.ourworldindata.org/data/owid-covid-data.csv), I list below the 3 countries in the worst situation (deaths per million people), the 3 countries in the best situation and other 3 countries of interest (Brazil, France and Israel), showing their position in the world ranking in the first column. OBS: only countries with death rates greater than zero are included in the 2 rankings below.

In [9]:
worldCovid = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv', parse_dates = True, delimiter = ',')
In [10]:
worldCovid = worldCovid.loc[~worldCovid['location'].isin(['Africa', 'Asia', 'Europe', 'European Union'
                                                         'International', 'North America', 'Oceania',
                                                         'South America', 'World'])]
lastDate = worldCovid['date'].max()

# Ignore the last day with data (usually faulty)
lastDate = (datetime.strptime(lastDate, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')

formattedLastDate = datetime.strptime(lastDate, '%Y-%m-%d').strftime('%B %d, %Y').replace(' 0', ' ')
worldCovid = worldCovid.loc[worldCovid['date'] == lastDate]
worldCovidTotal = worldCovid.copy()
worldCovidTotal = worldCovid[['location', 'total_deaths_per_million']]
worldCovidTotal.columns = ['Country', 'Deaths/MM inhab.']

worldCovidTotal.sort_values('Deaths/MM inhab.', ascending=False, inplace=True)
worldCovidTotal.reset_index(inplace = True)
worldCovidTotal['Ranking'] = worldCovidTotal.index+1
worldCovidTotal = worldCovidTotal.fillna(0)
worldCovidTotal = worldCovidTotal.loc[worldCovidTotal['Deaths/MM inhab.'] > 0]
worldCovidTotal = worldCovidTotal[['Ranking', 'Country', 'Deaths/MM inhab.']]

# Get the 3 best and the 3 worst status countries
worst1StatusCountry = worldCovidTotal.iloc[0]['Country']
worst2StatusCountry = worldCovidTotal.iloc[1]['Country']
worst3StatusCountry = worldCovidTotal.iloc[2]['Country']
best1StatusCountry = worldCovidTotal.iloc[-1]['Country']
best2StatusCountry = worldCovidTotal.iloc[-2]['Country']
best3StatusCountry = worldCovidTotal.iloc[-3]['Country']

# Filter only the desired countries (3 best, 3 worst, Brazil, France and Israel)
covidByCountrySelected = worldCovidTotal.loc[worldCovidTotal['Country'].isin([worst1StatusCountry, 
                                                                    worst2StatusCountry, 
                                                                    worst3StatusCountry, 
                                                                    best1StatusCountry,
                                                                    best2StatusCountry,
                                                                    best3StatusCountry,
                                                                    'Brazil', 'France', 'Israel'])]


display(Markdown('**Cumulative death rates (registered deaths from the beginning)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))

display(Markdown('*Information updated on ' + formattedLastDate))

Cumulative death rates (registered deaths from the beginning)

Ranking Country Deaths/MM inhab.
1 Peru 6358.865
2 Bulgaria 5283.286
3 Bosnia and Herzegovina 4811.459
15 Brazil 3080.277
42 France 2102.296
82 Israel 1128.511
210 China 3.211
211 Vanuatu 3.180
212 Burundi 3.101

*Information updated on March 27, 2022

In [11]:
# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidTotal = worldCovidTotal[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Brazil']
covidByCountryFrance = worldCovidTotal.loc[worldCovidTotal['Country'] == 'France']
covidByCountryIsrael = worldCovidTotal.loc[worldCovidTotal['Country'] == 'Israel']

axis = worldCovidTotal.plot()
axis.axes.get_xaxis().set_visible(False)
covidByCountryBrazil.plot(ax=axis, linestyle='',marker='o', markersize=12, color='green')
covidByCountryFrance.plot(ax=axis, linestyle='',marker='o', markersize=12, color='red')
covidByCountryIsrael.plot(ax=axis, linestyle='',marker='o', markersize=12, color='blue')
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()
In [12]:
worldCovidCurrent = worldCovid.copy()
worldCovidCurrent = worldCovid[['location', 'new_deaths_smoothed_per_million']]
worldCovidCurrent.columns = ['Country', 'Deaths/MM inhab.']

worldCovidCurrent.sort_values('Deaths/MM inhab.', ascending=False, inplace=True)
worldCovidCurrent.reset_index(inplace = True)
worldCovidCurrent['Ranking'] = worldCovidCurrent.index+1
worldCovidCurrent = worldCovidCurrent.fillna(0)
worldCovidCurrent = worldCovidCurrent.loc[worldCovidCurrent['Deaths/MM inhab.'] > 0]
worldCovidCurrent = worldCovidCurrent[['Ranking', 'Country', 'Deaths/MM inhab.']]

# Get the 3 best and the 3 worst status countries
worst1StatusCountry = worldCovidCurrent.iloc[0]['Country']
worst2StatusCountry = worldCovidCurrent.iloc[1]['Country']
worst3StatusCountry = worldCovidCurrent.iloc[2]['Country']
best1StatusCountry = worldCovidCurrent.iloc[-1]['Country']
best2StatusCountry = worldCovidCurrent.iloc[-2]['Country']
best3StatusCountry = worldCovidCurrent.iloc[-3]['Country']

# Filter only the desired countries (3 best, 3 worst, Brazil, France and Israel)
covidByCountrySelected = worldCovidCurrent.loc[worldCovidCurrent['Country'].isin([worst1StatusCountry, 
                                                                    worst2StatusCountry, 
                                                                    worst3StatusCountry, 
                                                                    best1StatusCountry,
                                                                    best2StatusCountry,
                                                                    best3StatusCountry,
                                                                    'Brazil', 'France', 'Israel'])]


display(Markdown('**Current death rates (average from the last 7 days)** '))
display(HTML(covidByCountrySelected.to_html(index=False)))

display(Markdown('*Information updated on ' + formattedLastDate))

Current death rates (average from the last 7 days)

Ranking Country Deaths/MM inhab.
1 Hong Kong 25.648
2 Liechtenstein 7.469
3 South Korea 6.763
43 France 1.566
55 Brazil 1.111
63 Israel 0.830
125 Nepal 0.005
126 Ethiopia 0.004
127 Bangladesh 0.001

*Information updated on March 27, 2022

In [13]:
# Plot the distribution of deaths/MM inhab. per country showing the countries of interest (Brazil, France and Israel)
worldCovidCurrent = worldCovidCurrent[['Country', 'Deaths/MM inhab.']]
covidByCountryBrazil = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Brazil']
covidByCountryFrance = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'France']
covidByCountryIsrael = worldCovidCurrent.loc[worldCovidCurrent['Country'] == 'Israel']

axis = worldCovidCurrent.plot()
axis.axes.get_xaxis().set_visible(False)
covidByCountryBrazil.plot(ax=axis, linestyle='',marker='o', markersize=12, color='green')
covidByCountryFrance.plot(ax=axis, linestyle='',marker='o', markersize=12, color='red')
covidByCountryIsrael.plot(ax=axis, linestyle='',marker='o', markersize=12, color='blue')
axis.legend(['Deaths/MM inhab.', 'Brazil', 'France', 'Israel'])
plt.show()
In [14]:
covidLastDay = covidLastDay[['estado','casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
covidLastDay.sort_values('normTotalDeaths', ascending=False, inplace=True)
covidLastDay.set_index('estado', inplace=True)
covidLastDay2 = covidLastDay.copy()
covidLastDay2.columns = ['Infections', 'Deaths', 'Infections/MM inh.', 'Deaths/MM inh.']
In [15]:
html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Infections and Deaths by State </h2>
<p style="font-size:18px"> Sorted by the severity (deaths per million people). States in worse situation are near the top. </p>
<p style="font-size:18px"> The k-means method was used for clustering the states in 3 classes of severity. </p>
</div>
'''
display(HTML(html_code))

Infections and Deaths by State

Sorted by the severity (deaths per million people). States in worse situation are near the top.

The k-means method was used for clustering the states in 3 classes of severity.

In [16]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)

covidLastDay['myClass'] = kmeans.fit_predict(covidLastDay[['normTotalDeaths']])
In [17]:
worstClass = covidLastDay.iloc[0]['myClass']
bestClass = covidLastDay.iloc[-1]['myClass']

covidLastDay.reset_index(inplace=True)

covidLastDay2 = covidLastDay.copy()

covidLastDay2.columns=['State', 'Infections', 'Deaths', 'Infections/MM inhab.', 'Deaths/MM inhab.', 'myClass']
In [18]:
def highlight(s):
    tamanho = s.size
    if s.myClass == worstClass:
        return ['background-color: orange']*tamanho
    elif s.myClass != bestClass:
        return ['background-color: yellow']*tamanho
    else:
        return ['background-color: white']*tamanho

def formatNumber(str_value):
    value = int(str_value)
    if value > 1000000:
        return f'{value/1000000:.2f} Mi'
    return f'{value:,}'

display(covidLastDay2.style.apply(highlight, axis=1).format({"Infections": formatNumber,
                                                             "Deaths": formatNumber,
                                                             "Infections/MM inhab.": "{:,.2f}",
                                                             "Deaths/MM inhab.": "{:,.2f}"}))
State Infections Deaths Infections/MM inhab. Deaths/MM inhab. myClass
0 RJ 2.08 Mi 72,756 120,638.68 4,214.09 0
1 MT 719,859 14,587 206,590.91 4,186.29 0
2 RO 392,811 7,176 221,024.91 4,037.76 0
3 DF 691,980 11,579 229,492.04 3,840.12 0
4 MS 523,860 10,493 188,507.61 3,775.84 0
5 PR 2.41 Mi 42,912 210,851.94 3,753.03 0
6 GO 1.27 Mi 26,221 181,390.82 3,736.06 0
7 SP 5.24 Mi 167,211 114,145.53 3,641.43 0
8 ES 1.04 Mi 14,333 258,399.21 3,566.62 0
9 RR 155,076 2,144 256,001.95 3,539.35 0
10 RS 2.27 Mi 39,015 199,298.27 3,429.22 0
11 AM 581,177 14,153 140,225.21 3,414.81 0
12 SC 1.67 Mi 21,656 233,524.98 3,022.56 1
13 CE 1.24 Mi 26,725 135,797.46 2,926.50 1
14 MG 3.32 Mi 60,770 156,861.11 2,870.74 1
15 SE 325,611 6,313 141,650.31 2,746.34 1
16 TO 302,606 4,142 192,391.47 2,633.41 1
17 PB 595,347 10,191 148,165.30 2,536.26 1
18 AP 160,336 2,124 189,582.74 2,511.44 1
19 PI 367,585 7,726 112,300.49 2,360.36 2
20 RN 495,822 8,120 141,386.59 2,315.47 2
21 AC 123,808 1,992 140,382.23 2,258.67 2
22 PE 894,462 21,386 93,591.65 2,237.71 2
23 PA 752,167 18,081 87,432.15 2,101.74 2
24 AL 296,066 6,876 88,712.71 2,060.31 2
25 BA 1.53 Mi 29,688 102,934.20 1,996.09 2
26 MA 424,956 10,871 60,062.92 1,536.50 2

Comparing Infections and Deaths by State

Using absolute values

In [19]:
covidLastDayPie1 = covidLastDay[['estado','casosAcumulados', 'obitosAcumulados']]
covidLastDayPie1.set_index('estado', inplace=True)
plot = covidLastDayPie1.plot.pie(fontsize=24, subplots=True, layout=(1, 2), figsize=(30, 15), legend=None)

Comparing Infections and Deaths per Million Inhabitans, per State

In [20]:
covidLastDayPie2 = covidLastDay[['estado','normTotalInfections', 'normTotalDeaths']]
covidLastDayPie2.set_index('estado', inplace=True)
plot = covidLastDayPie2.plot.pie(fontsize=24, subplots=True, layout=(1, 2), figsize=(30, 15), legend=None)

Plots of cumulative infections and deaths

In [21]:
covid['data'] = pd.to_datetime(covid['data'], format='%Y-%m-%d')

covidByDayBR = covid[['data', 'casosAcumulados', 'obitosAcumulados']].groupby('data').sum()
covidByDayBR.reset_index(inplace=True)
covidByDayBR.set_index(['data'],inplace=True)

covidByDayRJ = covid.loc[covid['estado'] == 'RJ'][['data', 'casosAcumulados', 'obitosAcumulados', 'normTotalInfections', 'normTotalDeaths']]
covidByDayRJ.reset_index(inplace=True)
covidByDayRJ.set_index(['data'],inplace=True)

axis = covidByDayBR.plot(legend=False, style='o-', figsize=(18, 9), y='casosAcumulados', title='Infections in Brazil')
axis.xaxis.get_label().set_visible(False)
In [22]:
axis = covidByDayBR.plot(legend=False, style='o-',figsize=(18, 9), y='obitosAcumulados', title='Deaths in Brazil')
axis.xaxis.get_label().set_visible(False)
In [23]:
axis = covidByDayRJ.plot(legend=False,style='o-',figsize=(18, 9), y='casosAcumulados', title='Infections in RJ')
axis.xaxis.get_label().set_visible(False)
In [24]:
axis = covidByDayRJ.plot(legend=False, style='o-',figsize=(18, 9), y='obitosAcumulados', title='Deaths in RJ')
axis.xaxis.get_label().set_visible(False)

Plots of new deaths for Brazil, RJ state and Rio de Janeiro city

Rolling mean of the last 7 days for the new deaths by day.

In [25]:
newCovidByDayBR = covid[['data', 'obitosNovos']].groupby('data').sum()
newCovidByDayBR.reset_index(inplace=True)
newCovidByDayBR.set_index(['data'],inplace=True)
newCovidByDayBR['rollingNewDeaths'] = newCovidByDayBR['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayBR.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Brazil by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)
In [26]:
newCovidByDayRJ = covid.loc[covid['estado'] == 'RJ'][['data', 'obitosNovos']]
newCovidByDayRJ.reset_index(inplace=True)
newCovidByDayRJ.set_index(['data'],inplace=True)
newCovidByDayRJ['rollingNewDeaths'] = newCovidByDayRJ['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayRJ.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in RJ state by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)
In [27]:
newCovidByDayRio = covidRioDeJaneiro[['data', 'obitosNovos']]
newCovidByDayRio['data'] = pd.to_datetime(newCovidByDayRio['data'], format='%Y-%m-%d')
newCovidByDayRio.reset_index(inplace=True)
newCovidByDayRio.set_index(['data'],inplace=True)
newCovidByDayRio['rollingNewDeaths'] = newCovidByDayRio['obitosNovos'].rolling(window=7).mean()

axis = newCovidByDayRio.plot(legend=False, style='o-', figsize=(18, 9), y='rollingNewDeaths', title='New deaths in Rio de Janeiro City by day (rolling mean of 7 days)')
axis.xaxis.get_label().set_visible(False)
In [28]:
newCovidByDayMangaratiba = covidMangaratiba[['data', 'obitosNovos']]
newCovidByDayMangaratiba['data'] = pd.to_datetime(newCovidByDayMangaratiba['data'], format='%Y-%m-%d')
newCovidByDayMangaratiba = newCovidByDayMangaratiba[(newCovidByDayMangaratiba['obitosNovos'] >= 0)]
newCovidByDayMangaratiba.reset_index(inplace=True)
newCovidByDayMangaratiba.set_index(['data'],inplace=True)

axis = newCovidByDayMangaratiba.plot(legend=False, style='o-', markersize=5, drawstyle="steps-mid", figsize=(18, 9), y='obitosNovos', title='New deaths in Mangaratiba City by day (absolute values)')
axis.xaxis.get_label().set_visible(False)
plt.grid(axis='y', linestyle='-')

Comparing tendencies of all states in the last 10 days

For the log of the deaths data of the last 10 days for all states, I use Linear Regressions to estimate and compare how the situation is evolving. Higher coefficients means that situation is getting worse.

In [29]:
deathsByDayStates = covid.pivot(index='data', columns='estado', values = 'normTotalDeaths')

last10Days = deathsByDayStates.tail(10)
In [30]:
axis = last10Days.plot(logy=True, figsize=(15,10), title='Last 10 Days Deaths per MM inhab., Log Scale').legend(bbox_to_anchor=(1.1, 1.01))
plt.gca().xaxis.get_label().set_visible(False)
plt.tick_params(
    axis='x',          
    which='both',      
    bottom=False,      
    top=False,         
    labelbottom=False) 
plt.show()
In [31]:
last10Days['day'] = range(1,11)

X_train = last10Days['day']
X_train = X_train.values.reshape(-1, 1)

data = {'state': [],
        'coefficient': []
       }
coef10days = pd.DataFrame (data, columns = ['state','coefficient'])
In [32]:
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()

# Evaluating coefficients 
for col in last10Days.columns: 
    if col != 'day':
        last10Days[col + 'log'] = np.log(last10Days[col])
        regression_model.fit(X_train, last10Days[col + 'log'])
        coef10days = coef10days.append(pd.Series([col, regression_model.coef_[0]], index=coef10days.columns), ignore_index=True)
In [33]:
html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Predictions for the Near Future </h2>
<p style="font-size:18px"> Using the linear regression over the log data to estimate the number of deaths in 7, 14 e 30 days. </p>
</div>
'''
display(HTML(html_code))

Predictions for the Near Future

Using the linear regression over the log data to estimate the number of deaths in 7, 14 e 30 days.

All States

In [34]:
from datetime import timedelta

data = {'date': [],
        'state': [],
        'predictedDeaths': []
       }
predictedDeaths = pd.DataFrame (data, columns = ['date','state', 'predictedDeaths'])

more7Days = 17;
more14Days = 24;
more30Days = 40;
dateMore7Days = (datetime.strptime(lastDay, '%Y-%m-%d') + timedelta(days=7)).strftime("%d/%m/%Y")
dateMore14Days = (datetime.strptime(lastDay, '%Y-%m-%d') + timedelta(days=14)).strftime("%d/%m/%Y")
dateMore30Days = (datetime.strptime(lastDay, '%Y-%m-%d') + timedelta(days=30)).strftime("%d/%m/%Y")

deathsByState = covid.pivot(index='data', columns='estado', values = 'obitosAcumulados')
last10Days = deathsByState.tail(10)
last10Days['day'] = range(1,11)

for col in last10Days.columns: 
    if col != 'day':
        last10Days[col + 'log'] = np.log(last10Days[col])
        regression_model.fit(X_train, last10Days[col + 'log'])
        
        xpredict = np.array([more7Days])
        xpredict = xpredict.reshape(-1,1)
        prediction = regression_model.predict(xpredict)
        predictedDeaths = predictedDeaths.append(pd.Series([dateMore7Days, col, int(np.exp(prediction[0]))], index=predictedDeaths.columns), ignore_index=True)
        
        xpredict = np.array([more14Days])
        xpredict = xpredict.reshape(-1,1)
        prediction = regression_model.predict(xpredict)
        predictedDeaths = predictedDeaths.append(pd.Series([dateMore14Days, col, int(np.exp(prediction[0]))], index=predictedDeaths.columns), ignore_index=True)
        
        xpredict = np.array([more30Days])
        xpredict = xpredict.reshape(-1,1)
        prediction = regression_model.predict(xpredict)
        predictedDeaths = predictedDeaths.append(pd.Series([dateMore30Days, col, int(np.exp(prediction[0]))], index=predictedDeaths.columns), ignore_index=True)
        
predictedDeaths.index.name = None
predictedDeathsOut = predictedDeaths.pivot(index='state', columns='date', values = 'predictedDeaths')

predictedDeathsOut = predictedDeathsOut[[dateMore7Days, dateMore14Days, dateMore30Days]]

predictedDeathsBrazil = predictedDeathsOut.sum().reset_index(name='predictedDeaths')
predictedDeathsBrazil = predictedDeathsBrazil.append(pd.Series([lastDayFormatted, deathsBR], index=predictedDeathsBrazil.columns), ignore_index=True)
predictedDeathsBrazil.set_index('date', inplace=True)
predictedDeathsBrazilOut = predictedDeathsBrazil.transpose()
predictedDeathsBrazilOut = predictedDeathsBrazilOut[[lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]]

predictedDeathsOut['state'] = predictedDeathsOut.index
covidLastDayOut = covidLastDay[['estado', 'obitosAcumulados']][['estado', 'obitosAcumulados']]
covidLastDayOut.index.name = None
predictedDeathsOut.index.name = None
predictedDeathsOut = predictedDeathsOut.merge(covidLastDayOut, left_on='state', right_on='estado')
predictedDeathsOut = predictedDeathsOut[['estado', 'obitosAcumulados', dateMore7Days, dateMore14Days, dateMore30Days]]
predictedDeathsOut.columns = ['State', lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]

pd.options.display.float_format = '{0:g}'.format
display(predictedDeathsOut)
State 29/03/2022 05/04/2022 12/04/2022 28/04/2022
0 AC 1992 1994 1996 2001
1 AL 6876 6900 6923 6976
2 AM 14153 14157 14162 14173
3 AP 2124 2126 2129 2135
4 BA 29688 29758 29830 29993
5 CE 26725 26763 26808 26909
6 DF 11579 11599 11619 11664
7 ES 14333 14355 14380 14436
8 GO 26221 26369 26492 26774
9 MA 10871 10882 10893 10917
10 MG 60770 60995 61187 61628
11 MS 10493 10510 10530 10576
12 MT 14587 14598 14612 14644
13 PA 18081 18128 18169 18263
14 PB 10191 10207 10222 10256
15 PE 21386 21469 21550 21738
16 PI 7726 7731 7738 7755
17 PR 42912 43010 43100 43305
18 RJ 72756 72948 73155 73632
19 RN 8120 8125 8131 8143
20 RO 7176 7210 7239 7307
21 RR 2144 2144 2144 2144
22 RS 39015 39121 39225 39465
23 SC 21656 21685 21713 21776
24 SE 6313 6324 6336 6363
25 SP 167211 167706 168170 169235
26 TO 4142 4144 4145 4149

Brazil

In [35]:
predictedDeathsBrazilOut.columns = [lastDayFormatted, dateMore7Days, dateMore14Days, dateMore30Days]
predictedDeathsBrazilOut.reset_index(inplace=True)
predictedDeathsBrazilOut.drop('index', axis=1, inplace=True)
display(predictedDeathsBrazilOut)
29/03/2022 05/04/2022 12/04/2022 28/04/2022
0 659241 660958 662598 666357
In [36]:
coef10days.sort_values('coefficient',ascending=False,inplace=True)
coef10days.set_index('state', inplace=True)
In [37]:
html_code = '''
<div style="
  background-color:LightGoldenRodYellow;
  border-style: solid;
  padding-top: 10px;
  padding-right: 10px;
  padding-bottom: 40px;
  padding-left: 10px;">
<img style="float: right;" width=30px, height=30px src="info2.png"> 
<h2> Clustering the states in 3 classes according to the calculated coefficients </h2>
<p style="font-size:18px">  A higher coefficient means a worse tendency for the deaths. </p>
</div>
'''
display(HTML(html_code))

Clustering the states in 3 classes according to the calculated coefficients

A higher coefficient means a worse tendency for the deaths.

In [38]:
coef10daysData = pd.DataFrame(coef10days)
coef10daysData['state'] = coef10daysData.index
coef10daysData = coef10daysData.reset_index(level=0, drop=True)
coef10daysData = coef10daysData[['state', 'coefficient']]

coef10daysData['myClass'] = kmeans.fit_predict(coef10daysData[['coefficient']])
worstClass = coef10daysData.iloc[0]['myClass']
bestClass = coef10daysData.iloc[-1]['myClass']

display(coef10daysData.style.apply(highlight, axis=1))
state coefficient myClass
0 GO 0.000661496 2
1 RO 0.000582403 2
2 PE 0.000540853 2
3 AL 0.000474341 2
4 MG 0.00044892 2
5 RJ 0.000405715 0
6 SP 0.000394732 0
7 RS 0.000380636 0
8 BA 0.000341299 0
9 PA 0.000323197 0
10 PR 0.000296688 0
11 SE 0.000270108 0
12 MS 0.000269524 0
13 ES 0.000243866 0
14 DF 0.000239991 0
15 CE 0.000236363 0
16 PB 0.00020892 1
17 AP 0.000182789 1
18 SC 0.000182342 1
19 AC 0.0001522 1
20 MA 0.000140015 1
21 MT 0.000138472 1
22 PI 0.000137394 1
23 RN 9.48276e-05 1
24 TO 5.8541e-05 1
25 AM 4.92568e-05 1
26 RR 0 1
In [39]:
coef10days['Normalized Coefficient'] = (coef10days['coefficient']-coef10days['coefficient'].min())/(coef10days['coefficient'].max()-coef10days['coefficient'].min())

_ = coef10days.plot.pie(y='Normalized Coefficient', figsize=(10, 10), legend=False)

R curve

Estimating R, that represents the basic reproduction rate of the disease. It roughly estimates how many people are infected by one infected person after 14 days, assuming a constant death rate for the infected.

It is a very simple and imprecise approach. And, specially, the values in the begining of the plot should be disregarded, because the first notifications of the infections are not representative or reliable.

The plot is more of a tendency indicator, than a real value estimator.

In [40]:
data = {'date': [],
        'value': []
       }
R_Brazil = pd.DataFrame (data, columns = ['date','value'])

# Generation Time
gt = 14

covidDeathsBR = covid.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
covidDeathsBR['rollingMean'] =covidDeathsBR['obitosNovos'].rolling(window=7).mean()

for day in range(covidDeathsBR.shape[0]):
    if day >= gt:
        if covidDeathsBR.at[day-gt,'rollingMean'] > 0:
            Rvalue = float(covidDeathsBR.at[day,'rollingMean'])/float(covidDeathsBR.at[day-gt,'rollingMean'])
            if Rvalue <= 4:
                R_Brazil = R_Brazil.append(pd.Series([covidDeathsBR.at[day, 'data'], Rvalue], index=R_Brazil.columns), ignore_index=True)

R_Brazil['date'] = pd.to_datetime(R_Brazil['date'], format='%Y-%m-%d')

R_Brazil.set_index('date', inplace=True)
axis = R_Brazil.plot(style='o-', figsize=(18, 9), legend=False, y='value', title='R for Brazil')
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
axis.xaxis.get_label().set_visible(False)
plt.show()
In [41]:
data = {'date': [],
        'state': [],
        'value': []
       }
R_States = pd.DataFrame (data, columns = ['date','state', 'value'])

for state in covid.estado.unique():
    covidState = covid.loc[covid['estado'] == state]

    covidDeathsState = covidState.groupby('data')['obitosNovos'].sum().reset_index(name ='obitosNovos')
    covidDeathsState['rollingMean'] =covidDeathsState['obitosNovos'].rolling(window=7).mean()
    for day in range(covidDeathsState.shape[0]):
        if day >= gt:
            if covidDeathsState.at[day-gt,'rollingMean'] > 0:
                Rvalue = float(covidDeathsState.at[day,'rollingMean'])/float(covidDeathsState.at[day-gt,'rollingMean'])
                R_States = R_States.append(pd.Series([covidDeathsState.at[day, 'data'], state, Rvalue], index=R_States.columns), ignore_index=True)


R_States['date'] = pd.to_datetime(R_States['date'], format='%Y-%m-%d')
In [42]:
R_StatesGroup = R_States.loc[R_States['state'].isin(['RJ', 'SP'])]
R_StatesGroup = R_StatesGroup.pivot(index='date', columns='state', values = 'value')

axis = R_StatesGroup.plot(style='o-', figsize=(18, 9), title='R for RJ and SP')
axis.axhline(1.0, color='gray', lw=2, alpha=0.5)
axis.axhline(2.0, color='gray', lw=2, alpha=0.5)
axis.axhline(3.0, color='gray', lw=2, alpha=0.5)
axis.axhline(4.0, color='red', lw=2, alpha=0.5)
axis.fill_between(axis.get_xlim(), 0., 1., facecolor='lightgreen', alpha=0.5)
plt.ylim((0,4))
axis.xaxis.get_label().set_visible(False)
_ = plt.show()