[42]:
from pysus.online_data.ESUS import download
import pandas as pd
Downloading data from ESUS
This function downloads COVID-19 data from ESUS. For some states, the resulting table can easily exceed the memory available on most machines; in those cases, the ESUS download function extracts the data to disk without filling up memory and returns an iterator of chunks of 1000 rows. The user must then iterate over the chunks to analyze the data, as sketched below.
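A minimal sketch of consuming such a chunked download follows. It is not executed in this notebook, and the exact chunked interface returned by download() may differ between PySUS versions, so treat it as an assumption:

from pysus.online_data.ESUS import download

# Hypothetical sketch: for a large state the result is assumed to be iterable
# as chunks of ~1000 rows, each chunk behaving like a pandas DataFrame.
chunks = download(uf='sp')   # 'sp' is only an example of a state with a large table

total_rows = 0
for chunk in chunks:
    # process each chunk independently so memory usage stays bounded
    total_rows += len(chunk)
print(total_rows)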
[43]:
df = download(uf='se').read()
2023-04-12 09:21:40.709 | INFO | pysus.online_data.ESUS:download:34 - Local csv file found at /home/luabida/pysus/ESUS_temp_SE.csv.gz
/tmp/ipykernel_91880/506159845.py:1: DtypeWarning: Columns (2,4,5,8,9,10,11,12,15,18,19,21,22,23,24,26,27,29,30,31,32,34,35,36,37,40,41,42,44,45,46,48,49,50,51,54,55,56,57,58,59,60,61) have mixed types. Specify dtype option on import or set low_memory=False.
df = download(uf='se').read()
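The DtypeWarning comes from pandas' chunked type inference when reading the cached CSV. One option, shown here only as a sketch, is to read the cached file (the path reported in the log above, which will differ on other machines) directly with low_memory=False:

import pandas as pd

# Read the locally cached file reported in the log above; disabling the
# low-memory chunked inference avoids the mixed-type warning.
df = pd.read_csv('/home/luabida/pysus/ESUS_temp_SE.csv.gz', low_memory=False)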
[44]:
df.head()
[44]:
Unnamed: 0 | outrasCondicoes | municipio | resultadoTesteSorologicoIgM | laboratorioSegundaReforcoDose | dataTesteSorologico | codigoBuscaAtivaAssintomatico | recebeuAntiviral | profissionalSeguranca | cbo | ... | testes | idade | resultadoTesteSorologicoTotais | codigoEstrategiaCovid | outrosSintomas | codigoQualAntiviral | estrangeiro | resultadoTesteSorologicoIgA | sexo | sintomas | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | NaN | Aracaju | NaN | NaN | NaN | NaN | NaN | Não | NaN | ... | [] | 18 | NaN | 1 | NaN | NaN | NaN | NaN | Feminino | Coriza, Dor de Cabeça, Febre, Dor de Garganta |
1 | 1 | NaN | Aracaju | NaN | JANSSEN | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... | 26 | NaN | 1 | NaN | NaN | NaN | NaN | Feminino | Tosse, Febre, Dor de Garganta |
2 | 2 | NaN | Nossa Senhora do Socorro | NaN | ASTRAZENECA/FIOCRUZ | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... | 31 | NaN | 1 | NaN | NaN | NaN | NaN | Masculino | Tosse, Febre, Dor de Garganta |
3 | 3 | NaN | Aracaju | NaN | PFIZER | NaN | NaN | NaN | Não | NaN | ... | [] | 39 | NaN | 1 | NaN | NaN | NaN | NaN | Masculino | Coriza, Dor de Cabeça, Tosse, Febre, Dor de Ga... |
4 | 4 | NaN | Aracaju | NaN | NaN | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'TEST... | 32 | NaN | 1 | ALGIA OCULAR | NaN | NaN | NaN | Feminino | Dor de Cabeça, Tosse, Outros |
5 rows × 63 columns
[45]:
# Convert every column whose name starts with 'data' (date) to a datetime
# dtype, coercing unparseable values to NaT.
for cname in df:
    if cname.startswith('data'):
        df[cname] = pd.to_datetime(df[cname], errors='coerce')
/home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages/pandas/core/tools/datetimes.py:1047: UserWarning: Parsing '15/03/2022' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
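The warning indicates that some date columns contain DD/MM/YYYY strings. One way to make the parsing explicit, not used here but shown as a sketch, is to pass dayfirst=True (or a full format=) to pd.to_datetime:

# Parse day-first dates explicitly to avoid ambiguous DD/MM/YYYY handling.
for cname in df:
    if cname.startswith('data'):
        df[cname] = pd.to_datetime(df[cname], errors='coerce', dayfirst=True)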
[46]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905099 entries, 0 to 905098
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 905099 non-null int64
1 outrasCondicoes 845124 non-null object
2 municipio 60015 non-null object
3 resultadoTesteSorologicoIgM 347694 non-null object
4 laboratorioSegundaReforcoDose 846885 non-null object
5 dataTesteSorologico 0 non-null datetime64[ns]
6 codigoBuscaAtivaAssintomatico 352115 non-null object
7 recebeuAntiviral 900105 non-null object
8 profissionalSeguranca 137182 non-null object
9 cbo 861037 non-null object
10 codigoRecebeuAntiviral 203302 non-null object
11 municipioNotificacaoIBGE 854567 non-null object
12 profissionalSaude 30981 non-null object
13 outroTriagemPopulacaoEspecifica 890233 non-null object
14 resultadoTeste 890667 non-null object
15 classificacaoFinal 893538 non-null object
16 estadoIBGE 393564 non-null object
17 estadoNotificacaoIBGE 278568 non-null object
18 dataInicioTratamento 5255 non-null object
19 @version 900294 non-null object
20 evolucaoCaso 875469 non-null object
21 @timestamp 581989 non-null object
22 municipioIBGE 23681 non-null object
23 dataEncerramento 854649 non-null datetime64[ns, UTC]
24 estado 24573 non-null object
25 resultadoTesteSorologicoIgG 880839 non-null object
26 dataReforcoDose 54581 non-null datetime64[ns, UTC]
27 tipoTesteSorologico 54288 non-null object
28 qualAntiviral 852065 non-null object
29 estadoNotificacao 354141 non-null object
30 loteSegundaReforcoDose 897885 non-null object
31 id 65786 non-null object
32 outroLocalRealizacaoTestagem 23429 non-null object
33 outroBuscaAtivaAssintomatico 857163 non-null object
34 dataSegundaReforcoDose 18455 non-null datetime64[ns, UTC]
35 codigoTriagemPopulacaoEspecifica 46742 non-null object
36 condicoes 31514 non-null object
37 codigoDosesVacina 890467 non-null object
38 idCollection 47349 non-null object
39 estadoTeste 73489 non-null object
40 codigoLocalRealizacaoTestagem 804537 non-null object
41 outroAntiviral 288180 non-null object
42 dataSegundaDose 4605 non-null object
43 codigoContemComunidadeTradicional 465724 non-null object
44 tipoTeste 350244 non-null object
45 racaCor 878427 non-null object
46 codigoRecebeuVacina 889016 non-null object
47 dataPrimeiraDose 5038 non-null datetime64[ns, UTC]
48 dataTeste 190916 non-null datetime64[ns, UTC]
49 registroAtual 17821 non-null object
50 dataNotificacao 15883 non-null datetime64[ns, UTC]
51 dataInicioSintomas 10223 non-null datetime64[ns, UTC]
52 municipioNotificacao 834928 non-null object
53 testes 513594 non-null object
54 idade 26270 non-null object
55 resultadoTesteSorologicoTotais 9712 non-null object
56 codigoEstrategiaCovid 184183 non-null object
57 outrosSintomas 13699 non-null object
58 codigoQualAntiviral 56849 non-null object
59 estrangeiro 886104 non-null object
60 resultadoTesteSorologicoIgA 891226 non-null object
61 sexo 531234 non-null object
62 sintomas 115487 non-null object
dtypes: datetime64[ns, UTC](7), datetime64[ns](1), int64(1), object(54)
memory usage: 435.0+ MB
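Most of the 63 columns are stored as object, which drives the ~435 MB footprint. As a side note, here is a hedged sketch of reducing memory by converting a few low-cardinality columns (names taken from the listing above) to the category dtype:

# Hypothetical memory optimization: store low-cardinality text columns as categories.
for col in ['sexo', 'racaCor', 'evolucaoCaso', 'resultadoTeste']:
    if col in df.columns:
        df[col] = df[col].astype('category')
df.info(memory_usage='deep')  # compare the footprint with the output above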
Now let's create a datetime index for our dataframe, but we must be careful with missing dates here. For now, to allow a quick look at the data, we will simply coerce the missing dates to NaT.
[47]:
# Parse the symptom-onset dates, coercing unparseable values to NaT,
# and use them as the dataframe index.
df['datesint'] = pd.to_datetime(df['dataInicioSintomas'], errors='coerce')
# df = df.dropna()
df.set_index('datesint', inplace=True);
Now we can count the daily cases and plot them.
[48]:
df_day = df.resample('D').count()
df_day.head()
[48]:
Unnamed: 0 | outrasCondicoes | municipio | resultadoTesteSorologicoIgM | laboratorioSegundaReforcoDose | dataTesteSorologico | codigoBuscaAtivaAssintomatico | recebeuAntiviral | profissionalSeguranca | cbo | ... | testes | idade | resultadoTesteSorologicoTotais | codigoEstrategiaCovid | outrosSintomas | codigoQualAntiviral | estrangeiro | resultadoTesteSorologicoIgA | sexo | sintomas | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datesint | |||||||||||||||||||||
1970-01-01 00:00:00+00:00 | 33 | 21 | 33 | 0 | 0 | 0 | 31 | 33 | 33 | 8 | ... | 33 | 8 | 0 | 0 | 0 | 4 | 33 | 33 | 0 | 33 |
1970-01-02 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-03 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-04 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-05 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 63 columns
[49]:
df_day.estadoIBGE.plot();

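The resampled index starting at 1970-01-01 suggests that a few records carry epoch-zero timestamps. A small sketch, not run here, that restricts the plot to the pandemic period:

# Ignore the spurious 1970 dates seen above and plot only from 2020 onwards.
df_day.loc['2020':].estadoIBGE.plot();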
Removing duplicates
ESUS records are known to contain many duplicated entries. Let's see how to detect possible duplicates in the dataframe we have just downloaded. For this we will need the `recordlinkage` package (https://recordlinkage.readthedocs.io/en/latest/index.html).
[50]:
!pip3 install recordlinkage
Requirement already satisfied: recordlinkage in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (0.15)
Requirement already satisfied: numpy>=1.13.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.23.2)
Requirement already satisfied: scikit-learn>=0.19.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.2)
Requirement already satisfied: pandas<2,>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.4.3)
Requirement already satisfied: scipy>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.10.1)
Requirement already satisfied: joblib in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.0)
Requirement already satisfied: jellyfish>=0.8.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (0.11.2)
Requirement already satisfied: pytz>=2020.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2022.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from scikit-learn>=0.19.0->recordlinkage) (3.1.0)
Requirement already satisfied: six>=1.5 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas<2,>=1->recordlinkage) (1.16.0)
[51]:
import recordlinkage
[52]:
# Restore a plain integer index (recordlinkage pairs records by index label)
# and make sure dataInicioSintomas is parsed as a datetime column.
df.reset_index(inplace=True)
df['dataInicioSintomas'] = pd.to_datetime(df['dataInicioSintomas'], errors='coerce')
[53]:
# Generate candidate duplicate pairs by blocking on exact matches of
# 'municipio' and 'dataTeste': only records sharing both values are paired.
dup_index = recordlinkage.Index()
dup_index.block(['municipio', 'dataTeste'])
# dup_index.sortedneighbourhood(['municipio', 'dataTeste'])
dup_candidates = dup_index.index(df)
len(dup_candidates)
[53]:
3154
[54]:
dup_candidates
[54]:
MultiIndex([( 3450, 1020),
( 4995, 1581),
(18524, 3101),
(57863, 12167),
(57897, 12167),
(57897, 57863),
(35208, 12291),
(56484, 12291),
(56484, 35208),
(74707, 12291),
...
(74583, 58130),
(74542, 58174),
(74610, 58174),
(74610, 74542),
(74630, 58174),
(74630, 74542),
(74630, 74610),
(58921, 58491),
(74258, 74188),
(74606, 74587)],
length=3154)
[55]:
# Score each candidate pair: fuzzy-match 'sintomas' with Jaro-Winkler
# similarity (scored 1 when similarity >= 0.85) and compare the
# 'dataInicioSintomas' dates of the two records.
compare_dups = recordlinkage.Compare()
compare_dups.string('sintomas', 'sintomas', threshold=0.85, label='sintomas', method='jarowinkler')
compare_dups.date('dataInicioSintomas', 'dataInicioSintomas', label='dataInicioSintomas')
[55]:
<Compare>
[56]:
dup_features = compare_dups.compute(dup_candidates,df)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[56], line 1
----> 1 dup_features = compare_dups.compute(dup_candidates,df)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:862, in BaseCompare.compute(self, pairs, x, x_link)
859 raise ValueError("expected pandas.DataFrame as third argument")
861 if self.n_jobs == 1:
--> 862 results = self._compute(pairs, x, x_link)
863 elif self.n_jobs > 1:
864 results = self._compute_parallel(pairs,
865 x,
866 x_link,
867 n_jobs=self.n_jobs)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:727, in BaseCompare._compute(self, pairs, x, x_link)
722 # else: subset columns and pass tuple of series
723 else:
724 data2 = tuple(
725 [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
--> 727 result = feat._compute(data1, data2)
728 features.append((result, feat.label))
730 features = self._union(features, pairs)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:452, in BaseCompareFeature._compute(self, left_on, right_on)
430 def _compute(self, left_on, right_on):
431 """Compare the data on the left and right.
432
433 :meth:`BaseCompareFeature._compute` and
(...)
450 numpy.ndarray objects.
451 """
--> 452 result = self._compute_vectorized(*tuple(left_on + right_on))
454 return result
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/compare.py:382, in Date._compute_vectorized(self, s_left, s_right)
378 def _compute_vectorized(self, s_left, s_right):
379
380 # validate datatypes
381 if str(s_left.dtype) != 'datetime64[ns]':
--> 382 raise ValueError('Left column is not of type datetime64[ns]')
384 if str(s_right.dtype) != 'datetime64[ns]':
385 raise ValueError('Right column is not of type datetime64[ns]')
ValueError: Left column is not of type datetime64[ns]
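The comparison fails because dataInicioSintomas is timezone-aware (datetime64[ns, UTC], see the df.info() output above), while recordlinkage's date comparison expects a naive datetime64[ns] column. One possible fix, sketched below, is to drop the timezone before calling compute:

# Drop the UTC timezone so the column becomes plain datetime64[ns],
# which is the dtype recordlinkage's Date comparison expects.
df['dataInicioSintomas'] = df['dataInicioSintomas'].dt.tz_localize(None)
dup_features = compare_dups.compute(dup_candidates, df)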
[ ]:
dup_features.sum(axis=1).value_counts().sort_index(ascending=False)
1.0 5910561
0.0 28495991
dtype: int64
[ ]:
potential_dups = dup_features[dup_features.sum(axis=1) >= 1]#.reset_index()
# potential_dups['Score'] = potential_dups.loc[:, 'sintomas':'dataInicioSintomas'].sum(axis=1)
[ ]:
potential_dups
 | | sintomas |
---|---|---|
9 | 5 | 1.0 |
12 | 3 | 1.0 |
15 | 3 | 1.0 |
 | 12 | 1.0 |
23 | 3 | 1.0 |
... | ... | ... |
380990 | 380953 | 1.0 |
381048 | 381019 | 1.0 |
381050 | 381019 | 1.0 |
 | 381048 | 1.0 |
381057 | 380939 | 1.0 |
5910561 rows × 1 columns
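From here, one common last step, sketched below under the assumption that every candidate pair with a score of at least 1 is a true duplicate, is to drop the second member of each pair:

# Hypothetical deduplication: the MultiIndex levels hold the index labels of
# the two records in each candidate pair; drop the second one of each pair.
rows_to_drop = potential_dups.index.get_level_values(1).unique()
df_dedup = df.drop(index=rows_to_drop)
print(len(df), len(df_dedup))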