[42]:
from pysus.online_data.ESUS import download
import pandas as pd
Downloading data from ESUS¶
This function allows for the download of COVID-19 data from ESUS. For some states, the size of the resulting table can easily exceed the memory of most personal computers; in such cases, the ESUS download function will stream the data to disk without filling up the memory and return an iterator of chunks of 1000 rows of data. The user must then iterate over the chunks to analyze the data.
[43]:
# Fetch the ESUS records for the state code 'se', then read the result
# into a single in-memory DataFrame.
esus_se = download(uf='se')
df = esus_se.read()
2023-04-12 09:21:40.709 | INFO | pysus.online_data.ESUS:download:34 - Local csv file found at /home/luabida/pysus/ESUS_temp_SE.csv.gz
/tmp/ipykernel_91880/506159845.py:1: DtypeWarning: Columns (2,4,5,8,9,10,11,12,15,18,19,21,22,23,24,26,27,29,30,31,32,34,35,36,37,40,41,42,44,45,46,48,49,50,51,54,55,56,57,58,59,60,61) have mixed types. Specify dtype option on import or set low_memory=False.
df = download(uf='se').read()
[44]:
# Preview the first rows of the downloaded table.
df.head()
[44]:
Unnamed: 0 | outrasCondicoes | municipio | resultadoTesteSorologicoIgM | laboratorioSegundaReforcoDose | dataTesteSorologico | codigoBuscaAtivaAssintomatico | recebeuAntiviral | profissionalSeguranca | cbo | ... | testes | idade | resultadoTesteSorologicoTotais | codigoEstrategiaCovid | outrosSintomas | codigoQualAntiviral | estrangeiro | resultadoTesteSorologicoIgA | sexo | sintomas | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | NaN | Aracaju | NaN | NaN | NaN | NaN | NaN | Não | NaN | ... | [] | 18 | NaN | 1 | NaN | NaN | NaN | NaN | Feminino | Coriza, Dor de Cabeça, Febre, Dor de Garganta |
1 | 1 | NaN | Aracaju | NaN | JANSSEN | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... | 26 | NaN | 1 | NaN | NaN | NaN | NaN | Feminino | Tosse, Febre, Dor de Garganta |
2 | 2 | NaN | Nossa Senhora do Socorro | NaN | ASTRAZENECA/FIOCRUZ | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... | 31 | NaN | 1 | NaN | NaN | NaN | NaN | Masculino | Tosse, Febre, Dor de Garganta |
3 | 3 | NaN | Aracaju | NaN | PFIZER | NaN | NaN | NaN | Não | NaN | ... | [] | 39 | NaN | 1 | NaN | NaN | NaN | NaN | Masculino | Coriza, Dor de Cabeça, Tosse, Febre, Dor de Ga... |
4 | 4 | NaN | Aracaju | NaN | NaN | NaN | NaN | Não | Não | NaN | ... | [{'codigoEstadoTeste': '3', 'tipoTeste': 'TEST... | 32 | NaN | 1 | ALGIA OCULAR | NaN | NaN | NaN | Feminino | Dor de Cabeça, Tosse, Outros |
5 rows × 63 columns
[45]:
# Parse every column whose name starts with 'data' into one uniform,
# timezone-naive datetime64[ns] dtype.
#   * dayfirst=True     - pandas warns that values such as '15/03/2022' are in
#                         DD/MM/YYYY format; without this flag ambiguous values
#                         (e.g. 05/03/2022) are read month-first.
#   * utc=True          - gives every parsed column the same tz-aware dtype;
#                         in the recorded run some columns stayed `object`
#                         (presumably mixed offsets - TODO confirm).
#   * tz_localize(None) - recordlinkage's date comparison only accepts plain
#                         datetime64[ns], not datetime64[ns, UTC].
#   * errors='coerce'   - values that cannot be parsed become NaT.
for cname in df.columns:
    if cname.startswith('data'):
        df[cname] = (
            pd.to_datetime(df[cname], errors='coerce', dayfirst=True, utc=True)
            .dt.tz_localize(None)
        )
/home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages/pandas/core/tools/datetimes.py:1047: UserWarning: Parsing '15/03/2022' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
[46]:
# Summarise the column dtypes and non-null counts after the date conversion.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905099 entries, 0 to 905098
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 905099 non-null int64
1 outrasCondicoes 845124 non-null object
2 municipio 60015 non-null object
3 resultadoTesteSorologicoIgM 347694 non-null object
4 laboratorioSegundaReforcoDose 846885 non-null object
5 dataTesteSorologico 0 non-null datetime64[ns]
6 codigoBuscaAtivaAssintomatico 352115 non-null object
7 recebeuAntiviral 900105 non-null object
8 profissionalSeguranca 137182 non-null object
9 cbo 861037 non-null object
10 codigoRecebeuAntiviral 203302 non-null object
11 municipioNotificacaoIBGE 854567 non-null object
12 profissionalSaude 30981 non-null object
13 outroTriagemPopulacaoEspecifica 890233 non-null object
14 resultadoTeste 890667 non-null object
15 classificacaoFinal 893538 non-null object
16 estadoIBGE 393564 non-null object
17 estadoNotificacaoIBGE 278568 non-null object
18 dataInicioTratamento 5255 non-null object
19 @version 900294 non-null object
20 evolucaoCaso 875469 non-null object
21 @timestamp 581989 non-null object
22 municipioIBGE 23681 non-null object
23 dataEncerramento 854649 non-null datetime64[ns, UTC]
24 estado 24573 non-null object
25 resultadoTesteSorologicoIgG 880839 non-null object
26 dataReforcoDose 54581 non-null datetime64[ns, UTC]
27 tipoTesteSorologico 54288 non-null object
28 qualAntiviral 852065 non-null object
29 estadoNotificacao 354141 non-null object
30 loteSegundaReforcoDose 897885 non-null object
31 id 65786 non-null object
32 outroLocalRealizacaoTestagem 23429 non-null object
33 outroBuscaAtivaAssintomatico 857163 non-null object
34 dataSegundaReforcoDose 18455 non-null datetime64[ns, UTC]
35 codigoTriagemPopulacaoEspecifica 46742 non-null object
36 condicoes 31514 non-null object
37 codigoDosesVacina 890467 non-null object
38 idCollection 47349 non-null object
39 estadoTeste 73489 non-null object
40 codigoLocalRealizacaoTestagem 804537 non-null object
41 outroAntiviral 288180 non-null object
42 dataSegundaDose 4605 non-null object
43 codigoContemComunidadeTradicional 465724 non-null object
44 tipoTeste 350244 non-null object
45 racaCor 878427 non-null object
46 codigoRecebeuVacina 889016 non-null object
47 dataPrimeiraDose 5038 non-null datetime64[ns, UTC]
48 dataTeste 190916 non-null datetime64[ns, UTC]
49 registroAtual 17821 non-null object
50 dataNotificacao 15883 non-null datetime64[ns, UTC]
51 dataInicioSintomas 10223 non-null datetime64[ns, UTC]
52 municipioNotificacao 834928 non-null object
53 testes 513594 non-null object
54 idade 26270 non-null object
55 resultadoTesteSorologicoTotais 9712 non-null object
56 codigoEstrategiaCovid 184183 non-null object
57 outrosSintomas 13699 non-null object
58 codigoQualAntiviral 56849 non-null object
59 estrangeiro 886104 non-null object
60 resultadoTesteSorologicoIgA 891226 non-null object
61 sexo 531234 non-null object
62 sintomas 115487 non-null object
dtypes: datetime64[ns, UTC](7), datetime64[ns](1), int64(1), object(54)
memory usage: 435.0+ MB
Now we will create a datetime index for our dataframe, but we must be careful with missing dates here. For now, to enable a quick visualization, we will simply coerce missing dates to `NaT`.
[47]:
# Derive a 'datesint' column from 'dataInicioSintomas' (unparseable values
# become NaT) and promote it to the index. Rows with missing values are
# intentionally kept: no dropna() is applied at this stage.
df = df.assign(
    datesint=pd.to_datetime(df['dataInicioSintomas'], errors='coerce')
).set_index('datesint')
Now we can count the cases per day and plot.
[48]:
# Bucket the rows by calendar day of the 'datesint' index and count the
# non-null entries of every column within each day.
df_day = df.resample(rule='D').count()
df_day.head(n=5)
[48]:
Unnamed: 0 | outrasCondicoes | municipio | resultadoTesteSorologicoIgM | laboratorioSegundaReforcoDose | dataTesteSorologico | codigoBuscaAtivaAssintomatico | recebeuAntiviral | profissionalSeguranca | cbo | ... | testes | idade | resultadoTesteSorologicoTotais | codigoEstrategiaCovid | outrosSintomas | codigoQualAntiviral | estrangeiro | resultadoTesteSorologicoIgA | sexo | sintomas | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datesint | |||||||||||||||||||||
1970-01-01 00:00:00+00:00 | 33 | 21 | 33 | 0 | 0 | 0 | 31 | 33 | 33 | 8 | ... | 33 | 8 | 0 | 0 | 0 | 4 | 33 | 33 | 0 | 33 |
1970-01-02 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-03 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-04 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1970-01-05 00:00:00+00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 63 columns
[49]:
# Daily number of records with a non-null 'estadoIBGE' (count() skips nulls),
# indexed by 'datesint'. The title and axis labels let the figure stand on its
# own; the trailing ';' suppresses the Axes repr.
df_day['estadoIBGE'].plot(
    title='ESUS records per day (SE)',
    xlabel='datesint (from dataInicioSintomas)',
    ylabel='Records with non-null estadoIBGE',
);

Deduplicating the data¶
ESUS records are known to contain a number of duplicated records. Let's see how to detect possible duplicates in the dataframe we have just downloaded. For that we will need the [recordlinkage](https://recordlinkage.readthedocs.io/en/latest/index.html) package.
[50]:
!pip3 install recordlinkage
Requirement already satisfied: recordlinkage in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (0.15)
Requirement already satisfied: numpy>=1.13.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.23.2)
Requirement already satisfied: scikit-learn>=0.19.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.2)
Requirement already satisfied: pandas<2,>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.4.3)
Requirement already satisfied: scipy>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.10.1)
Requirement already satisfied: joblib in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.0)
Requirement already satisfied: jellyfish>=0.8.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (0.11.2)
Requirement already satisfied: pytz>=2020.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2022.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from scikit-learn>=0.19.0->recordlinkage) (3.1.0)
Requirement already satisfied: six>=1.5 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas<2,>=1->recordlinkage) (1.16.0)
[51]:
import recordlinkage
[52]:
# Move the 'datesint' index back into a regular column. The guard keeps the
# cell idempotent: re-running it must not insert extra 'index' columns.
if df.index.name == 'datesint':
    df.reset_index(inplace=True)

# recordlinkage's date comparison rejects anything but timezone-naive
# datetime64[ns] ("Left column is not of type datetime64[ns]"), so normalise
# to UTC first and then strip the timezone. Unparseable values become NaT.
df['dataInicioSintomas'] = (
    pd.to_datetime(df['dataInicioSintomas'], errors='coerce', utc=True)
    .dt.tz_localize(None)
)
[53]:
# Build candidate pairs by blocking on the two key columns below.
# (A sorted-neighbourhood index over the same columns is a possible
# alternative to plain blocking.)
blocking_keys = ['municipio', 'dataTeste']

dup_index = recordlinkage.Index()
dup_index.block(blocking_keys)

dup_candidates = dup_index.index(df)
len(dup_candidates)
[53]:
3154
[54]:
# Show the candidate pairs (a MultiIndex of row-label pairs) produced by the blocking step.
dup_candidates
[54]:
MultiIndex([( 3450, 1020),
( 4995, 1581),
(18524, 3101),
(57863, 12167),
(57897, 12167),
(57897, 57863),
(35208, 12291),
(56484, 12291),
(56484, 35208),
(74707, 12291),
...
(74583, 58130),
(74542, 58174),
(74610, 58174),
(74610, 74542),
(74630, 58174),
(74630, 74542),
(74630, 74610),
(58921, 58491),
(74258, 74188),
(74606, 74587)],
length=3154)
[55]:
# Comparison rules evaluated for every candidate pair:
#   - 'sintomas': Jaro-Winkler string similarity with a 0.85 threshold
#   - 'dataInicioSintomas': date comparison
compare_dups = recordlinkage.Compare()
compare_dups.string(
    'sintomas', 'sintomas',
    method='jarowinkler',
    threshold=0.85,
    label='sintomas',
)
compare_dups.date(
    'dataInicioSintomas', 'dataInicioSintomas',
    label='dataInicioSintomas',
)
[55]:
<Compare>
[56]:
# Evaluate the comparison rules on each candidate pair of rows from df.
dup_features = compare_dups.compute(pairs=dup_candidates, x=df)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[56], line 1
----> 1 dup_features = compare_dups.compute(dup_candidates,df)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:862, in BaseCompare.compute(self, pairs, x, x_link)
859 raise ValueError("expected pandas.DataFrame as third argument")
861 if self.n_jobs == 1:
--> 862 results = self._compute(pairs, x, x_link)
863 elif self.n_jobs > 1:
864 results = self._compute_parallel(pairs,
865 x,
866 x_link,
867 n_jobs=self.n_jobs)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:727, in BaseCompare._compute(self, pairs, x, x_link)
722 # else: subset columns and pass tuple of series
723 else:
724 data2 = tuple(
725 [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
--> 727 result = feat._compute(data1, data2)
728 features.append((result, feat.label))
730 features = self._union(features, pairs)
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:452, in BaseCompareFeature._compute(self, left_on, right_on)
430 def _compute(self, left_on, right_on):
431 """Compare the data on the left and right.
432
433 :meth:`BaseCompareFeature._compute` and
(...)
450 numpy.ndarray objects.
451 """
--> 452 result = self._compute_vectorized(*tuple(left_on + right_on))
454 return result
File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/compare.py:382, in Date._compute_vectorized(self, s_left, s_right)
378 def _compute_vectorized(self, s_left, s_right):
379
380 # validate datatypes
381 if str(s_left.dtype) != 'datetime64[ns]':
--> 382 raise ValueError('Left column is not of type datetime64[ns]')
384 if str(s_right.dtype) != 'datetime64[ns]':
385 raise ValueError('Right column is not of type datetime64[ns]')
ValueError: Left column is not of type datetime64[ns]
[ ]:
# Display the comparison features computed for each candidate pair.
dup_features
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 1
----> 1 dup_features
NameError: name 'dup_features' is not defined
[ ]:
# Total score per candidate pair (sum over the feature columns), then the
# number of pairs at each score, highest score first.
pair_scores = dup_features.sum(axis=1)
score_counts = pair_scores.value_counts()
score_counts.sort_index(ascending=False)
1.0 5910561
0.0 28495991
dtype: int64
[ ]:
# Keep only the candidate pairs whose summed feature score is at least 1.
# (A per-pair 'Score' column could additionally be derived by summing the
# feature columns 'sintomas' through 'dataInicioSintomas'.)
is_potential_dup = dup_features.sum(axis=1).ge(1)
potential_dups = dup_features.loc[is_potential_dup]
[ ]:
# Display the candidate pairs flagged as potential duplicates.
potential_dups
sintomas | ||
---|---|---|
9 | 5 | 1.0 |
12 | 3 | 1.0 |
15 | 3 | 1.0 |
12 | 1.0 | |
23 | 3 | 1.0 |
... | ... | ... |
380990 | 380953 | 1.0 |
381048 | 381019 | 1.0 |
381050 | 381019 | 1.0 |
381048 | 1.0 | |
381057 | 380939 | 1.0 |
5910561 rows × 1 columns
[ ]:
# NOTE(review): repeats the display of potential_dups from the previous cell - likely a leftover that can be removed.
potential_dups