[42]:
from pysus.online_data.ESUS import download
import pandas as pd

Downloading data from ESUS

This function allows downloading COVID-19 data from ESUS. For some states, the resulting table can easily exceed the memory available on most machines; in these cases, the ESUS download function will extract the data to disk without filling up memory and return an iterator over chunks of 1000 rows of data. The user must then iterate over the chunks to analyze the data.
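As a minimal, hypothetical sketch of that chunked workflow (assuming read() returns an iterable of DataFrame chunks for a large state; 'sp' below is only an illustrative choice):

from pysus.online_data.ESUS import download

# For a large state, read() is expected to yield DataFrame chunks of up to
# 1000 rows instead of a single DataFrame (see the note above).
chunks = download(uf='sp').read()

total_rows = 0
for chunk in chunks:
    # process each chunk without loading the whole table into memory
    total_rows += len(chunk)
print(total_rows)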

[43]:
df = download(uf='se').read()
2023-04-12 09:21:40.709 | INFO     | pysus.online_data.ESUS:download:34 - Local csv file found at /home/luabida/pysus/ESUS_temp_SE.csv.gz
/tmp/ipykernel_91880/506159845.py:1: DtypeWarning: Columns (2,4,5,8,9,10,11,12,15,18,19,21,22,23,24,26,27,29,30,31,32,34,35,36,37,40,41,42,44,45,46,48,49,50,51,54,55,56,57,58,59,60,61) have mixed types. Specify dtype option on import or set low_memory=False.
  df = download(uf='se').read()
[44]:
df.head()
[44]:
Unnamed: 0 outrasCondicoes municipio resultadoTesteSorologicoIgM laboratorioSegundaReforcoDose dataTesteSorologico codigoBuscaAtivaAssintomatico recebeuAntiviral profissionalSeguranca cbo ... testes idade resultadoTesteSorologicoTotais codigoEstrategiaCovid outrosSintomas codigoQualAntiviral estrangeiro resultadoTesteSorologicoIgA sexo sintomas
0 0 NaN Aracaju NaN NaN NaN NaN NaN Não NaN ... [] 18 NaN 1 NaN NaN NaN NaN Feminino Coriza, Dor de Cabeça, Febre, Dor de Garganta
1 1 NaN Aracaju NaN JANSSEN NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... 26 NaN 1 NaN NaN NaN NaN Feminino Tosse, Febre, Dor de Garganta
2 2 NaN Nossa Senhora do Socorro NaN ASTRAZENECA/FIOCRUZ NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... 31 NaN 1 NaN NaN NaN NaN Masculino Tosse, Febre, Dor de Garganta
3 3 NaN Aracaju NaN PFIZER NaN NaN NaN Não NaN ... [] 39 NaN 1 NaN NaN NaN NaN Masculino Coriza, Dor de Cabeça, Tosse, Febre, Dor de Ga...
4 4 NaN Aracaju NaN NaN NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'TEST... 32 NaN 1 ALGIA OCULAR NaN NaN NaN Feminino Dor de Cabeça, Tosse, Outros

5 rows × 63 columns

[45]:
for cname in df:
    if cname.startswith('data'):
        df[cname] = pd.to_datetime(df[cname], errors='coerce')
/home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages/pandas/core/tools/datetimes.py:1047: UserWarning: Parsing '15/03/2022' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
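The warning above is about ambiguous dates such as '15/03/2022'. A small variant of the loop, assuming the ESUS date columns follow the DD/MM/YYYY convention, passes dayfirst=True so pandas parses them consistently:

for cname in df:
    if cname.startswith('data'):
        # dayfirst=True makes pandas read '15/03/2022' as 15 March 2022
        df[cname] = pd.to_datetime(df[cname], errors='coerce', dayfirst=True)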
[46]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905099 entries, 0 to 905098
Data columns (total 63 columns):
 #   Column                             Non-Null Count   Dtype
---  ------                             --------------   -----
 0   Unnamed: 0                         905099 non-null  int64
 1   outrasCondicoes                    845124 non-null  object
 2   municipio                          60015 non-null   object
 3   resultadoTesteSorologicoIgM        347694 non-null  object
 4   laboratorioSegundaReforcoDose      846885 non-null  object
 5   dataTesteSorologico                0 non-null       datetime64[ns]
 6   codigoBuscaAtivaAssintomatico      352115 non-null  object
 7   recebeuAntiviral                   900105 non-null  object
 8   profissionalSeguranca              137182 non-null  object
 9   cbo                                861037 non-null  object
 10  codigoRecebeuAntiviral             203302 non-null  object
 11  municipioNotificacaoIBGE           854567 non-null  object
 12  profissionalSaude                  30981 non-null   object
 13  outroTriagemPopulacaoEspecifica    890233 non-null  object
 14  resultadoTeste                     890667 non-null  object
 15  classificacaoFinal                 893538 non-null  object
 16  estadoIBGE                         393564 non-null  object
 17  estadoNotificacaoIBGE              278568 non-null  object
 18  dataInicioTratamento               5255 non-null    object
 19  @version                           900294 non-null  object
 20  evolucaoCaso                       875469 non-null  object
 21  @timestamp                         581989 non-null  object
 22  municipioIBGE                      23681 non-null   object
 23  dataEncerramento                   854649 non-null  datetime64[ns, UTC]
 24  estado                             24573 non-null   object
 25  resultadoTesteSorologicoIgG        880839 non-null  object
 26  dataReforcoDose                    54581 non-null   datetime64[ns, UTC]
 27  tipoTesteSorologico                54288 non-null   object
 28  qualAntiviral                      852065 non-null  object
 29  estadoNotificacao                  354141 non-null  object
 30  loteSegundaReforcoDose             897885 non-null  object
 31  id                                 65786 non-null   object
 32  outroLocalRealizacaoTestagem       23429 non-null   object
 33  outroBuscaAtivaAssintomatico       857163 non-null  object
 34  dataSegundaReforcoDose             18455 non-null   datetime64[ns, UTC]
 35  codigoTriagemPopulacaoEspecifica   46742 non-null   object
 36  condicoes                          31514 non-null   object
 37  codigoDosesVacina                  890467 non-null  object
 38  idCollection                       47349 non-null   object
 39  estadoTeste                        73489 non-null   object
 40  codigoLocalRealizacaoTestagem      804537 non-null  object
 41  outroAntiviral                     288180 non-null  object
 42  dataSegundaDose                    4605 non-null    object
 43  codigoContemComunidadeTradicional  465724 non-null  object
 44  tipoTeste                          350244 non-null  object
 45  racaCor                            878427 non-null  object
 46  codigoRecebeuVacina                889016 non-null  object
 47  dataPrimeiraDose                   5038 non-null    datetime64[ns, UTC]
 48  dataTeste                          190916 non-null  datetime64[ns, UTC]
 49  registroAtual                      17821 non-null   object
 50  dataNotificacao                    15883 non-null   datetime64[ns, UTC]
 51  dataInicioSintomas                 10223 non-null   datetime64[ns, UTC]
 52  municipioNotificacao               834928 non-null  object
 53  testes                             513594 non-null  object
 54  idade                              26270 non-null   object
 55  resultadoTesteSorologicoTotais     9712 non-null    object
 56  codigoEstrategiaCovid              184183 non-null  object
 57  outrosSintomas                     13699 non-null   object
 58  codigoQualAntiviral                56849 non-null   object
 59  estrangeiro                        886104 non-null  object
 60  resultadoTesteSorologicoIgA        891226 non-null  object
 61  sexo                               531234 non-null  object
 62  sintomas                           115487 non-null  object
dtypes: datetime64[ns, UTC](7), datetime64[ns](1), int64(1), object(54)
memory usage: 435.0+ MB

Now let's create a datetime index for our dataframe, but we must be careful with the missing dates here. For now, to allow a quick visualization, we will simply coerce the missing dates to NaT.

[47]:
df['datesint'] = pd.to_datetime(df['dataInicioSintomas'],errors='coerce')
# df = df.dropna()
df.set_index('datesint', inplace=True);

Now we can count the daily cases and plot them.

[48]:
df_day = df.resample('D').count()
df_day.head()
[48]:
Unnamed: 0 outrasCondicoes municipio resultadoTesteSorologicoIgM laboratorioSegundaReforcoDose dataTesteSorologico codigoBuscaAtivaAssintomatico recebeuAntiviral profissionalSeguranca cbo ... testes idade resultadoTesteSorologicoTotais codigoEstrategiaCovid outrosSintomas codigoQualAntiviral estrangeiro resultadoTesteSorologicoIgA sexo sintomas
datesint
1970-01-01 00:00:00+00:00 33 21 33 0 0 0 31 33 33 8 ... 33 8 0 0 0 4 33 33 0 33
1970-01-02 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-03 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-04 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-05 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 63 columns
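Note the 1970-01-01 bucket in the first row: it comes from records whose dates were parsed as the Unix epoch. A simple, optional filter (the name df_day_recent is just illustrative) keeps only the pandemic period before plotting:

# drop implausible dates (epoch artifacts) by keeping 2020 onwards
df_day_recent = df_day.loc['2020-01-01':]
df_day_recent.estadoIBGE.plot();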

[49]:
df_day.estadoIBGE.plot();
[figure: daily counts plotted from df_day.estadoIBGE]

Removing duplicates

ESUS records are known to contain many duplicated entries. Let's see here how to detect possible duplicates in the dataframe we just downloaded. For this, we will need the `recordlinkage <https://recordlinkage.readthedocs.io/en/latest/index.html>`__ package.

[50]:
!pip3 install recordlinkage
Requirement already satisfied: recordlinkage in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (0.15)
Requirement already satisfied: numpy>=1.13.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.23.2)
Requirement already satisfied: scikit-learn>=0.19.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.2)
Requirement already satisfied: pandas<2,>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.4.3)
Requirement already satisfied: scipy>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.10.1)
Requirement already satisfied: joblib in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.0)
Requirement already satisfied: jellyfish>=0.8.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (0.11.2)
Requirement already satisfied: pytz>=2020.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2022.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from scikit-learn>=0.19.0->recordlinkage) (3.1.0)
Requirement already satisfied: six>=1.5 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas<2,>=1->recordlinkage) (1.16.0)
[51]:
import recordlinkage
[52]:
df.reset_index(inplace=True)
df['dataInicioSintomas'] = pd.to_datetime(df['dataInicioSintomas'], errors='coerce')
[53]:
dup_index = recordlinkage.Index()
dup_index.block(['municipio','dataTeste'])
# dup_index.sortedneighbourhood(['municipio','dataTeste'])
dup_candidates = dup_index.index(df)
len(dup_candidates)
[53]:
3154
[54]:
dup_candidates
[54]:
MultiIndex([( 3450,  1020),
            ( 4995,  1581),
            (18524,  3101),
            (57863, 12167),
            (57897, 12167),
            (57897, 57863),
            (35208, 12291),
            (56484, 12291),
            (56484, 35208),
            (74707, 12291),
            ...
            (74583, 58130),
            (74542, 58174),
            (74610, 58174),
            (74610, 74542),
            (74630, 58174),
            (74630, 74542),
            (74630, 74610),
            (58921, 58491),
            (74258, 74188),
            (74606, 74587)],
           length=3154)
[55]:
compare_dups = recordlinkage.Compare()
compare_dups.string('sintomas', 'sintomas', threshold=0.85, label='sintomas', method='jarowinkler')
compare_dups.date('dataInicioSintomas', 'dataInicioSintomas', label='dataInicioSintomas')
[55]:
<Compare>
[56]:
dup_features = compare_dups.compute(dup_candidates,df)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[56], line 1
----> 1 dup_features = compare_dups.compute(dup_candidates,df)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:862, in BaseCompare.compute(self, pairs, x, x_link)
    859     raise ValueError("expected pandas.DataFrame as third argument")
    861 if self.n_jobs == 1:
--> 862     results = self._compute(pairs, x, x_link)
    863 elif self.n_jobs > 1:
    864     results = self._compute_parallel(pairs,
    865                                      x,
    866                                      x_link,
    867                                      n_jobs=self.n_jobs)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:727, in BaseCompare._compute(self, pairs, x, x_link)
    722     # else: subset columns and pass tuple of series
    723     else:
    724         data2 = tuple(
    725             [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
--> 727     result = feat._compute(data1, data2)
    728     features.append((result, feat.label))
    730 features = self._union(features, pairs)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:452, in BaseCompareFeature._compute(self, left_on, right_on)
    430 def _compute(self, left_on, right_on):
    431     """Compare the data on the left and right.
    432
    433     :meth:`BaseCompareFeature._compute` and
   (...)
    450         numpy.ndarray objects.
    451     """
--> 452     result = self._compute_vectorized(*tuple(left_on + right_on))
    454     return result

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/compare.py:382, in Date._compute_vectorized(self, s_left, s_right)
    378 def _compute_vectorized(self, s_left, s_right):
    379
    380     # validate datatypes
    381     if str(s_left.dtype) != 'datetime64[ns]':
--> 382         raise ValueError('Left column is not of type datetime64[ns]')
    384     if str(s_right.dtype) != 'datetime64[ns]':
    385         raise ValueError('Right column is not of type datetime64[ns]')

ValueError: Left column is not of type datetime64[ns]
[ ]:
dup_features
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 dup_features

NameError: name 'dup_features' is not defined
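The ValueError happens because the Date comparator in recordlinkage requires timezone-naive datetime64[ns] columns, while dataInicioSintomas was parsed as timezone-aware (datetime64[ns, UTC], as df.info() showed), so dup_features is never created. A minimal sketch of one way around it, assuming we can simply drop the UTC timezone and recompute:

# make the column timezone-naive so its dtype is plain datetime64[ns],
# which is what recordlinkage's Date comparator expects
df['dataInicioSintomas'] = df['dataInicioSintomas'].dt.tz_localize(None)

# recompute the comparison features for the candidate pairs
dup_features = compare_dups.compute(dup_candidates, df)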
[ ]:
dup_features.sum(axis=1).value_counts().sort_index(ascending=False)
1.0     5910561
0.0    28495991
dtype: int64
[ ]:
potential_dups = dup_features[dup_features.sum(axis=1) >= 1]#.reset_index()
# potential_dups['Score'] = potential_dups.loc[:, 'sintomas':'dataInicioSintomas'].sum(axis=1)
[ ]:
potential_dups
sintomas
9 5 1.0
12 3 1.0
15 3 1.0
12 1.0
23 3 1.0
... ... ...
380990 380953 1.0
381048 381019 1.0
381050 381019 1.0
381048 1.0
381057 380939 1.0

5910561 rows × 1 columns

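Finally, a hypothetical sketch of how one could actually drop the detected duplicates: remove the second record of each candidate pair from the original dataframe (the two index levels of potential_dups are row labels of df; dup_rows and df_dedup are illustrative names):

# row labels that appear as the second element of a potential-duplicate pair
dup_rows = potential_dups.index.get_level_values(1).unique()

# keep the first occurrence of each pair and drop the other
df_dedup = df.drop(index=dup_rows, errors='ignore')
print(len(df), len(df_dedup))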