[42]:
from pysus.online_data.ESUS import download
import pandas as pd

Downloading data from ESUS

The download function allows downloading COVID-19 data from ESUS. For some states, the size of the resulting table can easily exceed the memory of most personal computers; in such cases, the ESUS download function streams the data to disk without filling up memory and returns an iterator of chunks of 1000 rows each. The user must then iterate over the chunks to analyze the data, as sketched below.
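
The exact chunked interface may vary between PySUS versions, so the snippet below is only an illustrative sketch: it assumes the object returned by download is iterable and yields DataFrame chunks of up to 1000 rows, which are aggregated incrementally so the full table never has to sit in memory (the state code 'sp' is just an example of a large state).

# Sketch only: assumes download() yields DataFrame chunks for large tables
chunks = download(uf='sp')
counts = None
for chunk in chunks:
    # count notifications per municipality in this chunk
    partial = chunk['municipio'].value_counts()
    counts = partial if counts is None else counts.add(partial, fill_value=0)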

[43]:
df = download(uf='se').read()
2023-04-12 09:21:40.709 | INFO     | pysus.online_data.ESUS:download:34 - Local csv file found at /home/luabida/pysus/ESUS_temp_SE.csv.gz
/tmp/ipykernel_91880/506159845.py:1: DtypeWarning: Columns (2,4,5,8,9,10,11,12,15,18,19,21,22,23,24,26,27,29,30,31,32,34,35,36,37,40,41,42,44,45,46,48,49,50,51,54,55,56,57,58,59,60,61) have mixed types. Specify dtype option on import or set low_memory=False.
  df = download(uf='se').read()
[44]:
df.head()
[44]:
Unnamed: 0 outrasCondicoes municipio resultadoTesteSorologicoIgM laboratorioSegundaReforcoDose dataTesteSorologico codigoBuscaAtivaAssintomatico recebeuAntiviral profissionalSeguranca cbo ... testes idade resultadoTesteSorologicoTotais codigoEstrategiaCovid outrosSintomas codigoQualAntiviral estrangeiro resultadoTesteSorologicoIgA sexo sintomas
0 0 NaN Aracaju NaN NaN NaN NaN NaN Não NaN ... [] 18 NaN 1 NaN NaN NaN NaN Feminino Coriza, Dor de Cabeça, Febre, Dor de Garganta
1 1 NaN Aracaju NaN JANSSEN NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... 26 NaN 1 NaN NaN NaN NaN Feminino Tosse, Febre, Dor de Garganta
2 2 NaN Nossa Senhora do Socorro NaN ASTRAZENECA/FIOCRUZ NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'RT-P... 31 NaN 1 NaN NaN NaN NaN Masculino Tosse, Febre, Dor de Garganta
3 3 NaN Aracaju NaN PFIZER NaN NaN NaN Não NaN ... [] 39 NaN 1 NaN NaN NaN NaN Masculino Coriza, Dor de Cabeça, Tosse, Febre, Dor de Ga...
4 4 NaN Aracaju NaN NaN NaN NaN Não Não NaN ... [{'codigoEstadoTeste': '3', 'tipoTeste': 'TEST... 32 NaN 1 ALGIA OCULAR NaN NaN NaN Feminino Dor de Cabeça, Tosse, Outros

5 rows × 63 columns

[45]:
for cname in df:
    if cname.startswith('data'):
        df[cname] = pd.to_datetime(df[cname], errors='coerce')
/home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages/pandas/core/tools/datetimes.py:1047: UserWarning: Parsing '15/03/2022' in DD/MM/YYYY format. Provide format or specify infer_datetime_format=True for consistent parsing.
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
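
The UserWarning above appears because pandas encounters dates written as DD/MM/YYYY while inferring the format element by element. If you know the date columns follow that layout, a hedged variant of the same loop can make the parsing explicit by passing dayfirst=True (or a full format string) to to_datetime:

# Variant of the loop above, assuming DD/MM/YYYY dates as hinted by the warning
for cname in df:
    if cname.startswith('data'):
        df[cname] = pd.to_datetime(df[cname], errors='coerce', dayfirst=True)
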
[46]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905099 entries, 0 to 905098
Data columns (total 63 columns):
 #   Column                             Non-Null Count   Dtype
---  ------                             --------------   -----
 0   Unnamed: 0                         905099 non-null  int64
 1   outrasCondicoes                    845124 non-null  object
 2   municipio                          60015 non-null   object
 3   resultadoTesteSorologicoIgM        347694 non-null  object
 4   laboratorioSegundaReforcoDose      846885 non-null  object
 5   dataTesteSorologico                0 non-null       datetime64[ns]
 6   codigoBuscaAtivaAssintomatico      352115 non-null  object
 7   recebeuAntiviral                   900105 non-null  object
 8   profissionalSeguranca              137182 non-null  object
 9   cbo                                861037 non-null  object
 10  codigoRecebeuAntiviral             203302 non-null  object
 11  municipioNotificacaoIBGE           854567 non-null  object
 12  profissionalSaude                  30981 non-null   object
 13  outroTriagemPopulacaoEspecifica    890233 non-null  object
 14  resultadoTeste                     890667 non-null  object
 15  classificacaoFinal                 893538 non-null  object
 16  estadoIBGE                         393564 non-null  object
 17  estadoNotificacaoIBGE              278568 non-null  object
 18  dataInicioTratamento               5255 non-null    object
 19  @version                           900294 non-null  object
 20  evolucaoCaso                       875469 non-null  object
 21  @timestamp                         581989 non-null  object
 22  municipioIBGE                      23681 non-null   object
 23  dataEncerramento                   854649 non-null  datetime64[ns, UTC]
 24  estado                             24573 non-null   object
 25  resultadoTesteSorologicoIgG        880839 non-null  object
 26  dataReforcoDose                    54581 non-null   datetime64[ns, UTC]
 27  tipoTesteSorologico                54288 non-null   object
 28  qualAntiviral                      852065 non-null  object
 29  estadoNotificacao                  354141 non-null  object
 30  loteSegundaReforcoDose             897885 non-null  object
 31  id                                 65786 non-null   object
 32  outroLocalRealizacaoTestagem       23429 non-null   object
 33  outroBuscaAtivaAssintomatico       857163 non-null  object
 34  dataSegundaReforcoDose             18455 non-null   datetime64[ns, UTC]
 35  codigoTriagemPopulacaoEspecifica   46742 non-null   object
 36  condicoes                          31514 non-null   object
 37  codigoDosesVacina                  890467 non-null  object
 38  idCollection                       47349 non-null   object
 39  estadoTeste                        73489 non-null   object
 40  codigoLocalRealizacaoTestagem      804537 non-null  object
 41  outroAntiviral                     288180 non-null  object
 42  dataSegundaDose                    4605 non-null    object
 43  codigoContemComunidadeTradicional  465724 non-null  object
 44  tipoTeste                          350244 non-null  object
 45  racaCor                            878427 non-null  object
 46  codigoRecebeuVacina                889016 non-null  object
 47  dataPrimeiraDose                   5038 non-null    datetime64[ns, UTC]
 48  dataTeste                          190916 non-null  datetime64[ns, UTC]
 49  registroAtual                      17821 non-null   object
 50  dataNotificacao                    15883 non-null   datetime64[ns, UTC]
 51  dataInicioSintomas                 10223 non-null   datetime64[ns, UTC]
 52  municipioNotificacao               834928 non-null  object
 53  testes                             513594 non-null  object
 54  idade                              26270 non-null   object
 55  resultadoTesteSorologicoTotais     9712 non-null    object
 56  codigoEstrategiaCovid              184183 non-null  object
 57  outrosSintomas                     13699 non-null   object
 58  codigoQualAntiviral                56849 non-null   object
 59  estrangeiro                        886104 non-null  object
 60  resultadoTesteSorologicoIgA        891226 non-null  object
 61  sexo                               531234 non-null  object
 62  sintomas                           115487 non-null  object
dtypes: datetime64[ns, UTC](7), datetime64[ns](1), int64(1), object(54)
memory usage: 435.0+ MB

Now we will create a datetime index for our dataframe, but we must be careful with missing dates here. For now, to enable a quick visualization, we will simply coerce missing dates to NaT.

[47]:
df['datesint'] = pd.to_datetime(df['dataInicioSintomas'],errors='coerce')
# df = df.dropna()
df.set_index('datesint', inplace=True);
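
Before resampling, it can be worth checking how many records actually lost their date in the coercion above. A quick sanity check using standard pandas calls could be:

# Count records whose symptom-onset date was coerced to NaT
n_missing = df.index.isna().sum()
print(f"{n_missing} of {len(df)} records have no valid dataInicioSintomas")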

Now we can count the cases per day and plot.

[48]:
df_day = df.resample('D').count()
df_day.head()
[48]:
Unnamed: 0 outrasCondicoes municipio resultadoTesteSorologicoIgM laboratorioSegundaReforcoDose dataTesteSorologico codigoBuscaAtivaAssintomatico recebeuAntiviral profissionalSeguranca cbo ... testes idade resultadoTesteSorologicoTotais codigoEstrategiaCovid outrosSintomas codigoQualAntiviral estrangeiro resultadoTesteSorologicoIgA sexo sintomas
datesint
1970-01-01 00:00:00+00:00 33 21 33 0 0 0 31 33 33 8 ... 33 8 0 0 0 4 33 33 0 33
1970-01-02 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-03 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-04 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1970-01-05 00:00:00+00:00 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 63 columns

[49]:
df_day.estadoIBGE.plot();
[figure: daily case counts — _images/ESUS_10_0.png]

Deduplicating the data

ESUS data is known to contain a number of duplicated records. Let's see how to detect possible duplicates in the dataframe we have just downloaded. For that we will need the `recordlinkage <https://recordlinkage.readthedocs.io/en/latest/index.html>`__ package.

[50]:
!pip3 install recordlinkage
Requirement already satisfied: recordlinkage in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (0.15)
Requirement already satisfied: numpy>=1.13.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.23.2)
Requirement already satisfied: scikit-learn>=0.19.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.2)
Requirement already satisfied: pandas<2,>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.4.3)
Requirement already satisfied: scipy>=1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.10.1)
Requirement already satisfied: joblib in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (1.2.0)
Requirement already satisfied: jellyfish>=0.8.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from recordlinkage) (0.11.2)
Requirement already satisfied: pytz>=2020.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2022.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from pandas<2,>=1->recordlinkage) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from scikit-learn>=0.19.0->recordlinkage) (3.1.0)
Requirement already satisfied: six>=1.5 in /home/luabida/micromamba/envs/pysus/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas<2,>=1->recordlinkage) (1.16.0)
[51]:
import recordlinkage
[52]:
df.reset_index(inplace=True)
df['dataInicioSintomas'] = pd.to_datetime(df['dataInicioSintomas'], errors='coerce')
[53]:
dup_index = recordlinkage.Index()
dup_index.block(['municipio','dataTeste'])
# dup_index.sortedneighbourhood(['municipio','dataTeste'])
dup_candidates = dup_index.index(df)
len(dup_candidates)
[53]:
3154
[54]:
dup_candidates
[54]:
MultiIndex([( 3450,  1020),
            ( 4995,  1581),
            (18524,  3101),
            (57863, 12167),
            (57897, 12167),
            (57897, 57863),
            (35208, 12291),
            (56484, 12291),
            (56484, 35208),
            (74707, 12291),
            ...
            (74583, 58130),
            (74542, 58174),
            (74610, 58174),
            (74610, 74542),
            (74630, 58174),
            (74630, 74542),
            (74630, 74610),
            (58921, 58491),
            (74258, 74188),
            (74606, 74587)],
           length=3154)
[55]:
compare_dups = recordlinkage.Compare()
compare_dups.string('sintomas', 'sintomas', threshold=0.85, label='sintomas', method='jarowinkler')
compare_dups.date('dataInicioSintomas', 'dataInicioSintomas', label='dataInicioSintomas')
[55]:
<Compare>
[56]:
dup_features = compare_dups.compute(dup_candidates,df)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[56], line 1
----> 1 dup_features = compare_dups.compute(dup_candidates,df)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:862, in BaseCompare.compute(self, pairs, x, x_link)
    859     raise ValueError("expected pandas.DataFrame as third argument")
    861 if self.n_jobs == 1:
--> 862     results = self._compute(pairs, x, x_link)
    863 elif self.n_jobs > 1:
    864     results = self._compute_parallel(pairs,
    865                                      x,
    866                                      x_link,
    867                                      n_jobs=self.n_jobs)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:727, in BaseCompare._compute(self, pairs, x, x_link)
    722     # else: subset columns and pass tuple of series
    723     else:
    724         data2 = tuple(
    725             [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])
--> 727     result = feat._compute(data1, data2)
    728     features.append((result, feat.label))
    730 features = self._union(features, pairs)

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/base.py:452, in BaseCompareFeature._compute(self, left_on, right_on)
    430 def _compute(self, left_on, right_on):
    431     """Compare the data on the left and right.
    432
    433     :meth:`BaseCompareFeature._compute` and
   (...)
    450         numpy.ndarray objects.
    451     """
--> 452     result = self._compute_vectorized(*tuple(left_on + right_on))
    454     return result

File ~/micromamba/envs/pysus/lib/python3.9/site-packages/recordlinkage/compare.py:382, in Date._compute_vectorized(self, s_left, s_right)
    378 def _compute_vectorized(self, s_left, s_right):
    379
    380     # validate datatypes
    381     if str(s_left.dtype) != 'datetime64[ns]':
--> 382         raise ValueError('Left column is not of type datetime64[ns]')
    384     if str(s_right.dtype) != 'datetime64[ns]':
    385         raise ValueError('Right column is not of type datetime64[ns]')

ValueError: Left column is not of type datetime64[ns]
[ ]:
dup_features
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 dup_features

NameError: name 'dup_features' is not defined
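
The Date comparison in recordlinkage requires timezone-naive datetime64[ns] columns, while dataInicioSintomas was parsed as timezone-aware (datetime64[ns, UTC]); that is what raised the ValueError above and left dup_features undefined. A minimal fix, assuming the timezone carries no useful information for the comparison, is to strip it and recompute:

# Drop the UTC timezone so the column becomes plain datetime64[ns],
# which is what the Date comparison expects
df['dataInicioSintomas'] = df['dataInicioSintomas'].dt.tz_localize(None)
dup_features = compare_dups.compute(dup_candidates, df)
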
[ ]:
dup_features.sum(axis=1).value_counts().sort_index(ascending=False)
1.0     5910561
0.0    28495991
dtype: int64
[ ]:
potential_dups = dup_features[dup_features.sum(axis=1) >= 1]#.reset_index()
# potential_dups['Score'] = potential_dups.loc[:, 'sintomas':'dataInicioSintomas'].sum(axis=1)
[ ]:
potential_dups
                 sintomas
9      5              1.0
12     3              1.0
15     3              1.0
       12             1.0
23     3              1.0
...    ...            ...
380990 380953         1.0
381048 381019         1.0
381050 381019         1.0
       381048         1.0
381057 380939         1.0

5910561 rows × 1 columns
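
With the candidate pairs scored, one simple, illustrative way to finish the deduplication is to drop one record of each matching pair; the sketch below keeps the first member of every pair and removes the second. Whether this is appropriate depends on which record you consider authoritative.

# Keep the first record of each candidate pair and drop the second.
# The MultiIndex levels hold the row labels of the two records in df.
dups_to_drop = potential_dups.index.get_level_values(1).unique()
df_dedup = df.drop(index=dups_to_drop)
print(f"Removed {len(df) - len(df_dedup)} potential duplicate records")

Depending on the analysis, one may prefer to inspect the matched pairs manually before dropping anything.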
