"""High-level convenience functions for fetching Brazilian health data.
Each dataset function wraps an asynchronous query/download pipeline and
returns the downloaded Parquet file paths, or a pandas DataFrame when
``as_dataframe=True`` is passed. The available datasets cover disease
notification (SINAN),
vital statistics (SINASC, SIM), hospital admissions (SIH), ambulatory care
(SIA), immunisation (PNI), census data (IBGE), health facilities (CNES),
and hospitalisation records (CIHA).
"""
import asyncio
from typing import Literal, cast
import pandas as pd
from pysus.api import types
from pysus.api.client import PySUS
from tqdm.asyncio import tqdm
# Public API: one loader per dataset, followed by the catalog listing helper.
__all__ = [
    "sinan",
    "sinasc",
    "sim",
    "sih",
    "sia",
    "pni",
    "ibge",
    "cnes",
    "ciha",
    "list_files",
]
def _fetch_data(
    dataset: str,
    group: str | None = None,
    state: str | None = None,
    year: int | list[int] | None = None,
    month: int | list[int] | None = None,
    show_progress: bool = True,
    as_dataframe: bool = False,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Query the catalog for a dataset and download every matching file.

    The work is done by an inner coroutine that opens a :class:`PySUS`
    client, queries it with the given filters, and downloads the matches
    with at most three downloads in flight at a time. The coroutine is run
    with :func:`asyncio.run`, unless an event loop is already running in the
    current thread, in which case ``nest_asyncio`` is applied and the
    running loop is reused.

    Parameters
    ----------
    dataset : str
        Name of the dataset (e.g. "sinan", "sinasc").
    group : str, optional
        Group or disease code to filter by.
    state : str, optional
        Two-letter state abbreviation (e.g. "RJ").
    year : int | list[int], optional
        Year or list of years to fetch.
    month : int | list[int], optional
        Month or list of months to fetch.
    show_progress : bool, optional
        When True (the default) downloads are gathered through
        ``tqdm.gather`` so a progress bar is displayed.
    as_dataframe : bool, optional
        When True the downloaded files are read back and returned as a
        pandas DataFrame instead of a list of paths.
    **kwargs
        Forwarded to :meth:`PySUS.read_parquet`; only used when
        ``as_dataframe`` is True.

    Returns
    -------
    list[str] | pd.DataFrame
        The local paths of the downloaded files, or a DataFrame when
        ``as_dataframe`` is True. If the query matches nothing, an empty
        list (or an empty DataFrame) is returned.

    Raises
    ------
    RuntimeError
        If an event loop is already running and ``nest_asyncio`` cannot be
        imported.
    """

    async def _download_all() -> list[str] | pd.DataFrame:
        async with PySUS() as client:
            matches = await client.query(
                dataset=dataset,
                group=group,
                state=state,
                year=year,
                month=month,
            )
            if not matches:
                # Nothing to download: hand back an empty value of the
                # shape the caller asked for.
                if as_dataframe:
                    return pd.DataFrame()
                return cast(list[str], [])

            # Cap the number of simultaneous downloads at three.
            limiter = asyncio.Semaphore(3)

            async def _limited(remote):
                async with limiter:
                    return await client.download(remote)

            pending = [_limited(remote) for remote in matches]
            if show_progress:
                fetched = await tqdm.gather(
                    *pending,
                    desc=f"Downloading {dataset}",
                    unit="file",
                )
            else:
                fetched = await asyncio.gather(*pending)

            local_paths: list[str] = [str(item.path) for item in fetched]
            if not as_dataframe:
                return local_paths
            frame = client.read_parquet(local_paths, **kwargs).df()
            return cast(pd.DataFrame, frame)

    try:
        running = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread.
        running = None

    if not (running and running.is_running()):
        outcome = asyncio.run(_download_all())
    else:
        # Already inside a running loop (e.g. a notebook): asyncio.run
        # cannot be used here, so make the loop re-entrant instead.
        try:
            import nest_asyncio

            nest_asyncio.apply()
        except ImportError:
            msg = (
                "nest_asyncio is required when running inside Jupyter. "
                "Install it with: pip install nest_asyncio"
            )
            raise RuntimeError(msg) from None
        outcome = running.run_until_complete(_download_all())
    return cast(list[str] | pd.DataFrame, outcome)
[docs]
def sinan(
    disease: Literal[
        "ACBI",
        "ACGR",
        "ANIM",
        "ANTR",
        "BOTU",
        "CANC",
        "CHAG",
        "CHIK",
        "COLE",
        "COQU",
        "DENG",
        "DERM",
        "DIFT",
        "ESQU",
        "EXAN",
        "FMAC",
        "FTIF",
        "HANS",
        "HANT",
        "HEPA",
        "IEXO",
        "INFL",
        "LEIV",
        "LEPT",
        "LERD",
        "LTAN",
        "MALA",
        "MENI",
        "MENT",
        "NTRA",
        "PAIR",
        "PEST",
        "PFAN",
        "PNEU",
        "RAIV",
        "SDTA",
        "SIFA",
        "SIFC",
        "SIFG",
        "SRC",
        "TETA",
        "TETN",
        "TOXC",
        "TOXG",
        "TRAC",
        "TUBE",
        "VARC",
        "VIOL",
        "ZIKA",
    ],
    year: int | list[int],
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download SINAN notifiable-disease files for one disease.

    SINAN (Sistema de Informação de Agravos de Notificação) is the
    Brazilian notifiable-disease information system.

    Parameters
    ----------
    disease : Literal
        Disease code (e.g. "DENG" for dengue, "ZIKA" for zika). The value
        is upper-cased before being used as the query group.
    year : int | list[int]
        A single year or a list of years to fetch.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    disease_code = disease.upper()
    return _fetch_data(
        dataset="sinan",
        group=disease_code,
        year=year,
        **kwargs,
    )
[docs]
def sinasc(
    state: types.State,
    year: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download SINASC live-birth files for a state and year(s).

    SINASC (Sistema de Informação sobre Nascidos Vivos) is the Brazilian
    live birth information system.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="sinasc",
        state=state_code,
        group=group,
        year=year,
        **kwargs,
    )
[docs]
def sim(
    state: types.State,
    year: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download SIM mortality files for a state and year(s).

    SIM (Sistema de Informação sobre Mortalidade) is the Brazilian
    mortality information system.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="sim",
        state=state_code,
        group=group,
        year=year,
        **kwargs,
    )
[docs]
def sih(
    state: types.State,
    year: int | list[int],
    month: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download SIH hospital-admission files for a state and period.

    SIH (Sistema de Informação Hospitalar) is the Brazilian hospital
    admission information system.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    month : int | list[int]
        A single month or a list of months to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="sih",
        state=state_code,
        group=group,
        year=year,
        month=month,
        **kwargs,
    )
[docs]
def sia(
    state: types.State,
    year: int | list[int],
    month: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download SIA ambulatory-care files for a state and period.

    SIA (Sistema de Informação Ambulatorial) is the Brazilian ambulatory
    care information system.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    month : int | list[int]
        A single month or a list of months to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="sia",
        state=state_code,
        group=group,
        year=year,
        month=month,
        **kwargs,
    )
[docs]
def pni(
    state: types.State,
    year: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download PNI immunisation files for a state and year(s).

    PNI (Programa Nacional de Imunizações) is the Brazilian national
    immunisation programme.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="pni",
        state=state_code,
        group=group,
        year=year,
        **kwargs,
    )
[docs]
def ibge(
    year: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download IBGE census files for the given year(s).

    IBGE (Instituto Brasileiro de Geografia e Estatística) provides census
    and demographic data.

    Parameters
    ----------
    year : int | list[int]
        A single year or a list of years to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    return _fetch_data(
        dataset="ibge",
        group=group,
        year=year,
        **kwargs,
    )
[docs]
def cnes(
    state: types.State,
    year: int | list[int],
    month: int | list[int],
    group: str | None = None,
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download CNES health-facility files for a state and period.

    CNES (Cadastro Nacional de Estabelecimentos de Saúde) is the Brazilian
    registry of health-care facilities.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    month : int | list[int]
        A single month or a list of months to fetch.
    group : str, optional
        Additional grouping code.
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="cnes",
        state=state_code,
        group=group,
        year=year,
        month=month,
        **kwargs,
    )
[docs]
def ciha(
    state: types.State,
    year: int | list[int],
    month: int | list[int],
    group: str | None = "CIHA",
    **kwargs,
) -> list[str] | pd.DataFrame:
    """Download CIHA hospitalisation files for a state and period.

    CIHA (Comunicação de Internação Hospitalar) provides hospitalisation
    records.

    Parameters
    ----------
    state : types.State
        Two-letter state abbreviation (e.g. "RJ"); upper-cased before the
        query is made.
    year : int | list[int]
        A single year or a list of years to fetch.
    month : int | list[int]
        A single month or a list of months to fetch.
    group : str, optional
        Additional grouping code. Default is "CIHA".
    **kwargs
        Extra keyword arguments passed on to :func:`_fetch_data`.

    Returns
    -------
    list[str] | pd.DataFrame
        The value returned by :func:`_fetch_data`: the downloaded Parquet
        paths, or a DataFrame when ``as_dataframe=True`` is passed.
    """
    state_code = state.upper()
    return _fetch_data(
        dataset="ciha",
        state=state_code,
        group=group,
        year=year,
        month=month,
        **kwargs,
    )
[docs]
def list_files(
    dataset: types.DatasetName,
    client: types.Origin | None = None,
    group: str | None = None,
    state: str | None = None,
    year: int | list[int] | None = None,
    month: int | list[int] | None = None,
    **kwargs,
) -> pd.DataFrame:
    """List catalog files filtered by client, group, state, year, and month.

    Queries the PySUS API once per (year, month) combination and returns the
    matching file records as a DataFrame. Nothing is downloaded.

    Parameters
    ----------
    dataset : Literal
        Dataset name (e.g. "SINAN", "SINASC", etc.).
    client : Origin, optional
        Data source client to query.
    group : str, optional
        Group or disease code to filter by.
    state : str, optional
        Two-letter state abbreviation (e.g. "RJ").
    year : int | list[int], optional
        Year or list of years to filter by. ``None`` or an empty list
        means no year filter.
    month : int | list[int], optional
        Month or list of months to filter by. ``None`` or an empty list
        means no month filter.
    **kwargs
        Additional arguments forwarded to :meth:`PySUS.query`.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns name, path, dataset, group, year, month,
        state, and modify. The columns are present even when no file
        matches.

    Raises
    ------
    RuntimeError
        If an event loop is already running and ``nest_asyncio`` cannot be
        imported.
    """
    columns = [
        "name",
        "path",
        "dataset",
        "group",
        "year",
        "month",
        "state",
        "modify",
    ]

    async def _list() -> list[dict]:
        async with PySUS() as pysus:
            # Normalise scalar / missing filters to lists so the query can
            # be issued once per (year, month) pair; [None] means "no
            # filter" for that dimension.
            years = [year] if isinstance(year, int) else (year or [None])
            months = [month] if isinstance(month, int) else (month or [None])
            records = []
            for y in years:
                for m in months:
                    records.extend(
                        await pysus.query(
                            client=client,
                            dataset=dataset,
                            group=group,
                            state=state,
                            year=y,
                            month=m,
                            # Previously accepted but silently dropped,
                            # contrary to the documented contract.
                            **kwargs,
                        )
                    )
            # Build the rows while the client is still open, as the
            # original code did.
            return [
                {
                    "name": str(r.path).split("/")[-1],
                    "path": str(r.path),
                    "dataset": r.dataset.name if r.dataset else None,
                    "group": r.group.name if r.group else None,
                    "year": r.record.year,
                    "month": r.record.month,
                    "state": r.record.state,
                    "modify": r.record.origin_modified,
                }
                for r in records
            ]

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread.
        loop = None

    if loop is not None and loop.is_running():
        # asyncio.run() refuses to start inside a running loop (e.g. in a
        # notebook); mirror _fetch_data and make the loop re-entrant.
        try:
            import nest_asyncio

            nest_asyncio.apply()
        except ImportError:
            msg = (
                "nest_asyncio is required when running inside Jupyter. "
                "Install it with: pip install nest_asyncio"
            )
            raise RuntimeError(msg) from None
        rows = loop.run_until_complete(_list())
    else:
        rows = asyncio.run(_list())

    # Passing the columns explicitly keeps the documented schema even when
    # the query returns no records.
    return pd.DataFrame(rows, columns=columns)