Source code for pysus.api.dadosgov.databases

"""Pre-configured health database definitions accessible via dados.gov.br."""

import re
from typing import Any

from pysus.utils import zfill_year

from .models import Dataset

# Lower-case three-letter month abbreviations (as they appear in the file
# names parsed below) mapped to their 1-based month number.
MONTHS: dict[str, int] = {
    abbreviation: number
    for number, abbreviation in enumerate(
        (
            "jan",
            "fev",
            "mar",
            "abr",
            "mai",
            "jun",
            "jul",
            "ago",
            "set",
            "out",
            "nov",
            "dez",
        ),
        start=1,
    )
}


def _parse_year(val: str) -> int | None:
    """Parse a year string into an integer within the valid range."""
    try:
        y = int(val)
        return y if 1970 <= y <= 2100 else None
    except ValueError:
        return None


def _skip(name: str) -> bool:
    """Check whether a filename should be skipped by naming conventions.

    A file is skipped when its name starts with ``get_`` or ends with
    ``.pdf``. Both tests ignore letter case, so the result is the same
    whether the caller passes the raw name or a lower-/upper-cased copy.

    Parameters
    ----------
    name : str
        The file name to inspect.

    Returns
    -------
    bool
        ``True`` when the file should be ignored, ``False`` otherwise.
    """
    # Normalise once so the prefix and the suffix rule share the same
    # case-insensitive view of the name.
    lowered = name.lower()
    return lowered.startswith("get_") or lowered.endswith(".pdf")


class CNES(Dataset):
    """National Registry of Health Establishments (CNES).

    CNES stands for *Cadastro Nacional de Estabelecimentos de Saúde*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "40a0d093-b12f-44a4-bdc7-bae8eb54dd04",
        "9455b341-b06e-408e-8e10-54b32b3d74ec",
    ]

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"CNES"``.
        """
        return "CNES"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Cadastro Nacional de Estabelecimentos de Saúde"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the CNES information system.
        """
        return (
            "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o "
            "sistema de informação oficial de cadastramento de informações "
            "de todos os estabelecimentos de saúde no país."
        )

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a CNES filename and extract metadata.

        Recognised names end in ``_MM-YYYY.csv`` (case-sensitive).

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``state`` is always ``None``. ``year`` is ``None`` when outside
            the range accepted by ``_parse_year`` and ``month`` is ``None``
            when outside 1-12. Unrecognised or skipped files return ``None``
            for all keys.
        """
        info: dict[str, Any] = {"state": None, "year": None, "month": None}

        name = filename.strip()
        if _skip(name):
            return info

        match = re.search(r"_(\d{2})-(\d{4})\.csv$", name)
        if match is None:
            return info

        # ``\d{2}`` also accepts 00 and 13-99; reject those the same way
        # out-of-range years are rejected instead of reporting a bogus month.
        month = int(match.group(1))
        info["year"] = _parse_year(match.group(2))
        info["month"] = month if 1 <= month <= 12 else None
        return info
class PNI(Dataset):
    """National Immunisation Programme (PNI).

    PNI stands for *Programa Nacional de Imunizações*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "2989d396-cb09-47e7-a3b8-a4b951ca0200",
        "543aa08a-46c4-44e8-802e-198daa30753d",
        "04292d08-ee4f-463a-b7b5-76cfb76775b3",
        "7ed6eecc-c254-475c-92c5-daba5727596b",
        "783b7456-6a6c-4025-a8bd-8e9caa0fb962",
        "c6c3c6f3-2026-48a2-84ac-d8039714a0ba",
        "9a25b796-80e3-444a-a4e7-405f5596d8ab",
    ]

    # Common stem shared by every alias key below.
    _PNI_PREFIX = "doses-aplicadas-pelo-programa-de-nacional-de-imunizacoes-pni"

    # Every listed key maps to the single code "DPNI"; presumably the keys
    # are dataset slugs and the value a short group code (confirm in Dataset).
    group_aliases: dict[str, str] = {
        _PNI_PREFIX: "DPNI",
        f"{_PNI_PREFIX}-2020": "DPNI",
        f"{_PNI_PREFIX}-2021": "DPNI",
        f"dataset-{_PNI_PREFIX}_2022": "DPNI",
        f"{_PNI_PREFIX}-2023": "DPNI",
        f"{_PNI_PREFIX}-2025": "DPNI",
        f"{_PNI_PREFIX}-2026": "DPNI",
    }

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"PNI"``.
        """
        return "PNI"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Programa Nacional de Imunizações"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the PNI vaccination monitoring
            system.
        """
        return "O PNI monitora a cobertura vacinal e doses aplicadas no Brasil."

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a PNI vaccination filename into month and year.

        The name is stripped and lower-cased, then matched against
        ``vacinacao_<mmm>_<yyyy>_csv.zip`` where ``<mmm>`` is looked up in
        ``MONTHS``.

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``state`` is always ``None``; ``month`` is ``None`` for an
            unknown abbreviation. Unrecognised or skipped files return
            ``None`` for all keys.
        """
        try:
            lowered = filename.strip().lower()
            if _skip(lowered):
                return {"state": None, "year": None, "month": None}

            found = re.match(r"vacinacao_(\w{3})_(\d{4})_csv\.zip", lowered)
            if found is None:
                return {"state": None, "year": None, "month": None}

            abbreviation, year_text = found.group(1), found.group(2)
            month_number = MONTHS.get(abbreviation)
            year_number = _parse_year(year_text)
            return {
                "state": None,
                "year": year_number,
                "month": month_number,
            }
        except (IndexError, ValueError):
            return {"state": None, "year": None, "month": None}
class SIA(Dataset):
    """Outpatient Information System (SIA).

    SIA stands for *Sistema de Informações Ambulatoriais*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "9a335cb7-2b4f-4fce-8947-e8441b4a90af",
    ]

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"SIA"``.
        """
        return "SIA"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Sistema de Informações Ambulatoriais"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the SIA outpatient information
            system, without surrounding whitespace.
        """
        # A plain literal keeps source indentation and line breaks out of
        # the returned text.
        return (
            "O SIA acompanha as ações de saúde produzidas no âmbito "
            "ambulatorial."
        )

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse an SIA filename into year.

        The name is stripped and lower-cased; recognised names end in
        ``_<yyyy>_.csv``.

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``state`` and ``month`` are always ``None``. Unrecognised or
            skipped files return ``None`` for all keys.
        """
        info: dict[str, Any] = {"state": None, "year": None, "month": None}

        name = filename.strip().lower()
        if _skip(name):
            return info

        # Names of the form ``..._<mmm>-out_<yyyy>_.csv`` also end in
        # ``_<yyyy>_.csv`` and only the year is extracted from them, so this
        # single pattern covers both layouts.
        match = re.search(r"_(\d{4})_\.csv$", name)
        if match is not None:
            info["year"] = _parse_year(match.group(1))
        return info
class SINAN(Dataset):
    """Notifiable Diseases Information System (SINAN).

    SINAN stands for *Sistema de Informação de Agravos de Notificação*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "4d5e5d44-58a8-4d67-b8aa-4ef1e4b00a1c",
        "5699abe0-0510-4da8-b47d-209b3bb32b34",
        "4557ba96-7d52-4a56-bd6f-f99a5af09f77",
        "740ce8f4-7a5d-4351-aad4-7623f2490ada",
        "cf044c1b-b966-4d0e-bab0-f3aa65897b7d",
        "2d4997fb-cd11-4ce2-b217-09cd50e3151f",
        "8a585222-4c2e-43b7-807d-59355ee79c48",
        "527e8665-de64-4f81-b7c3-40b59c7d1d3c",
    ]

    # Presumably dataset slug -> four-letter group code; the exact use is
    # defined by ``Dataset`` (confirm).
    group_aliases: dict[str, str] = {
        "arboviroses-dengue": "DENG",
        "arboviroses-febre-de-chikungunya": "CHIK",
        "arboviroses-zika-virus": "ZIKA",
        "hanseniase": "HANS",
        "dados-tuberculose": "TUBE",
        "sifilis": "SIFA",
    }

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"SINAN"``.
        """
        return "SINAN"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Sistema de Informação de Agravos de Notificação"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the SINAN notifiable diseases
            system, without surrounding whitespace.
        """
        # A plain literal keeps source indentation and line breaks out of
        # the returned text.
        return (
            "O SINAN é alimentado pela notificação de doenças de "
            "notificação compulsória"
        )

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a SINAN filename into state and year.

        Two layouts are recognised after upper-casing the stripped name:
        ``<4 chars>BR<yy>.CSV.ZIP`` and ``MPX_<yyyy>_OPENDATASUS.CSV.ZIP``.

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``month`` is always ``None``; ``state`` is ``"BR"`` for the
            first layout and ``None`` for the second. Unrecognised or
            skipped files return ``None`` for all keys.
        """
        info: dict[str, Any] = {"state": None, "year": None, "month": None}

        stripped = filename.strip()
        # The skip rules look for a lower-case ``get_`` prefix, so they must
        # see a lower-cased name; testing the upper-cased copy would never
        # match that prefix.
        if _skip(stripped.lower()):
            return info
        name = stripped.upper()

        match = re.match(r"(\w{4})(BR)(\d{2})\.CSV\.ZIP", name)
        if match is not None:
            try:
                # ``zfill_year`` comes from ``pysus.utils``; presumably it
                # expands the two-digit year (confirm there).
                year = zfill_year(match.group(3))
            except (IndexError, ValueError):
                return info
            info["state"] = match.group(2)
            info["year"] = year
            return info

        match = re.match(r"MPX_(\d{4})_OPENDATASUS\.CSV\.ZIP", name)
        if match is not None:
            info["year"] = _parse_year(match.group(1))
        return info
class SIM(Dataset):
    """Mortality Information System (SIM).

    SIM stands for *Sistema de Informação sobre Mortalidade*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "5f121f4d-47c6-428e-8ec6-e8ec56417172",
    ]

    # Presumably dataset slug -> short group code; the exact use is defined
    # by ``Dataset`` (confirm).
    group_aliases: dict[str, str] = {
        "sim-1979-2019": "DO",
    }

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"SIM"``.
        """
        return "SIM"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Sistema de Informação sobre Mortalidade"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the SIM mortality information
            system, without surrounding whitespace.
        """
        # A plain literal keeps source indentation and line breaks out of
        # the returned text.
        return (
            "O SIM coleta dados sobre óbitos no país para análise "
            "epidemiológica."
        )

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a SIM filename into year.

        Two case-sensitive layouts are recognised: names containing
        ``Mortalidade_Geral_<yyyy>_csv.zip`` and names starting with
        ``DO<yy>OPEN``.

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``state`` and ``month`` are always ``None``. Unrecognised or
            skipped files return ``None`` for all keys.
        """
        info: dict[str, Any] = {"state": None, "year": None, "month": None}

        name = filename.strip()
        if _skip(name):
            return info

        match = re.search(r"Mortalidade_Geral_(\d{4})_csv\.zip", name)
        if match is not None:
            info["year"] = _parse_year(match.group(1))
            return info

        match = re.match(r"DO(\d{2})OPEN", name)
        if match is not None:
            try:
                # ``zfill_year`` comes from ``pysus.utils``; presumably it
                # expands the two-digit year (confirm there).
                info["year"] = zfill_year(match.group(1))
            except (IndexError, ValueError):
                # Conversion failed: report the file as unrecognised.
                return info
        return info
class SINASC(Dataset):
    """Live Births Information System (SINASC).

    SINASC stands for *Sistema de Informações sobre Nascidos Vivos*.
    """

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "441cc6bd-684a-4afd-a88b-ba4734c9e83e",
    ]

    # Presumably dataset slug -> short group code; the exact use is defined
    # by ``Dataset`` (confirm).
    group_aliases: dict[str, str] = {
        "sistema-de-informacao-sobre-nascidos-vivos-sinasc-1996-a-20201": "DN",
    }

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"SINASC"``.
        """
        return "SINASC"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Sistema de Informações sobre Nascidos Vivos"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            Portuguese description of the SINASC live birth system, as a
            single line without surrounding whitespace.
        """
        # Adjacent literals are joined at compile time, so the sentence is
        # returned on one line with no embedded indentation.
        return (
            "O SINASC fornece subsídios para o diagnóstico de saúde e "
            "planejamento de políticas de natalidade."
        )

    def formatter(self, filename: str) -> dict[str, Any]:
        """Parse a SINASC filename into year.

        Two case-sensitive layouts are recognised: names containing
        ``SINASC_<yyyy>_csv.zip`` and names containing
        ``DNBR<yyyy>_csv.zip``.

        Parameters
        ----------
        filename : str
            The name of the file to parse.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``.
            ``month`` is always ``None``; ``state`` is ``"BR"`` only for the
            ``DNBR`` layout. Unrecognised or skipped files return ``None``
            for all keys.
        """
        info: dict[str, Any] = {"state": None, "year": None, "month": None}

        name = filename.strip()
        if _skip(name):
            return info

        match = re.search(r"SINASC_(\d{4})_csv\.zip", name)
        if match is not None:
            info["year"] = _parse_year(match.group(1))
            return info

        match = re.search(r"DNBR(\d{4})_csv\.zip", name)
        if match is not None:
            info["state"] = "BR"
            info["year"] = _parse_year(match.group(1))
        return info
class COVID19(Dataset):
    """Confirmed COVID-19 cases (*Casos Confirmados de COVID-19*)."""

    # Presumably the dados.gov.br dataset identifiers grouped under this
    # database; how they are consumed is defined by ``Dataset`` (confirm).
    ids: list[str] = [
        "1ba1801e-aec0-4dba-ae2a-7732f0a0c9f7",
    ]

    @property
    def name(self) -> str:
        """Return the short name.

        Returns
        -------
        str
            The abbreviated dataset name ``"COVID19"``.
        """
        return "COVID19"

    @property
    def long_name(self) -> str:
        """Return the human-readable name.

        Returns
        -------
        str
            The full Portuguese name of the dataset.
        """
        return "Casos Confirmados de COVID-19"

    @property
    def description(self) -> str:
        """Return a description of the dataset.

        Returns
        -------
        str
            A Portuguese description of the COVID-19 confirmed cases
            dataset.
        """
        return "Dados anonimizados de casos confirmados de COVID-19."

    def formatter(self, filename: str) -> dict[str, Any]:
        """Return the metadata record for a COVID-19 filename.

        No state, year, or month is derived from COVID-19 file names, so
        skipped files (``get_*``, ``.pdf``, ``.xlsx``) and data files
        (``.csv``) all produce the same empty record.

        Parameters
        ----------
        filename : str
            The name of the file. It is accepted for interface parity with
            the other datasets and is not inspected.

        Returns
        -------
        dict[str, Any]
            A dictionary with keys ``state``, ``year``, and ``month``, all
            set to ``None``.
        """
        return {"state": None, "year": None, "month": None}
# Every dataset class defined in this module, in the order they are exposed.
AVAILABLE_DATABASES: list[type[Dataset]] = [
    CNES,
    PNI,
    SIA,
    SIM,
    SINAN,
    SINASC,
    COVID19,
]