SIM Metadata Information

[1]:
from pysus.online_data.SIM import get_CID9_table, get_CID10_table, get_municipios, get_ocupations
[2]:
# Fetch the SIM CID9 code table and display it as the cell result
# (a two-column frame: DESCRICAO and CAUSAS).
get_CID9_table()
2023-09-15 18:51:51.424 | DEBUG    | pysus.online_data.SIM:get_CID9_table:139 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.448 | DEBUG    | pysus.online_data.SIM:get_CID9_table:143 - Changing FTP work dir to: /dissemin/publicos/SIM/CID9/TABELAS
2023-09-15 18:51:51.450 | INFO     | pysus.online_data.SIM:get_CID9_table:153 - Local parquet file found at /home/bida/pysus/SIM_CID9_.parquet
[2]:
DESCRICAO CAUSAS
0 001 COLERA 001
1 002 FEBRES TIFOIDE E PARATIFOIDE 002
2 003 OUTRAS INFECCOES POR SALMONELLA 003
3 004 SHIGUELOSE 004
4 005 OUT INTOXIC ALIMENTARES (BACTERIANAS) 005
... ... ...
881 E995 LES OUT MEIOS OU N ESP OP GUERRA CONVENC 995
882 E996 LES ARMA NUCLEAR EM OPERACOES DE GUERRA 996
883 E997 LES OUT FORM GUERRA NAO CONVENCIONAL 997
884 E998 LES OP GUERRA OCORR APOS CESSACAO HOSTIL 998
885 E999 EFEIT TARDIOS DE LES OPERACAO DE GUERRA 999

886 rows × 2 columns

[3]:
# Fetch the SIM CID10 code table and display it as the cell result
# (columns CID10, OPC, CAT, SUBCAT, DESCR and RESTRSEXO).
get_CID10_table()
2023-09-15 18:51:51.586 | DEBUG    | pysus.online_data.SIM:get_CID10_table:93 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.608 | DEBUG    | pysus.online_data.SIM:get_CID10_table:97 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.609 | INFO     | pysus.online_data.SIM:get_CID10_table:107 - Local parquet file found at /home/bida/pysus/SIM_CID10_.parquet
[3]:
CID10 OPC CAT SUBCAT DESCR RESTRSEXO
0 A00 S N A00 Colera 5
1 A000 N S A00.0 Colera dev Vibrio cholerae 01 biot cholerae 5
2 A001 N S A00.1 Colera dev Vibrio cholerae 01 biot El Tor 5
3 A009 N S A00.9 Colera NE 5
4 A01 S N A01 Febres tifoide e paratifoide 5
... ... ... ... ... ... ...
14252 Z926 N S Z92.6 História pessoal de quimioterapia para d... 5
14253 U070 N S U07.0 Doença por cigarro eletrônico 5
14254 U071 N S U07.1 Infecção pelo novo Coronavírus (COVID-19) 5
14255 U072 N S U07.2 COVID-19, vírus não identificado 5
14256 U07 S N U07 Uso emergencial do U07 5

14257 rows × 6 columns

[4]:
# Fetch the municipality register used by SIM and display it as the cell result
# (a wide frame whose columns include MUNCOD, MUNNOME, LATITUDE and LONGITUDE).
get_municipios()
2023-09-15 18:51:51.744 | DEBUG    | pysus.online_data.SIM:get_municipios:185 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.768 | DEBUG    | pysus.online_data.SIM:get_municipios:189 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.769 | INFO     | pysus.online_data.SIM:get_municipios:199 - Local parquet file found at /home/bida/pysus/SIM_CADMUN_.parquet
[4]:
MUNCOD MUNCODDV SITUACAO MUNSINP MUNSIAFI MUNNOME MUNNOMEX OBSERV MUNSINON MUNSINONDV ... CSAUDCOD RMETRCOD AGLCOD ANOINST ANOEXT SUCESSOR LATITUDE LONGITUDE ALTITUDE AREA
0 000000 0000000 IGNOR 00000 Ignorado ou exterior IGNORADO OU EXTERIOR Ignorado , ,000001-009999 , ,0000001-0099999 ... 00000 0000 0000 0.000 0.000 0.0 0.000
1 110000 1100000 IGNOR 26000 Munic¡pio ignorado - RO MUNICIPIO IGNORADO - RO ,119999 ,1199999 ... 11000 1100 1100 0.000 0.000 0.0 0.000
2 110001 1100015 ATIVO 26016 0033 Alta Floresta D'Oeste ALTA FLORESTA D'OESTE ... 11900 1190 1190 1986 -11.929 -61.996 350.0 7066.702
3 110002 1100023 ATIVO 26004 0007 Ariquemes ARIQUEMES ... 11900 1190 1190 1977 -9.913 -63.041 142.0 4426.558
4 110003 1100031 ATIVO 26020 0037 Cabixi CABIXI ... 11900 1190 1190 1989 -13.492 -60.545 230.0 1314.355
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5647 522208 5222088 TRANS Wanderlƒndia (transf. p/TO) WANDERLANDIA (TRANSF. P/TO) Transferido para Tocantins ... 52980 5298 5298 1988 1722081 0.000 0.000 0.0 0.000
5648 522210 5222104 TRANS Xambio  (transf. p/TO) XAMBIOA (TRANSF. P/TO) Transferido para Tocantins ... 52980 5298 5298 1988 1722107 0.000 0.000 0.0 0.000
5649 522220 5222203 ATIVO 08313 0067 Vila Boa VILA BOA ... 52900 5301 5301 1993 -15.038 -47.059 0.0 1060.170
5650 522230 5222302 ATIVO 08323 1068 Vila Prop¡cio VILA PROPICIO ... 52900 5290 5290 1997 -15.457 -48.889 744.0 2181.575
5651 530010 5300108 ATIVO 23001 9701 Bras¡lia BRASILIA ,530000-530009,530011-539999 ,5300000-5300099,5300110-5399999 ... 53001 5301 5301 1960 -15.780 -47.930 1171.0 5801.937

5652 rows × 28 columns

[5]:
# Fetch the occupation code table (CODIGO, DESCRICAO) and display it.
# The spelling `get_ocupations` is the name imported from pysus.online_data.SIM
# in the first cell, so it must not be "corrected" here.
get_ocupations()
2023-09-15 18:51:51.942 | DEBUG    | pysus.online_data.SIM:get_ocupations:231 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.965 | DEBUG    | pysus.online_data.SIM:get_ocupations:235 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.966 | INFO     | pysus.online_data.SIM:get_ocupations:244 - Local parquet file found at /home/bida/pysus/SIM_TABOCUP_.parquet
[5]:
CODIGO DESCRICAO
0 639 ABALIZADOR (AREAS AGRICOLAS)
1 759 ABASTECEDOR DE ESPULAS
2 893 ABASTECEDOR DE FORNO (CERAMICA)
3 999 ABASTECEDOR DE LINHA DE PRODUCAO
4 739 ABASTECEDOR DE MAQUINAS (MAD. E PASTA PAPEL)
... ... ...
3559 540 ZELADOR DE VESTIARIOS
3560 551 ZELADOR SEM ESPECIFICACAO
3561 728 ZINCADOR
3562 051 ZOOLOGO
3563 065 ZOOTECNISTA

3564 rows × 2 columns

Preprocessing SIM data

[2]:
from pysus.ftp.databases.sim import SIM
from pysus.preprocessing.decoders import translate_variables_SIM
from pysus.preprocessing.SIM import group_and_count, redistribute_missing, redistribute_cid_chapter

# Create the SIM database handle that the download cell uses.
# NOTE(review): what `load()` retrieves is not visible in this notebook —
# presumably the remote file listing; confirm against the PySUS docs.
sim = SIM().load()
[3]:
# Select the CID10 file(s) for 'ac' / 2010, download them and convert the
# result into a DataFrame, which is displayed as the cell result.
sim_files = sim.get_files("CID10",'ac',2010)
df = sim.download(sim_files).to_dataframe()
df
DOAC2010.parquet:   0%|                                                                                          | 0.00/7.37k [00:00<?, ?B/s]/home/bida/micromamba/envs/pysus/lib/python3.11/site-packages/tqdm/std.py:533: TqdmWarning: clamping frac to range [0, 1]
  full_bar = Bar(frac,
DOAC2010.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████| 7.37k/7.37k [00:00<00:00, 34.4kB/s]
[3]:
contador ORIGEM TIPOBITO DTOBITO HORAOBITO NATURAL DTNASC IDADE SEXO RACACOR ... DTCADASTRO ATESTANTE FONTEINV DTRECEBIM UFINFORM CB_PRE MORTEPARTO DTCADINF TPOBITOCOR DTCADINV
0 1 1 2 06082010 2000 831 09041945 465 2 ... 19082010 20092010 R98
1 2 1 2 06082010 1300 812 20011912 498 1 ... 19082010 2 01102010 R98
2 3 1 2 02102010 1700 812 17032010 306 2 4 ... 05012011 3 10022011 R98
3 4 1 2 07042010 2300 07042010 005 2 1 ... 06052010 1 20072010 O689
4 5 1 2 13052010 0030 812 04081971 438 1 1 ... 21062010 5 20072010 X999
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3004 3005 1 2 28092010 0100 20041958 452 2 4 ... 20102010 11112010 I619
3005 3006 1 2 14102010 1015 24031928 482 1 1 ... 06122010 2 19012011 I219
3006 3007 1 2 21112010 0650 11121955 454 2 1 ... 09122010 5 17012011 C349
3007 3008 1 2 10112010 0500 07071955 455 1 1 ... 04012011 2 19012011 C787
3008 3009 1 2 10102010 1240 07051976 434 2 4 ... 28122010 2 26112010 I739

3009 rows × 59 columns

[4]:
# Cast SEXO to the generic object dtype before the decoding step.
# NOTE(review): presumably required so translate_variables_SIM can store text
# labels in this column (later outputs show Feminino/Masculino) — confirm it
# is still needed with the current PySUS version.
df['SEXO'] = df['SEXO'].astype('object')
[5]:
# Columns kept for the grouping and redistribution cells.
variables = ['CODMUNRES','SEXO','IDADE_ANOS','CID10_CHAPTER']

# Decode the raw SIM codes with age classes and CID10 chapter classification
# enabled, then keep only the columns listed in `variables`.
# Note: `df` is rebound here, so this cell cannot be re-run on its own result.
df = translate_variables_SIM(df, age_classes=True, classify_cid10_chapters=True)[variables]
2023-09-19 13:55:09.529 | DEBUG    | pysus.online_data.SIM:get_municipios:185 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-19 13:55:09.552 | DEBUG    | pysus.online_data.SIM:get_municipios:189 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-19 13:55:09.553 | INFO     | pysus.online_data.SIM:get_municipios:199 - Local parquet file found at /home/bida/pysus/SIM_CADMUN_.parquet
/home/bida/Projetos/InfoDengue/PySUS/pysus/preprocessing/decoders.py:122: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  return df["MUNCODDV"].append(df["MUNCOD"]).astype("int64").values
2023-09-19 13:55:09.745 | DEBUG    | pysus.online_data.SIM:get_CID10_chapters_table:47 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-19 13:55:09.767 | DEBUG    | pysus.online_data.SIM:get_CID10_chapters_table:51 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-19 13:55:09.768 | INFO     | pysus.online_data.SIM:get_CID10_chapters_table:61 - Local parquet file found at /home/bida/pysus/SIM_CIDCAP10_.parquet
[6]:
# Placeholder label for missing values; passed to redistribute_missing as
# its `nan_string` argument.
nan_string = 'missing'

# Aggregate `df` over `variables`; the result carries a COUNTS column.
counts = group_and_count(df, variables)
# Keep the counts as first computed next to COUNTS, so the effect of the
# redistribution cells can be compared against them.
counts["COUNTS_ORIGINAL"] = counts["COUNTS"]
[7]:
# Redistribute the counts recorded under the missing-value label across the
# three listed columns; `counts` is rebound to the result.
counts = redistribute_missing(
    counts,
    ['CODMUNRES','SEXO','IDADE_ANOS'],
    nan_string=nan_string,
)
# Display only the rows whose count is greater than one.
counts.loc[counts["COUNTS"] > 1]
[7]:
CODMUNRES SEXO IDADE_ANOS CID10_CHAPTER COUNTS COUNTS_ORIGINAL
13 1200013.0 Feminino [0.0, 1.0) 16 2.0 2.0
9397 1200104.0 Feminino [0.0, 1.0) 16 2.0 2.0
10615 1200104.0 Feminino [72.0, 73.0) 10 2.0 2.0
10961 1200104.0 Masculino [0.0, 1.0) 16 3.0 3.0
11423 1200104.0 Masculino [27.0, 28.0) 20 2.0 2.0
... ... ... ... ... ... ...
96787 1200708.0 Masculino [81.0, 82.0) 9 2.0 2.0
98539 1200807.0 Feminino [0.0, 1.0) 10 2.0 2.0
98545 1200807.0 Feminino [0.0, 1.0) 16 4.0 4.0
101420 1200807.0 Masculino [77.0, 78.0) 18 2.0 2.0
101539 1200807.0 Masculino [84.0, 85.0) 18 2.0 2.0

466 rows × 6 columns

[8]:
# Apply the CID10-chapter redistribution over the same three columns;
# `counts` is rebound to the result.
counts = redistribute_cid_chapter(
    counts,
    ['CODMUNRES','SEXO','IDADE_ANOS'],
)
# Display the rows that belong to CID10 chapter 1.
counts.loc[counts['CID10_CHAPTER'] == 1]
[8]:
CODMUNRES SEXO IDADE_ANOS CID10_CHAPTER COUNTS COUNTS_ORIGINAL
0 1200013.0 Feminino [0.0, 1.0) 1 1.0 1.0
17 1200013.0 Feminino [1.0, 2.0) 1 0.0 0.0
34 1200013.0 Feminino [2.0, 3.0) 1 0.0 0.0
51 1200013.0 Feminino [3.0, 4.0) 1 0.0 0.0
68 1200013.0 Feminino [4.0, 5.0) 1 0.0 0.0
... ... ... ... ... ... ...
107831 NA NA [87.0, 88.0) 1 0.0 0.0
107848 NA NA [88.0, 89.0) 1 0.0 0.0
107865 NA NA [89.0, 90.0) 1 0.0 0.0
107882 NA NA [90.0, inf) 1 0.0 0.0
107899 NA NA NA 1 0.0 0.0

6348 rows × 6 columns