SIM Metadata Information
[1]:
from pysus.online_data.SIM import get_CID9_table, get_CID10_table, get_municipios, get_ocupations
[2]:
# Fetch the SIM CID9 lookup table and display it as the cell result.
# The rendered output below has a DESCRICAO and a CAUSAS column; the log lines
# show a connection to ftp.datasus.gov.br and a local parquet copy being found.
get_CID9_table()
2023-09-15 18:51:51.424 | DEBUG | pysus.online_data.SIM:get_CID9_table:139 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.448 | DEBUG | pysus.online_data.SIM:get_CID9_table:143 - Changing FTP work dir to: /dissemin/publicos/SIM/CID9/TABELAS
2023-09-15 18:51:51.450 | INFO | pysus.online_data.SIM:get_CID9_table:153 - Local parquet file found at /home/bida/pysus/SIM_CID9_.parquet
[2]:
DESCRICAO | CAUSAS | |
---|---|---|
0 | 001 COLERA | 001 |
1 | 002 FEBRES TIFOIDE E PARATIFOIDE | 002 |
2 | 003 OUTRAS INFECCOES POR SALMONELLA | 003 |
3 | 004 SHIGUELOSE | 004 |
4 | 005 OUT INTOXIC ALIMENTARES (BACTERIANAS) | 005 |
... | ... | ... |
881 | E995 LES OUT MEIOS OU N ESP OP GUERRA CONVENC | 995 |
882 | E996 LES ARMA NUCLEAR EM OPERACOES DE GUERRA | 996 |
883 | E997 LES OUT FORM GUERRA NAO CONVENCIONAL | 997 |
884 | E998 LES OP GUERRA OCORR APOS CESSACAO HOSTIL | 998 |
885 | E999 EFEIT TARDIOS DE LES OPERACAO DE GUERRA | 999 |
886 rows × 2 columns
[3]:
# Fetch the SIM CID10 lookup table and display it as the cell result.
# The rendered output below lists the columns CID10, OPC, CAT, SUBCAT, DESCR
# and RESTRSEXO, one row per CID10 code.
get_CID10_table()
2023-09-15 18:51:51.586 | DEBUG | pysus.online_data.SIM:get_CID10_table:93 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.608 | DEBUG | pysus.online_data.SIM:get_CID10_table:97 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.609 | INFO | pysus.online_data.SIM:get_CID10_table:107 - Local parquet file found at /home/bida/pysus/SIM_CID10_.parquet
[3]:
CID10 | OPC | CAT | SUBCAT | DESCR | RESTRSEXO | |
---|---|---|---|---|---|---|
0 | A00 | S | N | A00 Colera | 5 | |
1 | A000 | N | S | A00.0 Colera dev Vibrio cholerae 01 biot cholerae | 5 | |
2 | A001 | N | S | A00.1 Colera dev Vibrio cholerae 01 biot El Tor | 5 | |
3 | A009 | N | S | A00.9 Colera NE | 5 | |
4 | A01 | S | N | A01 Febres tifoide e paratifoide | 5 | |
... | ... | ... | ... | ... | ... | ... |
14252 | Z926 | N | S | Z92.6 História pessoal de quimioterapia para d... | 5 | |
14253 | U070 | N | S | U07.0 Doença por cigarro eletrônico | 5 | |
14254 | U071 | N | S | U07.1 Infecção pelo novo Coronavírus (COVID-19) | 5 | |
14255 | U072 | N | S | U07.2 COVID-19, vírus não identificado | 5 | |
14256 | U07 | S | N | U07 Uso emergencial do U07 | 5 |
14257 rows × 6 columns
[4]:
# Fetch the municipality registry table (cached locally as SIM_CADMUN_.parquet
# according to the log) and display it; the output is a wide frame (28 columns)
# with codes, names, status and coordinates per municipality.
get_municipios()
2023-09-15 18:51:51.744 | DEBUG | pysus.online_data.SIM:get_municipios:185 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.768 | DEBUG | pysus.online_data.SIM:get_municipios:189 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.769 | INFO | pysus.online_data.SIM:get_municipios:199 - Local parquet file found at /home/bida/pysus/SIM_CADMUN_.parquet
[4]:
MUNCOD | MUNCODDV | SITUACAO | MUNSINP | MUNSIAFI | MUNNOME | MUNNOMEX | OBSERV | MUNSINON | MUNSINONDV | ... | CSAUDCOD | RMETRCOD | AGLCOD | ANOINST | ANOEXT | SUCESSOR | LATITUDE | LONGITUDE | ALTITUDE | AREA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000000 | 0000000 | IGNOR | 00000 | Ignorado ou exterior | IGNORADO OU EXTERIOR | Ignorado | , ,000001-009999 | , ,0000001-0099999 | ... | 00000 | 0000 | 0000 | 0.000 | 0.000 | 0.0 | 0.000 | ||||
1 | 110000 | 1100000 | IGNOR | 26000 | Munic¡pio ignorado - RO | MUNICIPIO IGNORADO - RO | ,119999 | ,1199999 | ... | 11000 | 1100 | 1100 | 0.000 | 0.000 | 0.0 | 0.000 | |||||
2 | 110001 | 1100015 | ATIVO | 26016 | 0033 | Alta Floresta D'Oeste | ALTA FLORESTA D'OESTE | ... | 11900 | 1190 | 1190 | 1986 | -11.929 | -61.996 | 350.0 | 7066.702 | |||||
3 | 110002 | 1100023 | ATIVO | 26004 | 0007 | Ariquemes | ARIQUEMES | ... | 11900 | 1190 | 1190 | 1977 | -9.913 | -63.041 | 142.0 | 4426.558 | |||||
4 | 110003 | 1100031 | ATIVO | 26020 | 0037 | Cabixi | CABIXI | ... | 11900 | 1190 | 1190 | 1989 | -13.492 | -60.545 | 230.0 | 1314.355 | |||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5647 | 522208 | 5222088 | TRANS | Wanderlndia (transf. p/TO) | WANDERLANDIA (TRANSF. P/TO) | Transferido para Tocantins | ... | 52980 | 5298 | 5298 | 1988 | 1722081 | 0.000 | 0.000 | 0.0 | 0.000 | |||||
5648 | 522210 | 5222104 | TRANS | Xambio (transf. p/TO) | XAMBIOA (TRANSF. P/TO) | Transferido para Tocantins | ... | 52980 | 5298 | 5298 | 1988 | 1722107 | 0.000 | 0.000 | 0.0 | 0.000 | |||||
5649 | 522220 | 5222203 | ATIVO | 08313 | 0067 | Vila Boa | VILA BOA | ... | 52900 | 5301 | 5301 | 1993 | -15.038 | -47.059 | 0.0 | 1060.170 | |||||
5650 | 522230 | 5222302 | ATIVO | 08323 | 1068 | Vila Prop¡cio | VILA PROPICIO | ... | 52900 | 5290 | 5290 | 1997 | -15.457 | -48.889 | 744.0 | 2181.575 | |||||
5651 | 530010 | 5300108 | ATIVO | 23001 | 9701 | Bras¡lia | BRASILIA | ,530000-530009,530011-539999 | ,5300000-5300099,5300110-5399999 | ... | 53001 | 5301 | 5301 | 1960 | -15.780 | -47.930 | 1171.0 | 5801.937 |
5652 rows × 28 columns
[5]:
# Fetch the occupations lookup table and display it; the rendered output below
# maps each CODIGO to its DESCRICAO.
get_ocupations()
2023-09-15 18:51:51.942 | DEBUG | pysus.online_data.SIM:get_ocupations:231 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-15 18:51:51.965 | DEBUG | pysus.online_data.SIM:get_ocupations:235 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-15 18:51:51.966 | INFO | pysus.online_data.SIM:get_ocupations:244 - Local parquet file found at /home/bida/pysus/SIM_TABOCUP_.parquet
[5]:
CODIGO | DESCRICAO | |
---|---|---|
0 | 639 | ABALIZADOR (AREAS AGRICOLAS) |
1 | 759 | ABASTECEDOR DE ESPULAS |
2 | 893 | ABASTECEDOR DE FORNO (CERAMICA) |
3 | 999 | ABASTECEDOR DE LINHA DE PRODUCAO |
4 | 739 | ABASTECEDOR DE MAQUINAS (MAD. E PASTA PAPEL) |
... | ... | ... |
3559 | 540 | ZELADOR DE VESTIARIOS |
3560 | 551 | ZELADOR SEM ESPECIFICACAO |
3561 | 728 | ZINCADOR |
3562 | 051 | ZOOLOGO |
3563 | 065 | ZOOTECNISTA |
3564 rows × 2 columns
Preprocessing SIM data
[2]:
from pysus.ftp.databases.sim import SIM
from pysus.preprocessing.decoders import translate_variables_SIM
from pysus.preprocessing.SIM import group_and_count, redistribute_missing, redistribute_cid_chapter
# Create the SIM database handle used by the cells below for get_files()/download().
# NOTE(review): load() presumably populates the remote file listing — confirm
# against the pysus documentation.
sim = SIM().load()
[3]:
# Step 1: look up the SIM file(s) matching group "CID10", 'ac' and 2010.
# NOTE(review): 'ac' is presumably the state abbreviation (the download log
# names the file DOAC2010.parquet) — confirm against the pysus documentation.
sim_files = sim.get_files("CID10", 'ac', 2010)

# Step 2: download the selected file(s) and turn the result into a DataFrame.
downloaded = sim.download(sim_files)
df = downloaded.to_dataframe()

# Display the raw records as the cell result.
df
DOAC2010.parquet: 0%| | 0.00/7.37k [00:00<?, ?B/s]/home/bida/micromamba/envs/pysus/lib/python3.11/site-packages/tqdm/std.py:533: TqdmWarning: clamping frac to range [0, 1]
full_bar = Bar(frac,
DOAC2010.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████| 7.37k/7.37k [00:00<00:00, 34.4kB/s]
[3]:
contador | ORIGEM | TIPOBITO | DTOBITO | HORAOBITO | NATURAL | DTNASC | IDADE | SEXO | RACACOR | ... | DTCADASTRO | ATESTANTE | FONTEINV | DTRECEBIM | UFINFORM | CB_PRE | MORTEPARTO | DTCADINF | TPOBITOCOR | DTCADINV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2 | 06082010 | 2000 | 831 | 09041945 | 465 | 2 | ... | 19082010 | 20092010 | R98 | ||||||||
1 | 2 | 1 | 2 | 06082010 | 1300 | 812 | 20011912 | 498 | 1 | ... | 19082010 | 2 | 01102010 | R98 | |||||||
2 | 3 | 1 | 2 | 02102010 | 1700 | 812 | 17032010 | 306 | 2 | 4 | ... | 05012011 | 3 | 10022011 | R98 | ||||||
3 | 4 | 1 | 2 | 07042010 | 2300 | 07042010 | 005 | 2 | 1 | ... | 06052010 | 1 | 20072010 | O689 | |||||||
4 | 5 | 1 | 2 | 13052010 | 0030 | 812 | 04081971 | 438 | 1 | 1 | ... | 21062010 | 5 | 20072010 | X999 | ||||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3004 | 3005 | 1 | 2 | 28092010 | 0100 | 20041958 | 452 | 2 | 4 | ... | 20102010 | 11112010 | I619 | ||||||||
3005 | 3006 | 1 | 2 | 14102010 | 1015 | 24031928 | 482 | 1 | 1 | ... | 06122010 | 2 | 19012011 | I219 | |||||||
3006 | 3007 | 1 | 2 | 21112010 | 0650 | 11121955 | 454 | 2 | 1 | ... | 09122010 | 5 | 17012011 | C349 | |||||||
3007 | 3008 | 1 | 2 | 10112010 | 0500 | 07071955 | 455 | 1 | 1 | ... | 04012011 | 2 | 19012011 | C787 | |||||||
3008 | 3009 | 1 | 2 | 10102010 | 1240 | 07051976 | 434 | 2 | 4 | ... | 28122010 | 2 | 26112010 | I739 |
3009 rows × 59 columns
[4]:
# Cast the SEXO column to the generic object dtype, in place on df.
# NOTE(review): presumably required so translate_variables_SIM (next cell) can
# replace the numeric codes with text labels — confirm; re-running is harmless.
df['SEXO'] = df['SEXO'].astype('object')
[5]:
# Columns kept for the aggregation steps that follow.
variables = ['CODMUNRES', 'SEXO', 'IDADE_ANOS', 'CID10_CHAPTER']

# Decode the raw SIM records. The two flags presumably produce the IDADE_ANOS
# age classes and the CID10_CHAPTER classification selected below — confirm
# against the translate_variables_SIM documentation.
translated = translate_variables_SIM(df, age_classes=True, classify_cid10_chapters=True)

# Keep only the columns of interest; later cells read this frame as `df`.
df = translated[variables]
2023-09-19 13:55:09.529 | DEBUG | pysus.online_data.SIM:get_municipios:185 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-19 13:55:09.552 | DEBUG | pysus.online_data.SIM:get_municipios:189 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-19 13:55:09.553 | INFO | pysus.online_data.SIM:get_municipios:199 - Local parquet file found at /home/bida/pysus/SIM_CADMUN_.parquet
/home/bida/Projetos/InfoDengue/PySUS/pysus/preprocessing/decoders.py:122: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
return df["MUNCODDV"].append(df["MUNCOD"]).astype("int64").values
2023-09-19 13:55:09.745 | DEBUG | pysus.online_data.SIM:get_CID10_chapters_table:47 - Stablishing connection with ftp.datasus.gov.br.
220 Microsoft FTP Service
2023-09-19 13:55:09.767 | DEBUG | pysus.online_data.SIM:get_CID10_chapters_table:51 - Changing FTP work dir to: /dissemin/publicos/SIM/CID10/TABELAS
2023-09-19 13:55:09.768 | INFO | pysus.online_data.SIM:get_CID10_chapters_table:61 - Local parquet file found at /home/bida/pysus/SIM_CIDCAP10_.parquet
[6]:
# Aggregate the records over the selected variables; the result carries a
# COUNTS column (used on the next line).
counts = group_and_count(df,variables)
# Keep an untouched copy of the counts so the redistributed values produced by
# the following cells can be compared with the originals.
counts["COUNTS_ORIGINAL"] = counts["COUNTS"]
# Placeholder label handed to redistribute_missing in the next cell.
# NOTE(review): the rendered output further down shows 'NA' labels in CODMUNRES
# and SEXO — confirm that 'missing' really matches the placeholder emitted by
# translate_variables_SIM, otherwise nothing may be redistributed.
nan_string = 'missing'
[7]:
# Variables passed to redistribute_missing, together with the placeholder
# label defined in the previous cell.
redistribution_columns = ['CODMUNRES', 'SEXO', 'IDADE_ANOS']
counts = redistribute_missing(counts, redistribution_columns, nan_string=nan_string)

# Show only the groups whose count is greater than one.
more_than_one = counts["COUNTS"] > 1
counts.loc[more_than_one]
[7]:
CODMUNRES | SEXO | IDADE_ANOS | CID10_CHAPTER | COUNTS | COUNTS_ORIGINAL | |
---|---|---|---|---|---|---|
13 | 1200013.0 | Feminino | [0.0, 1.0) | 16 | 2.0 | 2.0 |
9397 | 1200104.0 | Feminino | [0.0, 1.0) | 16 | 2.0 | 2.0 |
10615 | 1200104.0 | Feminino | [72.0, 73.0) | 10 | 2.0 | 2.0 |
10961 | 1200104.0 | Masculino | [0.0, 1.0) | 16 | 3.0 | 3.0 |
11423 | 1200104.0 | Masculino | [27.0, 28.0) | 20 | 2.0 | 2.0 |
... | ... | ... | ... | ... | ... | ... |
96787 | 1200708.0 | Masculino | [81.0, 82.0) | 9 | 2.0 | 2.0 |
98539 | 1200807.0 | Feminino | [0.0, 1.0) | 10 | 2.0 | 2.0 |
98545 | 1200807.0 | Feminino | [0.0, 1.0) | 16 | 4.0 | 4.0 |
101420 | 1200807.0 | Masculino | [77.0, 78.0) | 18 | 2.0 | 2.0 |
101539 | 1200807.0 | Masculino | [84.0, 85.0) | 18 | 2.0 | 2.0 |
466 rows × 6 columns
[8]:
# Variables passed to redistribute_cid_chapter alongside the counts frame.
chapter_group_columns = ['CODMUNRES', 'SEXO', 'IDADE_ANOS']
counts = redistribute_cid_chapter(counts, chapter_group_columns)

# Display the rows belonging to CID10 chapter 1.
is_chapter_one = counts['CID10_CHAPTER'] == 1
counts.loc[is_chapter_one]
[8]:
CODMUNRES | SEXO | IDADE_ANOS | CID10_CHAPTER | COUNTS | COUNTS_ORIGINAL | |
---|---|---|---|---|---|---|
0 | 1200013.0 | Feminino | [0.0, 1.0) | 1 | 1.0 | 1.0 |
17 | 1200013.0 | Feminino | [1.0, 2.0) | 1 | 0.0 | 0.0 |
34 | 1200013.0 | Feminino | [2.0, 3.0) | 1 | 0.0 | 0.0 |
51 | 1200013.0 | Feminino | [3.0, 4.0) | 1 | 0.0 | 0.0 |
68 | 1200013.0 | Feminino | [4.0, 5.0) | 1 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... |
107831 | NA | NA | [87.0, 88.0) | 1 | 0.0 | 0.0 |
107848 | NA | NA | [88.0, 89.0) | 1 | 0.0 | 0.0 |
107865 | NA | NA | [89.0, 90.0) | 1 | 0.0 | 0.0 |
107882 | NA | NA | [90.0, inf) | 1 | 0.0 | 0.0 |
107899 | NA | NA | NA | 1 | 0.0 | 0.0 |
6348 rows × 6 columns