Good morning everyone.
I need to search for the names of the presidents of Brazil in HTML files. To make this easier, I created a JSON document with the presidents' names.
The code follows:
# !/bin/env python
# coding: utf-8
__author__ = '@britodfbr'
import json
import re
import os
from bs4 import BeautifulSoup
# JSON document (kept as a string and parsed with json.loads in truncus13).
# Top-level key "presidentes" maps an ordinal string ("1", "2", ...) to one
# record per president with the keys:
#   "nome"     - full name of the president (the text that is searched for)
#   "vice"     - name of the vice-president, or null when the record has none
#   "imandato" - presumably the first day of the term ("inicio do mandato")
#   "fmandato" - presumably the last day of the term ("fim do mandato")
value = '''{
"presidentes": {
"1" : {
"nome": "Manuel Deodoro da Fonseca",
"vice": "Floriano Vieira Peixoto",
"imandato": "15 de novembro de 1889",
"fmandato": "23 de novembro de 1891"
},
"2": {
"nome":"Floriano Vieira Peixoto",
"vice": null,
"imandato": "23 de novembro de 1891",
"fmandato": "15 de novembro de 1894"
},
"3": {
"nome":"Prudente José de Morais e Barros",
"vice": "Manuel Vitorino Pereira",
"imandato": "15 de novembro de 1894",
"fmandato": "15 de novembro de 1898"
},
"4": {
"nome":"Manuel Ferraz de Campos Sales",
"vice": "Francisco de Assis Rosa e Silva",
"imandato": "15 de novembro de 1898",
"fmandato": "15 de novembro de 1902"
},
"5": {
"nome":"Francisco de Paula Rodrigues Alves",
"vice": "Francisco Silviano de Almeida Brandão",
"imandato": "15 de novembro de 1902",
"fmandato": "15 de novembro de 1906"
}
}
}'''
def _name_pattern(nome):
    """Return regex source matching a full or partial occurrence of *nome*.

    Words longer than two characters are treated as the significant parts of
    the name; shorter words ("da", "de", "e") are connectives.  The pattern
    matches any two significant words in their original order, separated only
    by whitespace and, optionally, the name's own connectives.  For
    "Manuel Deodoro da Fonseca" this accepts "Manuel Deodoro",
    "Deodoro da Fonseca" and "Manuel Fonseca", but not a lone "Manuel".

    An empty string is returned for a blank name so the caller can skip it
    (an empty pattern would match every text node).
    """
    words = nome.split()
    significant = [re.escape(word) for word in words if len(word) > 2]
    connectives = [re.escape(word) for word in words if len(word) <= 2]

    if len(significant) < 2:
        # Not enough distinctive words for a partial match: require the
        # whole name, tolerant only of how the words are spaced.
        return r'\s+'.join(re.escape(word) for word in words)

    gap = r'\s+'
    if connectives:
        optional_connectives = '|'.join(connectives)
        gap = rf'(?:\s+(?:{optional_connectives}))*\s+'

    alternatives = []
    for index, first in enumerate(significant[:-1]):
        later = '|'.join(significant[index + 1:])
        alternatives.append(rf'{first}{gap}(?:{later})')
    return '|'.join(alternatives)


def truncus13(file='../data/teste02.html', limit=10):
    """Find the text nodes of an HTML file that mention a president.

    The names come from the "nome" field of every record under
    "presidentes" in the module-level JSON string ``value``.  A text node is
    reported when it contains at least two significant words of one name
    (see ``_name_pattern``), compared case-insensitively.

    Args:
        file: Path of the HTML document to scan.
        limit: Maximum number of text nodes to return (passed to
            ``find_all``).

    Returns:
        The ``find_all`` result (a ``bs4.element.ResultSet`` of strings)
        covering all presidents, or ``False`` when nothing matched.
    """
    # Read bytes so BeautifulSoup can honour the document's own charset
    # declaration, and close the handle deterministically.
    with open(file, 'rb') as handle:
        soup = BeautifulSoup(handle, 'html5lib')

    presidentes = json.loads(value)
    patterns = [
        _name_pattern(item['nome'])
        for item in presidentes['presidentes'].values()
    ]
    patterns = [pattern for pattern in patterns if pattern]
    if not patterns:
        return False

    # One combined, word-bounded pattern so a single pass over the document
    # reports every president instead of stopping at the first one found.
    regex = re.compile(r'\b(?:' + '|'.join(patterns) + r')\b', re.I)
    result = soup.find_all(string=regex, limit=limit)
    return result or False
if __name__ == '__main__':
    # Scan every .html file in the current working directory and show what
    # truncus13 found in each; the return value was previously discarded.
    # Sorting makes the report order deterministic.
    for name in sorted(os.listdir()):
        if not name.endswith('.html'):
            continue
        path = os.path.abspath(name)
        print(path, truncus13(path))
HTML examples
<!DOCTYPE html>
<html lang="pt-br">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
Deodoro da Fonseca<br />
Floriano Peixoto
</body>
</html>
<!DOCTYPE html>
<html lang="pt-br">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<p> Manuel Deodoro</p>
<p> Vieira Peixoto</p>
</body>
</html>
<!DOCTYPE html>
<html lang="pt-br">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<p> Deodoro da Fonseca</p>
<p> Vieira Peixoto</p>
</body>
</html>
Expected result: a `bs4.element.ResultSet` containing the names of the presidents that appear in the HTML document.