I have a project that "retrieves" certain information from an HTML page, parses with the help of Beautiful Soup, and returns values in the form of dictionary , to that in another method I generate a JSON object.
The problem is that because of the peculiarity of the page, very poorly written and with excessive tags, besides problems with the organization of the information itself, I need to handle everything with a lot of use of loops and conditionals. The current method has 91 rows.
I can not logically separate these blocks of code into other methods, everything seems to me "part of the same operation". It is even more difficult because it does not seem to be useful in another situation either.
Does anyone have any suggestions on when and how can I split my code?
As an example, a method I used to "play", which shares the same problem (to make it less strange, I explain: it takes information from a UK menu page of my university):
def parse_cardapios(self):
"""Interpreta as tabelas de cardápio no site do restaurante"""
pag = urllib2.urlopen(self.url + '/' + self.campus).read();
soup = BeautifulSoup(pag)
resultado = []
# Percorre as refeições e suas respectivas tabelas de cardápio
nomes_ref = soup.find('section', id='post-content').find_all('h2')
tabelas_card = soup.find('section', id='post-content').find_all('table')
for ref, tab in zip(nomes_ref, tabelas_card):
refeicao = tratar_chave(ref)
# Percorre todos os dias disponíveis
nome_colunas = tab.find_all('th')
linhas = tab.find_all('tr', class_=True)
for lin in linhas: # Cada linha é um dia diferente
dia_repetido = False # Para controlar a repetição
obj_refeicoes = {refeicao: {}}
obj_temp = {'data': '', 'refeicoes': {}}
# Percorre cada dado
celulas = lin.find_all('td')
for meta, dado in zip(nome_colunas, celulas):
meta = tratar_chave(meta)
dado = tratar_valor(dado)
if meta == 'data':
dado = dado.translate(None, 'aábcçdefghijklmnopqrstuvzwxyz- ,')
if not resultado:
obj_temp['data'] = dado
else:
for r in resultado:
if r['data'] == dado:
dia_repetido = True
r['refeicoes'].update(obj_refeicoes)
break
else:
obj_temp['data'] = dado
else:
obj_refeicoes[refeicao].update({meta: dado})
obj_temp['refeicoes'].update(obj_refeicoes)
if not dia_repetido:
resultado.append(obj_temp)
# Junta as refeições vegetarianas no mesmo cardápio que as outras
for r in resultado:
for s in r['refeicoes'].keys():
if '-vegetariano' in s:
veg = {}
for t in r['refeicoes'][s].keys():
if not '-vegetariano' in t:
veg.update({t + '-vegetariano': r['refeicoes'][s][t]})
else:
veg.update({t: r['refeicoes'][s][t]})
sem_sufixo = s.replace('-vegetariano', '')
r['refeicoes'][sem_sufixo].update(veg)
for u in r['refeicoes'].keys():
if '-vegetariano' in u:
del r['refeicoes'][u]
return dict({'campus': self.campus, 'dia-cardapio': resultado})