I'm new to Python and I'm having a hard time with my algorithm. Its function is to check words in a set of files in PDF format and to analyze the recurrence of each word, generating with this information a graph of the law of zipf second most recurrent word repeats the square root of the first most recurrent word in quantity) in short, I am using the matplotlib library to plot bar graphs, however the number of words appearing on the graph is very large (on the x-axis) and are overwriting.
obs: all criticism is welcome, thank you at once. follows the algorithm and generated graph.
#!/usr/bin/env python3.6
import os
import re
from operator import itemgetter
import matplotlib.pyplot as plt
import numpy as np
import math
from tkinter import *
def io_pasta():
def on_press():
if not (os.path.exists(entrada.get())):
lb["fg"]="red"
lb["text"] = "Pasta inexistente/inacessivél"
lb["font"]= "Bold"
else:
zipf(entrada.get(),janela)
janela = Tk()
lb=Label(janela, text = "Onde estão os asquivos?", font = "arial")
lb.pack()
entrada = Entry(janela, width = 40)
entrada.place(x=40,y=40)
b = Button(janela,text="OK",width = 10, command=on_press)
b.place(x=150,y=75)
janela.geometry("400x120")
janela.title("Distribuição ZIPF")
janela.mainloop()
def zipf(pasta,win):
win.destroy()
if not pasta[-1]=="/":
pasta+="/"
palavra=[]
repetic=[]
for nome in os.listdir(pasta):
os.system("pdftotext -enc UTF-8 "+pasta+""+str(nome)+" "+pasta+""+str(nome)+".txt")
print("arquivos convertidos ......................ok!")
os.system("mkdir "+pasta+"arquivos_originais && mv "+pasta+"*pdf "+pasta+"arquivos_originais")
os.system("mkdir "+pasta+"convertidos_txt && mv "+pasta+"*txt "+pasta+"convertidos_txt/")
os.system("mkdir "+pasta+"zipf")
print("pasta ARQUIVOS_MOVIDOS criada .................ok!")
print("Arquivos Movidos.............................ok!")
frequency = {}
for arq in os.listdir(""+pasta+"convertidos_txt/"):
open_file = open(""+pasta+"convertidos_txt/"+str(arq)+"", "r", encoding='latin-1')
file_to_string = open_file.read()
w1 = re.findall(r'(\b[A-Za-z][a-z]{4,20}\b)', file_to_string)
control = True
for word in w1:
count = frequency.get(word,0)
frequency[word] = count + 1
for key, value in reversed(sorted(frequency.items(), key = itemgetter(1))):
if control == True:
j=value
control=False
else:
if abs(math.sqrt(j)-value)<4:
palavra.append(key)
repetic.append(value)
plt.title("Distribuição zipf")
plt.grid(True)
plt.xticks(repetic,palavra,rotation=90,size='small')
pos = np.arange(len(palavra)) + .5
plt.bar(pos,repetic,align='center',color='#b8ff5c')
plt.savefig(''+pasta+'zipf/grafico_'+str(arq)+'.png')
io_pasta()