Error generating matplotlib graph

2

I'm new to Python and I'm having a hard time with my algorithm. Its function is to check words in a set of files in PDF format and to analyze the recurrence of each word, generating with this information a graph of the law of zipf second most recurrent word repeats the square root of the first most recurrent word in quantity) in short, I am using the matplotlib library to plot bar graphs, however the number of words appearing on the graph is very large (on the x-axis) and are overwriting.

obs: all criticism is welcome, thank you at once. follows the algorithm and generated graph.

    #!/usr/bin/env python3.6
import os
import re
from operator import itemgetter
import matplotlib.pyplot as plt
import numpy as np
import math

from tkinter import *
def io_pasta():
        def on_press():
            if not (os.path.exists(entrada.get())):
                lb["fg"]="red"
                lb["text"] = "Pasta inexistente/inacessivél"
                lb["font"]= "Bold"
            else:
                zipf(entrada.get(),janela)

        janela = Tk()
        lb=Label(janela, text = "Onde estão os asquivos?", font = "arial")
        lb.pack()
        entrada = Entry(janela, width = 40)
        entrada.place(x=40,y=40)
        b = Button(janela,text="OK",width = 10, command=on_press)
        b.place(x=150,y=75)

        janela.geometry("400x120")
        janela.title("Distribuição ZIPF")

        janela.mainloop()


def zipf(pasta,win):
    win.destroy()
    if not pasta[-1]=="/":
          pasta+="/"
    palavra=[]
    repetic=[]
    for nome in os.listdir(pasta):
        os.system("pdftotext -enc UTF-8   "+pasta+""+str(nome)+"  "+pasta+""+str(nome)+".txt")
    print("arquivos convertidos ......................ok!")
    os.system("mkdir "+pasta+"arquivos_originais && mv "+pasta+"*pdf "+pasta+"arquivos_originais")
    os.system("mkdir "+pasta+"convertidos_txt && mv "+pasta+"*txt "+pasta+"convertidos_txt/")
    os.system("mkdir "+pasta+"zipf")
    print("pasta ARQUIVOS_MOVIDOS criada .................ok!")
    print("Arquivos Movidos.............................ok!")
    frequency = {}
    for arq in os.listdir(""+pasta+"convertidos_txt/"):
        open_file = open(""+pasta+"convertidos_txt/"+str(arq)+"", "r", encoding='latin-1')
        file_to_string = open_file.read()
        w1 = re.findall(r'(\b[A-Za-z][a-z]{4,20}\b)', file_to_string)
        control = True
        for word in w1:
            count = frequency.get(word,0)
            frequency[word] = count + 1

        for key, value in reversed(sorted(frequency.items(), key = itemgetter(1))):
            if control == True:
                    j=value
                    control=False
            else:
                if abs(math.sqrt(j)-value)<4:
                        palavra.append(key)
                        repetic.append(value)

        plt.title("Distribuição zipf")
        plt.grid(True)
        plt.xticks(repetic,palavra,rotation=90,size='small')
        pos = np.arange(len(palavra)) + .5 
        plt.bar(pos,repetic,align='center',color='#b8ff5c')
        plt.savefig(''+pasta+'zipf/grafico_'+str(arq)+'.png')      

io_pasta()
    
asked by anonymous 11.09.2017 / 21:40

1 answer

0

Tip 1: Use the tightlayout

Add function tightlayout before save your image:

plt.bar(pos,repetic,align='center',color='#b8ff5c')
plt.tightlayout()
plt.savefig(''+pasta+'zipf/grafico_'+str(arq)+'.png')      

Tip 2: Instantiate a figure and adjust its size

  • Process your text and end up with an X number of words.
  • Create a figure with the function figure (or analog function) and specify the size of it according to the size of its X number of words:

    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(0.5 * x, 10))
    
  • (set the 0.5 factor to a higher / lower number).

        
    05.12.2017 / 12:39