I need to get some data from a huge access database (two, actually, depending on which input I have). I have a function that calls that bank by chunks (if I try a fetchall gives memory problem). This function is sometimes called by my main function until I can get all the data I want. I'll post the 2 codes here ... I need to be able to speed up the execution time ... depending on the input it runs in 8 minutes! It's too much.
def chama_chunk():
global data
i=0
names = [x[0] for x in cursor.description] #pega os nomes das colunas
while True:
rows = cursor.fetchmany(200) #pega 200 linhas
if len(rows) == 0: #parar se n tiver nd
break
else:
if (i == 0): #se for a 1a linha, cria o dataframe com 200 linhas e colunas conforme em names
data = pd.DataFrame(np.array(rows),columns=names)
else:
data_append = [{names[i]:x[i] for i in range(len(names))} for x in rows]
data_append = pd.DataFrame ( data_append, columns = names)
data = pd.concat(objs = [data,data_append], axis = 0)
i = i+1
if i == 200:
break
The main function is this (the columns being deleted are not required)
def faztudo(lista,ponto1,ponto2):
global ativo
global reativo
lista1=lista
chama_chunk()
del data['CampoB']
del data['CampoC']
del data['Dia']
del data ['Unidade']
del data['Ano']
del data['Consumo']
del data['Ponto_maxima']
del data['Demanda_maxima']
del data['Fator_carga']
gc.enable()
ativo=pd.DataFrame(columns=data.columns)
reativo=pd.DataFrame(columns=data.columns)
while (data.Mes == mes).any():
if periodo == "DOM":
#print("entrou dom")
data.query('Mes==@mes',inplace=True)
data.query('Codigo in @lista1',inplace=True)
data.query('Dia_da_semana=="DM"',inplace=True)
ativo=ativo.append(data.query('Grandeza==1'))
reativo=reativo.append(data.query('Grandeza==2'))
elif periodo == "SAB":
#print("entrou sab")
data.query('Mes==@mes',inplace=True)
data.query('Codigo in @lista1',inplace=True)
data.query('Dia_da_semana=="SB"',inplace=True)
ativo=ativo.append(data.query('Grandeza==1'))
reativo=reativo.append(data.query('Grandeza==2'))
else:
#print("entrou resto")
data.query('Mes==@mes',inplace=True)
data.query('Codigo in @lista1',inplace=True)
data.query('Dia_da_semana==@diasuteis',inplace=True)
ativo=ativo.append(data.query('Grandeza==1'))
reativo=reativo.append(data.query('Grandeza==2'))
chama_chunk()
del data['CampoB']
del data['CampoC']
del data['Dia']
del data ['Unidade']
del data['Ano']
del data['Consumo']
del data['Ponto_maxima']
del data['Demanda_maxima']
del data['Fator_carga']
and my query is done by the function start:
def inicia (dbpath,mes):
#dbpath="W:\CuCa-2018-ZFA.mdb"
driver='{Microsoft Access Driver (*.mdb)}'
con = pyodbc.connect("DRIVER={}; DBQ={}".format(driver,dbpath))
query="SELECT * from Curvas_de_carga"
global cursor
cursor = con.cursor()
cursor.execute(query)
Some other pieces of code have not been mentioned, but I think the performance problem is pretty much in those 3 functions.
I'm new to database access: / And the option to change databases does not exist here, because the database is not mine, I'm accessing a file from someone else.