How do I enter new data for prediction (text)?

0

I'm new to Python and new to Machine Learning. I built an algorithm for it to be trained to classify texts. So you can predict in which category a particular text fits.

Training and score are working. Now, my question is with regard to new data. How do I enter this data to be predicted in which category it fits?

I tried, as in the code below, but I could not, I still did not understand how it can be done. Follow the code and I appreciate the help.

# -*- coding: utf8 -*-
import pandas as pd
import psycopg2
import numpy as np
import re
import string
import sklearn
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
import pickle
from sklearn.externals import joblib

conn = psycopg2.connect(host="10.1....", port="5432", database="aplicationNames", user="aplicationNames", password="aplicationNames")

# dataframe com o texto dos sintomas e a categoria do cat
df = pd.read_sql(
    "SELECT dsobservacaoclinica1, cdcatcategoria AS cat FROM iaconsultas limit 500",
    conn)
# criação das colunas do dataframe
df.columns = ['texto', 'cat']
# print(df) #imprimindo o dataframe
# Pegando os textos puros da coluna texto, para normalização.
# textos puros terão de ter um índice
df['texto'] = df['texto'].astype(str).str.replace('-', '')
df['texto'] = df['texto'].astype(str).str.replace('/', '')
df['texto'] = df['texto'].astype(str).str.replace('+', '')
df['texto'] = df['texto'].astype(str).str.replace('ões', '')
df['texto'] = df['texto'].astype(str).str.replace(';', '')
df['texto'] = df['texto'].astype(str).str.replace('#', '')
df['texto'] = df['texto'].astype(str).str.replace('~', '')
df['texto'] = df['texto'].astype(str).str.replace(':', '')
df['texto'] = df['texto'].astype(str).str.lower().str.split()
stop = stopwords.words("portuguese")
textosPuros = df['texto'].apply(lambda x: [w for w in x if not w in stop])


# textosPuros = df['texto'].astype(str).str.replace('\n', '')

# print(textosPuros)

def remove(string):
    novo = []
    for x in string:
        item = x
        for y in ['\n', '\t', '/', '.', '-', '(', ')']:
            item = item.replace(y, "")
        novo.append(item)
    return novo


# print(textosPuros)

textoMinusculo = textosPuros

# print(textoMinusculo)

textoLimpo = textoMinusculo  # [item for item in textoMinusculo if item not in ['\n', '\t']]

# textoLimpo = re.sub()
dicionario = set()
for lista in textoLimpo:
    dicionario.update(lista)
# imprime dicionario
print("Dicioonario")
# dicionario = {y.strip('\t\n.,/34567890();:-_') for y in dicionario}
print(dicionario)
# imprime as palavras limpas
print(textoLimpo)

# Atribui cada palavra a uma posicao no dicionario
totalDePalavras = len(dicionario)
tuplas = zip(dicionario, np.arange(totalDePalavras))
tradutor = {palavra: indice for palavra, indice in tuplas}

# Mostra a quantidade total de palavras
print("Total de palavras: ")
print(totalDePalavras)


def vetorizar_texto(texto, tradutor):
    vetor = [0] * len(tradutor)

    for palavra in texto:
        if palavra in tradutor:
            posicao = tradutor[palavra]
            vetor[posicao] += 1

    return vetor


# Vincula os textos quebrados a posicao no vetor
vetoresDeTexto = [vetorizar_texto(texto, tradutor) for texto in textoLimpo]
marcas = df['cat']
# Define o conjunto de dados X
X = np.array(vetoresDeTexto)
# Define o conjunto de dados Y (labels)
Y = np.array(marcas.tolist())

# Define porcentagem do treino
porcentagem_de_treino = 0.8

# Separa o tamanho do treino a partir da porcentagem
tamanho_do_treino = int(porcentagem_de_treino * len(Y))
# O restante fica para a validacao
tamanho_de_validacao = (len(Y) - tamanho_do_treino)

print("Frases disponiveis: ")
# print(len(Y))
print("Frases para treino: ")
# print(tamanho_do_treino)
print("Frase para validacao: ")
# print(tamanho_de_validacao)

# Separa os dados de treino
treino_dados = X[0:tamanho_do_treino]
# Separa as marcacoes de treino
treino_marcacoes = Y[0:tamanho_do_treino]
# Separa os dados de validacao
validacao_dados = X[tamanho_do_treino:]
# Separa as marcacoes de validacao
validacao_marcacoes = Y[tamanho_do_treino:]

print("Textos usados na validacao: ")
# print(textoLimpo[tamanho_do_treino:])
print("Validacao Marcacoes: ")
print(validacao_marcacoes)

clf = LogisticRegression()  # MultinomialNB() obtive 62% de acerto#GaussianNB()
clf.fit(treino_dados, treino_marcacoes)
# resp = clf.predict(validacao_dados)

accuracy = clf.score(validacao_dados, validacao_marcacoes)

print("Indice de acerto do algoritmo: ")
print("%.2f " % round(accuracy * 100) + "%\n")
# salvando  treino com pickle
file_name = 'treino.sav'
pickle._dump(clf, open(file_name, 'wb'))
# salvando treino com joblib
file_name_joblib = 'treino_joblib.sav'
joblib.dump(clf, file_name_joblib)
'''for cat in resp:
    print("cat {:16s}".format(cat))
'''

This above is the code for training. Below, the code that would be for prediction, with a data entry, "mine"

def predict():
    pr = pd.read_csv('csv_to_predict.csv', sep=';', header=0, usecols=[0])
    pred_cols = list(pr.columns.values)[0]
    test = ['buscar resultado']
    tvect = TfidfVectorizer(min_df=1, max_df=1)
    X_test = test #vetoresDeTexto(test, tradutor)  # tvect.transform(test)
    # carregar modelo salvo para predição com pickle
    loaded_model = pickle.load(open(file_name, 'rb'))
    # carregando modelo salvo para predição com joblib
    loaded_model_joblib = joblib.load(file_name_joblib)
    result = loaded_model_joblib.predict(X_test)
    print(result)


predict()

Error:

Traceback (most recent call last):
  File "C:/Python/categoriesTest/DataBases.py", line 167, in <module>
    predict()
  File "C:/Python/categoriesTest/DataBases.py", line 163, in predict
    result = loaded_model_joblib.predict(X_test)
  File "C:\Program Files (x86)\Python37-32\lib\site-packages\sklearn\linear_model\base.py", line 324, in predict
    scores = self.decision_function(X)
  File "C:\Program Files (x86)\Python37-32\lib\site-packages\sklearn\linear_model\base.py", line 300, in decision_function
    X = check_array(X, accept_sparse='csr')
  File "C:\Program Files (x86)\Python37-32\lib\site-packages\sklearn\utils\validation.py", line 441, in check_array
    "if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=['buscar resultado'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
    
asked by anonymous 06.09.2018 / 20:05

1 answer

1

The problem occurs in the following part of the code:

 result = loaded_model_joblib.predict(X_test)

simply change to:

result = loaded_model_joblib.predict (X_test.reshape (-1,1))

or if the error continues:

 result = loaded_model_joblib.predict(X_test.reshape(1,-1))

I will not go into detail as to why the error occurs. just start looking at the shape of your arrays. and when you do prediction in just one instance just remember that the algorithm expects an array of not two dimensions.

for example 1 line of the data frame: [1,2,3,4,5,6]

but the algorithm expects: [[1,2,3,4,5,6]]

This is a very common mistake so you will not have any more problems.

    
13.09.2018 / 16:07