I'm doing some research in the area of feelings analysis, so I'm doing some testing on a text database to get results. I was looking for tutorials among other forms of information on the internet and I came to the conclusion that the scikit library of python is well used. However, I'm having trouble getting this library to work. Any help is welcome.
Init class
import codecs
import baseline
def loadContent():
positiveData = codecs.open('opinioesNegativas.txt', 'r', encoding='utf8').readlines()
file = codecs.open('opinioesPositivas.txt', 'r', encoding='utf8')
negativeDate = file.readlines()
data_set = [0 for i in range(2000)]
label_set = [0 for i in range(2000)]
data_set[:1000] = positiveData
data_set[1000:] = negativeDate
for i in range(2000):
if i < 1000:
label_set[i] = "p"
else:
label_set[i] = "n"
return data_set, label_set
def run_baseline():
# getting the data#
data_set, label_set = loadContent()
baseline_classifier = baseline
# Pre-processing and setting the data to train and test model#
data_set = baseline_classifier.data_TFIDF_transform(data_set)
# data_set = baseline_classifier.data_transform(data_set)
folds = 10
scores = baseline_classifier.runKFoldCrossValitation(data_set, label_set, folds)
return scores
scores = run_baseline()
print(scores)
print("Baseline Accuracy: {} +/- {}".format(scores.mean(), scores.std() ** 2))
print(scores)
print("Stylometric Accuracy: {} +/- {}".format(scores.mean(), scores.std() ** 2))
Baseline Class
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
classifier = OneVsOneClassifier(SVC(kernel='linear', random_state=84, probability=True))
# training method #
def buildModel(train, labels):
# train_transformed = tf_idf.fit_transform(train)
classifier.fit(train, labels)
# predicted method #
def predict(test_data):
# test_transformed = tf_idf.fit_transform(test_data)
return classifier.predict(test_data)
# Pre-processing and setting the data to train and test model#
def data_transform(data_set):
transform = CountVectorizer(ngram_range=(1, 1))
data_set = transform.fit_transform(data_set)
return data_set
def data_TFIDF_transform(data_set):
tf_idf = TfidfVectorizer(ngram_range=(1, 1))
data_set = tf_idf.fit_transform(data_set)
return data_set
def runKFoldCrossValitation(data_set: object, label_set: object, folds: object) -> object:
classifier = OneVsOneClassifier(SVC(kernel='linear', random_state=84, probability=True))
# Split Data
train_data, test_data, train_label, test_label = train_test_split(data_set, label_set, test_size=0.1,
random_state=0)
# Class Stratified 10-fold Cross Validation
skf = StratifiedShuffleSplit(n_splits=folds)
# Cross Validation
scores = cross_val_score(classifier, test_data, test_label, cv=skf)
return scores
Error submitted
Traceback (most recent call last):
File "C:/Users/Jeferson/PycharmProjects/NewProject/TestePython.py", line 40, in <module>
scores = run_baseline()
File "C:/Users/Jeferson/PycharmProjects/NewProject/TestePython.py", line 36, in run_baseline
scores = baseline_classifier.runKFoldCrossValitation(data_set, label_set, folds)
File "C:\Users\Jeferson\PycharmProjects\NewProject\baseline.py", line 42, in runKFoldCrossValitation
skf = StratifiedShuffleSplit(n_splits=folds)
TypeError: __init__() got an unexpected keyword argument 'n_splits'