I have a collection in MongoDB of Tweets, these records have a field called text, in this field I need to delete records that have the same value besides removing single, double, commas, and line breaks. For the removal of the data with the duplicate text field I'm trying as follows:
var registro;
db.getCollection('TweetsBR_1_copy').find().forEach( function(myDoc) {
db.getCollection('TweetsBR_1_copy').find({"text": myDoc.text}).forEach( function(myDoc_2) {
registro = db.getCollection('TweetsBR_1_copy').findOne({text:myDoc_2.text})
db.getCollection('TweetsBR_1_copy').remove(registro)
print("registro excluido:")
print(registro.text)
});
db.getCollection('TweetsBR_1_copy').insert(registro)
print("registro inserido:")
print(registro.text)
});
But I'm noticing that every time I run the command it deletes more and more records, so I'm not sure it's working properly.
The collection has around 500K
of records.
Can anyone help me with this?
Thank you.