Remove single quotes, double quotes, commas, and line breaks from a field, and delete records with duplicate values in that field, in MongoDB

0

I have a MongoDB collection of tweets. Each record has a field called text. I need to delete the records that have the same value in this field, and also remove single quotes, double quotes, commas, and line breaks from it. To remove the records with a duplicate text field, I am trying the following:

// Shared across both callbacks: assigned inside the inner loop and read again
// by the outer callback after the inner loop finishes.
var registro;
// Outer loop: visit every document returned by an unfiltered find() on TweetsBR_1_copy.
db.getCollection('TweetsBR_1_copy').find().forEach( function(myDoc) {
    // Inner loop: query the same collection for documents whose "text" equals
    // the current document's text. The filter does not exclude myDoc's own _id.
    db.getCollection('TweetsBR_1_copy').find({"text": myDoc.text}).forEach( function(myDoc_2) {
        // Fetch a single document with that text (a third query on the same
        // collection) and keep it in the shared variable.
        // NOTE(review): presumably findOne yields null when nothing matches any
        // more, in which case registro.text below would throw — confirm.
        registro = db.getCollection('TweetsBR_1_copy').findOne({text:myDoc_2.text})
        // The fetched document itself is passed to remove() as the query.
        db.getCollection('TweetsBR_1_copy').remove(registro)
        print("registro excluido:")
        print(registro.text)
    });
    // Re-insert whatever registro holds after the inner loop. This statement is
    // inside the outer callback, so it runs once per document visited by the
    // outer loop, while that loop is still iterating the same collection.
    db.getCollection('TweetsBR_1_copy').insert(registro)
    print("registro inserido:")
    print(registro.text)
});

But I'm noticing that every time I run the command it deletes more and more records, so I'm not sure it's working properly. The collection has around 500K records.

Can anyone help me with this?

Thank you.

    
asked by anonymous 10.03.2017 / 16:58

1 answer

1

I will copy your code and add comments, to try to understand what you are doing and help you:

var registro;

// for each document in the "TweetsBR_1_copy" collection

db.getCollection('TweetsBR_1_copy').find().forEach( function(myDoc) {

    // look up, in the same collection, one (or several) document(s) that have the same value in the "text" field

    db.getCollection('TweetsBR_1_copy').find({"text": myDoc.text}).forEach( function(myDoc_2) {

        // query again, but for only a single document this time

        registro = db.getCollection('TweetsBR_1_copy').findOne({text:myDoc_2.text})

        // remove the document from the collection

        db.getCollection('TweetsBR_1_copy').remove(registro)
        print("registro excluido:")
        print(registro.text)
    });

    // insert the document back into the collection; this runs as many times as there are documents in the collection (you are still inside the first forEach here)

    db.getCollection('TweetsBR_1_copy').insert(registro)
    print("registro inserido:")
    print(registro.text)
});

You are deleting each document and then inserting it again, which is why you delete more records each time you run the script. Try something like this instead:

/**
 * Moves duplicate documents out of a collection, keeping the first document
 * seen for each distinct "text" value.
 *
 * The collection is read in a single pass. The first document carrying a given
 * text is left untouched; every later document with the same text is copied to
 * the backup collection and then removed from the source collection. Documents
 * whose "text" is not a string are left untouched.
 *
 * Memory use grows with the number of distinct texts, because each one is
 * remembered for the duration of the pass.
 *
 * @param {object} database   Database handle exposing getCollection(name).
 * @param {string} sourceName Name of the collection to de-duplicate.
 * @param {string} backupName Name of the collection that receives the removed duplicates.
 * @returns {number} How many documents were moved to the backup collection.
 */
function moveDuplicateTweets(database, sourceName, backupName) {
    var source = database.getCollection(sourceName);
    var backup = database.getCollection(backupName);

    // Texts already kept. A prototype-less object is used so that a tweet whose
    // text is, for example, "constructor" is not mistaken for an inherited property.
    var seenTexts = Object.create(null);
    var moved = 0;

    source.find().forEach(function (myDoc) {
        // Without a string "text" there is nothing to compare: leave the document alone.
        if (typeof myDoc.text !== "string") {
            return;
        }

        // First document with this text: remember it and keep it.
        if (seenTexts[myDoc.text] !== true) {
            seenTexts[myDoc.text] = true;
            return;
        }

        // Back up BEFORE removing, and only remove when the backup succeeded,
        // so a failed insert never loses the document.
        var result = backup.insert(myDoc);
        if (result && typeof result.hasWriteError === "function" && result.hasWriteError()) {
            print("erro ao inserir na collection excluidos, registro mantido:");
            print(myDoc.text);
            return;
        }
        print("registro inserido na collection excluidos:");
        print(myDoc.text);

        // Remove by _id so exactly this document is deleted, never the kept one.
        source.remove({ _id: myDoc._id });
        print("registro excluido:");
        print(myDoc.text);

        moved += 1;
    });

    return moved;
}

// In the mongo shell `db` is the current database handle. The guard only lets
// the function be loaded without running where no such global exists.
if (typeof db !== "undefined") {
    moveDuplicateTweets(db, 'TweetsBR_1_copy', 'TweetsBR_1_excluidos');
}
    
13.05.2017 / 20:44