Smart query with MySQL

9

I'm trying to make a more "smart" query in my DB, My question is how?

I have a query :

SELECT * from publicacao where titulo like '%$busca%';

and in my DB there are several publishing titles, for example:

Farmácia
Pharmácia
Pharmacia
Farmacia

When searching for a pharmacy, the 4 lines should be returned.

Is it possible to do this?

    
asked by anonymous 22.11.2016 / 19:22

2 answers

8

If you want to search for the phonetics of words adapted to Portuguese, execute the function:

DROP FUNCTION IF EXISTS transformar_fonetica;

DELIMITER $
CREATE FUNCTION transformar_fonetica(ptexto TEXT)
RETURNS TEXT
BEGIN
  DECLARE vtexto             TEXT;
  DECLARE vtexto_apoio       TEXT;
  DECLARE vposicao_atual     INT;
  DECLARE vcaracter_anterior VARCHAR(1);
  DECLARE vcaracter_atual    VARCHAR(1);
  DECLARE vcaracter_seguinte VARCHAR(1);
  DECLARE vsom               VARCHAR(2);
  DECLARE com_acentos        VARCHAR(65);
  DECLARE sem_acentos        VARCHAR(65);

  SET vtexto = UPPER(ptexto);

  SET com_acentos = 'ŠšŽžÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝŸÞàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿþƒ';
  SET sem_acentos = 'SsZzAAAAAAACEEEEIIIINOOOOOOUUUUYYBaaaaaaaceeeeiiiinoooooouuuuyybf';
  SET vposicao_atual = LENGTH(com_acentos);

  -- Remove acentos
  WHILE vposicao_atual > 0 DO
    SET vtexto = REPLACE(vtexto, SUBSTRING(com_acentos, vposicao_atual, 1), SUBSTRING(sem_acentos, vposicao_atual, 1));
    SET vposicao_atual = vposicao_atual - 1;
  end while;

  -- Remove caracteres inválido
  SET vposicao_atual = 1;

  WHILE vposicao_atual <= LENGTH(vtexto) DO
    SET vcaracter_atual = SUBSTRING(vtexto, vposicao_atual, 1);

    IF INSTR('ABCDEFGHIJKLMNOPQRSTUVWXYZ ', vcaracter_atual) <> 0 THEN
      SET vtexto_apoio = CONCAT(IFNULL(vtexto_apoio, ''), vcaracter_atual);
    END IF;

    SET vposicao_atual = vposicao_atual + 1;
  END WHILE;

  SET vtexto = vtexto_apoio;

  -- Substitui os mais simples
  SET vtexto = REPLACE(vtexto, 'SS', 'S');
  SET vtexto = REPLACE(vtexto, 'SH', 'X');
  SET vtexto = REPLACE(vtexto, 'XC', 'S');
  SET vtexto = REPLACE(vtexto, 'QU', 'K');
  SET vtexto = REPLACE(vtexto, 'CH', 'X');
  SET vtexto = REPLACE(vtexto, 'PH', 'F');
  SET vtexto = REPLACE(vtexto, 'LH', 'LI');

  -- Remove duplicados
  SET vposicao_atual = 1;
  SET vtexto_apoio = '';

  WHILE vposicao_atual <= LENGTH(vtexto) DO
    SET vcaracter_atual = SUBSTRING(vtexto, vposicao_atual, 1);

    IF vposicao_atual < LENGTH(vtexto) THEN
      SET vcaracter_seguinte = SUBSTRING(vtexto, vposicao_atual + 1, 1);
    ELSE -- Último caracter não tem motivo para ser verificado
      SET vcaracter_seguinte = '';
    END IF;

    IF vcaracter_atual <> vcaracter_seguinte THEN
      SET vtexto_apoio = CONCAT(vtexto_apoio, vcaracter_atual);
    END IF;

    SET vposicao_atual = vposicao_atual + 1;
  END WHILE;

  SET vtexto = vtexto_apoio;

  -- Troca caracteres pelo som
  SET vposicao_atual = 1;
  SET vtexto_apoio = '';

  WHILE vposicao_atual <= LENGTH(vtexto) DO
    SET vcaracter_atual = SUBSTRING(vtexto, vposicao_atual, 1);

    IF vposicao_atual < LENGTH(vtexto) THEN
      SET vcaracter_seguinte = SUBSTRING(vtexto, vposicao_atual + 1, 1);
    ELSE
      SET vcaracter_seguinte = '';
    END IF;

    -- "B" seguindo de qualquer caracter que não seja "A", "E", "I", "O", "U", "R" ou "Y"
    IF vcaracter_atual = 'B' AND INSTR('AEIOURY', vcaracter_seguinte) = 0 THEN
        SET vsom = 'BI';
    -- "C" seguindo de "E", "I" ou "Y"
    ELSEIF vcaracter_atual = 'C' AND INSTR('EIY', vcaracter_seguinte) <> 0 THEN
      SET vsom = 'S';
    ELSEIF vcaracter_atual = 'C' THEN
      SET vsom = 'K';
    ELSEIF vcaracter_atual = 'D'  AND INSTR('AEIOURY', vcaracter_seguinte) = 0 THEN
      SET vsom = 'DI';
    ELSEIF vcaracter_atual = 'E' THEN
      SET vsom = 'I';
    ELSEIF vcaracter_atual = 'G' AND INSTR('EIY', vcaracter_seguinte) <> 0  THEN -- GE, GI OU GY
      SET vsom = 'J';
    ELSEIF vcaracter_atual = 'G' AND vcaracter_seguinte = 'T' THEN -- GT
      SET vsom = '';
    ELSEIF vcaracter_atual = 'H' THEN
      SET vsom = 'H';
    ELSEIF vcaracter_atual = 'N' THEN
      SET vsom = 'M';
    ELSEIF vcaracter_atual = 'P' AND INSTR('AEIOURY', vcaracter_seguinte) = 0 THEN
      SET vsom = 'PI';
    ELSEIF vcaracter_atual = 'Q' THEN
      SET vsom = 'K';
    -- QUA, QUE, QUI, QUO ou QUY
    ELSEIF IFNULL(vcaracter_anterior, '') = 'Q' AND vcaracter_atual = 'U' AND INSTR('AEIOY', vcaracter_seguinte) <> 0 THEN
      SET vsom = '';
    ELSEIF vcaracter_atual = 'W' THEN
      SET vsom = 'V';
    ELSEIF vcaracter_atual = 'X' THEN
      SET vsom = 'S';
    ELSEIF vcaracter_atual = 'Y' THEN
      SET vsom = 'I';
    ELSEIF vcaracter_atual = 'Z' THEN
      SET vsom = 'S';
    ELSE
      SET vsom = vcaracter_atual;
    END IF;

    SET vcaracter_anterior = vcaracter_atual;
    SET vposicao_atual = vposicao_atual + 1;
    SET vtexto_apoio = CONCAT(vtexto_apoio, vsom);
  END WHILE;

  SET vtexto = vtexto_apoio;

  SET vtexto = CONCAT('%', replace(vtexto, ' ', '%'), '%');

  RETURN vtexto;
END
$

The usage is as follows:

SELECT *
  FROM publicacao
 WHERE transformar_fonetica(titulo) LIKE transformar_fonetica('$busca');

If there are too many records, query will have performance problems, so I'd advise creating a column for phonetics only and a trigger to update it.

Adding the new column

ALTER TABLE publicacao ADD fonetica TEXT;

Creating the update trigger

DELIMITER $
CREATE TRIGGER publicacao_fonetica AFTER UPDATE ON publicacao
FOR EACH ROW
BEGIN
  if NEW.titulo <=> OLD.titulo THEN
    UPDATE publicacao
       SET fonetica = transformar_fonetica(NEW.titulo)
     WHERE id = OLD.id;
  END IF;
END;
$

Data selection

SELECT *
  FROM publicacao
 WHERE fonetica LIKE transformar_fonetica('$busca');

Note:

The pronunciation of some letter combinations varies according to the words, so there may be discrepancies. However I have not really found an example where it does not work. The search also works for proper names.

    
23.11.2016 / 02:10
4

For simple cases, the SOUNDEX () function can help

SELECT title, SOUNDEX(title) FROM soundex_test WHERE SOUNDEX(title) like SOUNDEX('%Farmácia%');

Of course you should make some adjustments because Pharmácia is P652 and Pharmacy is F652.

But it helps a lot in cases where phonetics can have some similarity. Example

SELECT title, SOUNDEX(title) FROM soundex_test WHERE SOUNDEX(title) like SOUNDEX('%drug%');

This will return records containing terms such as

drug
droug
droog

If you want to test, you can use these test tables:

CREATE TABLE 'soundex_test' (
  'id' int(10) unsigned NOT NULL AUTO_INCREMENT,
  'title' varchar(45) NOT NULL,
  PRIMARY KEY ('id')
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

INSERT INTO 'soundex_test' VALUES ('1', 'Farmácia');
INSERT INTO 'soundex_test' VALUES ('4', 'Pharmácia');
INSERT INTO 'soundex_test' VALUES ('45', 'aucatra');
INSERT INTO 'soundex_test' VALUES ('55', 'alcatara');
INSERT INTO 'soundex_test' VALUES ('56', 'alctra');
INSERT INTO 'soundex_test' VALUES ('71', 'Pharmacia');
INSERT INTO 'soundex_test' VALUES ('72', 'Farmacia');
INSERT INTO 'soundex_test' VALUES ('73', 'Memory Card');
INSERT INTO 'soundex_test' VALUES ('79', 'Gamepad');
INSERT INTO 'soundex_test' VALUES ('80', 'drug');
INSERT INTO 'soundex_test' VALUES ('454', 'droug');
INSERT INTO 'soundex_test' VALUES ('565', 'alcatra');
INSERT INTO 'soundex_test' VALUES ('3445', 'droog');
INSERT INTO 'soundex_test' VALUES ('5656', 'acatra');



SELECT title, SOUNDEX(title) FROM soundex_test WHERE SOUNDEX(title) like SOUNDEX('%alcatra%');

Return:

alcatara
alctra
alcatra

Unfortunately it does not return aucatra because the default is English phonetic. To learn more about soundex: link

Be aware that this is just an alternative and not a definitive solution.

Another solution is to create dictionary tables and then you manually register the variations. Of course this is a colossal job and takes years, but that's basically what Google does. Another tip is to always log what users type in searches and you will notice certain patterns of typos, grammar errors, etc. Take advantage of these logs to compose the dictionary, relating them to the correct term or relating them to each other.

    
23.11.2016 / 06:39