1st Option - Fuzzy Lookup Add-In for Excel
You can use Fuzzy Lookup Add-In for Excel or create the Fuzzy logic itself in a program. Search for Diffuse Search or in English fuzzy search / lookup .
The Fuzzy Lookup Add-in for Excel was developed by Microsoft Research and a fuzzy match of text data in Microsoft Excel. It can be used to identify lines of fuzzy duplicates within a single table or for confusing junctions in similar queues between two different tables. Matching is robust for a wide variety of errors, including misspellings, abbreviations, synonyms, and missing / added data. For example, you may detect that the lines "Mr. Andrew Hill", "Hill, Andrew R." and "Andy Hill" refer to the same underlying entity, returning to the similarity score with each match. Although the default setting works well for a wide variety of textual data, such as product names or customer addresses, matching can also be customized for specific domains or languages. (Translation by Google Translator.)
2nd Option - VBA Code
This code has been tested in Excel VBA and the credits are on this page .
Public Function Similarity(ByVal String1 As String, _
ByVal String2 As String, _
Optional ByRef RetMatch As String, _
Optional min_match = 1) As Single
Dim b1() As Byte, b2() As Byte
Dim lngLen1 As Long, lngLen2 As Long
Dim lngResult As Long
If UCase(String1) = UCase(String2) Then
Similarity = 1
Else:
lngLen1 = Len(String1)
lngLen2 = Len(String2)
If (lngLen1 = 0) Or (lngLen2 = 0) Then
Similarity = 0
Else:
b1() = StrConv(UCase(String1), vbFromUnicode)
b2() = StrConv(UCase(String2), vbFromUnicode)
lngResult = Similarity_sub(0, lngLen1 - 1, _
0, lngLen2 - 1, _
b1, b2, _
String1, _
RetMatch, _
min_match)
Erase b1
Erase b2
If lngLen1 >= lngLen2 Then
Similarity = lngResult / lngLen1
Else
Similarity = lngResult / lngLen2
End If
End If
End If
End Function
Private Function Similarity_sub(ByVal start1 As Long, ByVal end1 As Long, _
ByVal start2 As Long, ByVal end2 As Long, _
ByRef b1() As Byte, ByRef b2() As Byte, _
ByVal FirstString As String, _
ByRef RetMatch As String, _
ByVal min_match As Long, _
Optional recur_level As Integer = 0) As Long
'* CALLED BY: Similarity *(RECURSIVE)
Dim lngCurr1 As Long, lngCurr2 As Long
Dim lngMatchAt1 As Long, lngMatchAt2 As Long
Dim I As Long
Dim lngLongestMatch As Long, lngLocalLongestMatch As Long
Dim strRetMatch1 As String, strRetMatch2 As String
If (start1 > end1) Or (start1 < 0) Or (end1 - start1 + 1 < min_match) _
Or (start2 > end2) Or (start2 < 0) Or (end2 - start2 + 1 < min_match) Then
Exit Function '(exit if start/end is out of string, or length is too short)
End If
For lngCurr1 = start1 To end1
For lngCurr2 = start2 To end2
I = 0
Do Until b1(lngCurr1 + I) <> b2(lngCurr2 + I)
I = I + 1
If I > lngLongestMatch Then
lngMatchAt1 = lngCurr1
lngMatchAt2 = lngCurr2
lngLongestMatch = I
End If
If (lngCurr1 + I) > end1 Or (lngCurr2 + I) > end2 Then Exit Do
Loop
Next lngCurr2
Next lngCurr1
If lngLongestMatch < min_match Then Exit Function
lngLocalLongestMatch = lngLongestMatch
RetMatch = ""
lngLongestMatch = lngLongestMatch _
+ Similarity_sub(start1, lngMatchAt1 - 1, _
start2, lngMatchAt2 - 1, _
b1, b2, _
FirstString, _
strRetMatch1, _
min_match, _
recur_level + 1)
If strRetMatch1 <> "" Then
RetMatch = RetMatch & strRetMatch1 & "*"
Else
RetMatch = RetMatch & IIf(recur_level = 0 _
And lngLocalLongestMatch > 0 _
And (lngMatchAt1 > 1 Or lngMatchAt2 > 1) _
, "*", "")
End If
RetMatch = RetMatch & Mid$(FirstString, lngMatchAt1 + 1, lngLocalLongestMatch)
lngLongestMatch = lngLongestMatch _
+ Similarity_sub(lngMatchAt1 + lngLocalLongestMatch, end1, _
lngMatchAt2 + lngLocalLongestMatch, end2, _
b1, b2, _
FirstString, _
strRetMatch2, _
min_match, _
recur_level + 1)
If strRetMatch2 <> "" Then
RetMatch = RetMatch & "*" & strRetMatch2
Else
RetMatch = RetMatch & IIf(recur_level = 0 _
And lngLocalLongestMatch > 0 _
And ((lngMatchAt1 + lngLocalLongestMatch < end1) _
Or (lngMatchAt2 + lngLocalLongestMatch < end2)) _
, "*", "")
End If
Similarity_sub = lngLongestMatch
End Function
That a simple test is seen in the following image:
Thatmeansbothhaveasimilarityof94.176%.
Thisfunction(Similarity
)canbeusedtocompareadatabaseandfromaminimumsetpointduplicatescanbefoundbysimilarityinpercentage.Thisanalysiscanbetime-consumingonalargebasis,requiringgoodqualityprogrammingandanemphasisonperformanceifyouwantbetterprocessingtime.
3rdOption-LevenshteinDistance
Credits: sysmod
Private Declare Function GetTickCount Lib "kernel32.dll" () As Long
Option Explicit
'Option Base 0 assumed
Sub testLevenshtein()
Dim s1 As String, s2 As String, lTime As Long, i As Long, teste As Long
s1 = "Luiz Carlos Silva"
s2 = "Luis Carlos Silva"
lTime = GetTickCount()
teste = LevenshteinB(s1, s2)
Debug.Print GetTickCount - lTime; " ms" ' 234 ms
Debug.Print teste
End Sub
'POB: fn with byte array and inline MIN code is 17 times faster
Function LevenshteinB(ByVal string1 As String, ByVal string2 As String) As Long
'http://www.sysmod.com/modLevenshtein.bas
Dim i As Long, j As Long, ByteArray1() As Byte, ByteArray2() As Byte
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long
Dim min1 As Long, min2 As Long, min3 As Long
Const UseWSMIN = False
string1_length = Len(string1)
string2_length = Len(string2)
ReDim distance(string1_length, string2_length)
ByteArray1 = string1
ByteArray2 = string2
For i = 0 To string1_length
distance(i, 0) = i
Next
For j = 0 To string2_length
distance(0, j) = j
Next
For i = 1 To string1_length
For j = 1 To string2_length
'Unicode, compare both even and odd bytes
If ByteArray1((i - 1) * 2) = ByteArray2((j - 1) * 2) And _
ByteArray1((i - 1) * 2 + 1) = ByteArray2((j - 1) * 2 + 1) Then
distance(i, j) = distance(i - 1, j - 1)
Else
If UseWSMIN Then
distance(i, j) = WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
Else
' spell it out, 50 times faster than worksheetfunction.min
min1 = distance(i - 1, j) + 1
min2 = distance(i, j - 1) + 1
min3 = distance(i - 1, j - 1) + 1
If min1 <= min2 And min1 <= min3 Then
distance(i, j) = min1
ElseIf min2 <= min1 And min2 <= min3 Then
distance(i, j) = min2
Else
distance(i, j) = min3
End If
End If
End If
Next
Next
LevenshteinB = distance(string1_length, string2_length)
End Function
Which returns the result of the image:
Inthistesttheinformationthatappearsishowmanycharactersaredifferentbetweenthetwostringsandthetimeforacomparisonthatislessthan1ms,for100iterationsittook16ms.
EDIT:
Asamplecodetoworkwiththesimilarityfunction:
SubtestSimilaridade()DimLastRowAsLong,iAsLongDimArr()AsVariant,NewArr()AsVariantDimNamesAsWorksheet,wsAsWorksheetDimSimilaridadeAsSingle,LimiteAsSingleSetNames=ThisWorkbook.Worksheets("Names")
SheetKiller ("NewNames")
Set ws = Sheets.Add
ws.Name = "NewNames"
LastRow = Names.Cells(Names.Rows.Count, "A").End(xlUp).Row
Arr = Names.Range("a2", Names.Cells(LastRow, 1))
NewArr = Arr
Limite = 0.9
For i = LBound(Arr) To UBound(Arr)
If Not i = UBound(Arr) Then x = i + 1
For k = x To UBound(Arr)
Similaridade = Similarity(CStr(Arr(i, 1)), CStr(Arr(k, 1)))
If Similaridade > Limite Then
NewArr(k, 1) = ""
End If
Next k
Next i
For i = LBound(Arr) To UBound(Arr)
ws.Cells(i, 1) = NewArr(i, 1)
Next i
ws.Range("A:A").Sort Key1:=ws.Range("A2"), Order1:=xlAscending, Header:=xlYes
End Sub
Public Function SheetKiller(Name As String)
Dim s As Worksheet, t As String
Dim i As Long, k As Long
k = Sheets.Count
For i = k To 1 Step -1
t = Sheets(i).Name
If t = Name Then
Application.DisplayAlerts = False
Sheets(i).Delete
Application.DisplayAlerts = True
End If
Next i
End Function
Results:
With this sample data:
Theresultisasfollows:
AnotherResultTest
Atestcanbedonewiththe database nickname- and-diminutive-names-lookup ", which with 2266 entries, output was 1509 records. Performing in less than 60 seconds.
To perform data parsing, copy and paste the raw data in cell A2. And then run the following code:
Sub test()
Dim ws As Worksheet, source As Worksheet
Dim LastRowA As Long, LastRowB As Long, i As Long, k As Long
Dim strCell As String
SheetKiller ("Names")
Set ws = Sheets.Add
ws.Name = "Names"
Set source = ThisWorkbook.Worksheets("Planilha1")
LastRowA = source.Cells(source.Rows.Count, "A").End(xlUp).Row
For i = 1 To LastRowA
strCell = CStr(source.Cells(i, 1))
Count = Len(strCell) - Len(Replace(strCell, ",", ""))
For k = 1 To Count
LastRowB = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
ws.Cells(LastRowB + 1, 1) = EXTRACTELEMENT(strCell, k, ",")
Next k
Next i
End Sub
Function EXTRACTELEMENT(Txt As String, n, Separator As String) As String
On Error GoTo ErrHandler:
EXTRACTELEMENT = Split(Application.Trim(Mid(Txt, 1)), Separator)(n - 1)
Exit Function
ErrHandler:
' error handling code
EXTRACTELEMENT = CVErr(xlErrNA)
On Error GoTo 0
End Function