<?xml version="1.0" encoding="utf-16"?>
<ANATELLA version='2.93'>
<GlobalParameters wDirLoc='1' dataDirWrite=':/'>
</GlobalParameters>
<WorkingSpace>AAAA/wAAAAD9AAAAAgAAAAEAAAEPAAACNfwCAAAAAfsAAAAcAEEAbABsAEEAYwB0AGkAbwBuAHMARABvAGMAawEAAABeAAACNQAAAHgA////AAAAAwAACNUAAAG2/AEAAAAC+wAAABoARABhAHQAYQBUAGEAYgBsAGUARABvAGMAawEAAAAAAAADuAAAAiIA/////AAAA74AAAUXAAAB1gD////6AAAAAAEAAAAC+wAAACgAQQBjAHQAaQBvAG4AUAByAG8AcABlAHIAdABpAGUAcwBEAG8AYwBrAQAAAAD/////AAAB1gD////7AAAADgBMAG8AZwBEAG8AYwBrAQAAA8QAAAO8AAAAhQD///8AAAfAAAACNQAAAAQAAAAEAAAACAAAAAj8AAAAAQAAAAIAAAADAAAAFgBtAGEAaQBuAFQAbwBvAGwAQgBhAHIBAAAAAP////8AAAAAAAAAAAAAABYAdABlAHgAdABUAG8AbwBsAEIAYQByAQAAA7L/////AAAAAAAAAAAAAAAeAGcAcgBvAHUAcABCAG8AeABUAG8AbwBsAEIAYQByAQAABof/////AAAAAAAAAAA=</WorkingSpace>
<ACTIONS>
<Generic keyHD='167449898418900308' forceHDBuffering='1' idx='5' x='-180' y='-48' libs='BagOfWord2' id='fuzzyJoin'
  longName='fuzzy join 2 tables'
  nPinIn='2'
  nPinInMin='2'
  le='1516711883018'
  keywords='fuzzy join word text jaro winkler damerau leven dice'
  image='/fuzzyJoin.svg'
  author='Frank Vanden Berghen'
  revision='0.08'
  tags='350_Text Mining, 100_Join Tables'
  pdfDestination='5_4_8_fuzzy_join_javascript_act'>
<Description>

</Description>
<Parameters>
  <Parameter id='idKeyToFind' text='Key in Master Table' type='onecolumn'><meta>0</meta><data>Client_Name</data></Parameter>
  <Parameter id='idRefKey' text='Key in Slave Table' type='onecolumn'><meta>1</meta><data>Client_Name</data></Parameter>
  <Parameter id='idRefId' text='Column to Join in Slave Table' type='manycolumns'><meta>1</meta><data><c>ID</c></data></Parameter>
  <Parameter id='idKNNSize' text='Find k-NN. K=?' type='double'>3</Parameter>
  <Parameter id='idType' text='Type of similarity' type='combobox'><meta>&lt;i&gt;Damereau LevenStein similarity&lt;/i&gt;
&lt;i&gt;Jaro Winkler similarity&lt;/i&gt;
&lt;i&gt;Dice Coefficient similarity&lt;/i&gt;
&lt;i&gt;Damereau LevenStein distance&lt;/i&gt;</meta><data>2</data></Parameter>
  <Parameter id='idPartition1' text='Parition Var in Main Table (optional)' type='onecolumn'><meta>0</meta><data></data></Parameter>
  <Parameter id='idPartition2' text='Parition Var in Reference (optional)' type='onecolumn'><meta>1</meta><data></data></Parameter>
  <Parameter id='idPrefix' text='Prefix to add on joined column name (optional)' type='string'></Parameter>
</Parameters>
<Script>
var dm,jw,dc;
var isFirstRun,refKeys,refID,refHash,mr,kb,nMax,ii,nn;
var curPartition;

function init()
{
   var inRowMaster=getCurrentRow(0);
   if (inRowMaster.isNull) return 1;
   var r=getCurrentRow(1);

   isFirstRun=true;
   nn=idRefId.length;
   var i=idKNNSize*(2+nn),j=0,k;
   setOutputRowSize(0,inRowMaster.nColumn+i);
   mr=new Row(i);
   for(i=0;i&lt;idKNNSize;i++)
   {
        rowSetColumnName(mr,j,"ClosestKey_"+(i+1)); 
        j++;
         if (idType!=3) rowSetColumnName(mr,j,"Similarity_"+(i+1));
        else rowSetColumnName(mr,j,"Distance_"+(i+1));
        rowSetMetaType(mr,j,'F'); 
        j++;
        for(k=0;k&lt;nn;k++)
        {
            rowSetColumnName(mr,j,idPrefix+rowGetColumnName(r,idRefId[k])+"_"+(i+1));
            rowSetMetaType(mr,j,rowGetMetaType(r,idRefId[k]));
            j++;
        }
   }
   kb=new KeepBests(idKNNSize);
   dm=new DamerauLeven();
   jw=new JaroWinkler();
   dc=new DiceCoefficient();

   if (idPartition1>=0)
   {
        if (idPartition2&lt;0) throw "Erronous Partition Var parameter (1)";

        curPartition="__"+inRowMaster.col(idPartition1);

        r=getNextRow(1);

        if (rowGetMetaType(inRowMaster,idPartition1)=='U')
        {
            if (rowGetSortType(inRowMaster)!='A')
                throw "Unsupported sort type in Master table: only alphabetic sort is supported when the partition column is a string";
            if (rowGetSortType(r)!='A')
                throw "Unsupported sort type in Reference table: only alphabetic sort is supported when the partition column is a string";
        } else
        {
            if (rowGetSortType(inRowMaster)!='0')
                throw "Unsupported sort type in Master table: only numeric sort is supported when the partition column is a number";
            if (rowGetSortType(r)!='0')
                throw "Unsupported sort type in Reference table: only numeric sort is supported when the partition column is a number";
        }
        if (rowGetSortColumnIdx(inRowMaster,0)!=idPartition1)
            throw "Master table not sorted on partition var";
        if (rowGetSortColumnIdx(r,0)!=idPartition2)
            throw "Slave table not sorted on partition var";
       
   } else if (idPartition2>=0) throw "Erronous Partition Var parameter (2)";
   return 0;
}

function saveSlaveTableRow(r)
{
    s=r.col(idRefKey);
    refHash[s]=ii; ii++;
    refKeys.push(s); 
    if (nn==1) { refID.push(r.col(idRefId[0])); return; }
    if (!nn) return;
    var a=new Array(nn);
    var i=nn;
    while(i--) a[i]=r.col(idRefId[i]);
    refID.push(a);
}

function outputSlaveTableRow(k,jj)
{
    if (nn==1) { mr.setColumn(jj,refID[k]); return; }
    if (!nn) return;
    var a=refID[k],j;
    for(j=0;j&lt;nn;j++)
    {
        mr.setColumn(jj,a[j]); 
        jj++;
    }
}

function run()
{
   var inRowMaster=getNextRow();
   if (inRowMaster.isNull) return 1;

   if (idPartition1>=0)
   {
        var p=inRowMaster.col(idPartition1);
        if (p!=curPartition)
        {
            var r=getCurrentRow(1);
            if ((r.isNull)||(r.col(idPartition2)>p))
            {
                inRowMaster.write(); rowWriteNull(mr); writeEOL(); return 0;
            } else
            {
                if (r.col(idPartition2)&lt;p)
                {
                    var s;
                    do
                    {
                        r=getNextRow(1);
                         if (r.isNull) 
                        {
                            inRowMaster.write(); rowWriteNull(mr); writeEOL(); return 0;
                        }
                        if (userAskedAbort()) return 1;
                        s=r.col(idPartition2);
                    } while (s&lt;p)
                    if (s!=p)
                    {
                        inRowMaster.write(); rowWriteNull(mr); writeEOL(); return 0;
                    }
                }
            }
            curPartition=p;
            refKeys=[]; refID=[]; refHash=[]; ii=0;
            for(;;)
            {
                saveSlaveTableRow(r);
                 r=getNextRow(1);
                 if (r.isNull) break;
                 if (r.col(idPartition2)!=curPartition) break;
                 if (userAskedAbort()) return 1;
             }

             nMax=idKNNSize;
             if (refKeys.length&lt;nMax) nMax=refKeys.length;
        }
   } else
   {
       if (isFirstRun)
       {
            isFirstRun=false;
           refKeys=[]; refID=[]; refHash=[]; ii=0;
           var r;
           for(;;)
           {
                 r=getNextRow(1);
                 if (r.isNull) break;
                 if (userAskedAbort()) return 1;
                saveSlaveTableRow(r);
            }
            if (idKNNSize>refKeys.length) throw "K is too big compared to Reference Table size";
            nMax=idKNNSize;
       }
    }
   inRowMaster.write();
  var i,j=0,jj=0,jjj;
  var keyMain=inRowMaster.col(idKeyToFind),k=refHash[keyMain];
   if (k!=null)
   {
        mr.setColumn(0,refKeys[k]);
        if (idType!=3) mr.setColumn(1,1); else mr.setColumn(1,0);
        outputSlaveTableRow(k,2);
        if (idKNNSize==1) 
        {
            mr.write(); writeEOL();
            return 0;
        }
        j=1;
        jj=2+nn;
   }

   kb.reset();
   i=refKeys.length;
   switch(idType)
   {
        case 0: 
            while(i--)
            {
                if (i==k) continue;
                kb.add(dm.similarity(keyMain,refKeys[i]),i); 
            }
            break;
        case 1: 
            while(i--)
            {
                if (i==k) continue;
                kb.add(jw.similarity(keyMain,refKeys[i]),i); 
            }
            break;
        case 2: 
            while(i--)
            {
                if (i==k) continue;
                kb.add(dc.similarity(keyMain,refKeys[i]),i); 
            }
            break;
        case 3: 
            while(i--)
            {
                if (i==k) continue;
                kb.add(-dm.distance(keyMain,refKeys[i]),i); 
            }
            break;
   }

   i=0;
   for(;j&lt;nMax;j++)
   {
        k=kb.getValue(i);
        mr.setColumn(jj,refKeys[k]);
        jj++;
        if (idType!=3) mr.setColumn(jj,kb.getKey(i));
        else                mr.setColumn(jj,-kb.getKey(i));
        jj++;
        outputSlaveTableRow(k,jj);
        jj+=nn;
        i++;
   }
   for(;j&lt;idKNNSize;j++)
   {
        i=nn+2;
        while(i--)
        {
            mr.setColumn(jj); jj++;
        }
   }
   mr.write(); writeEOL();
   return 0;
}
</Script>
</Generic>
<Unflatten module='DefaultActions' idx='7' x='-48' y='-48' addSetName='0' setColumnName='proposition'>
 <common><c>ID</c><c>Client_Name</c></common>
 <sets>
  <set name='Set_2'><c>ClosestKey_2</c><c>Similarity_2</c><c>ID_2</c></set>
  <set name='Set_3'><c>ClosestKey_3</c><c>Similarity_3</c><c>ID_3</c></set>
 </sets>
</Unflatten>
<FilterRows keyHD='166843663719228174' module='MathParser' idx='12' x='84' y='-48'>
 <Expression>Similarity>=0.6
  &amp;&amp;
ID != ID2</Expression>
 <InputVars>
    <InputVar column='Similarity_2' label='Similarity' meta='F'/>
    <InputVar column='ID'/>
    <InputVar column='ID_2' label='ID2'/>
 </InputVars>
</FilterRows>
<writeExcel module='WriteReport' idx='40' x='888' y='-48' fileNameDestination=':/duplicateCorrectionsForAgencyX.xlsx' writeAllRows='0' testCol='New_Agency' testValue='X' nrIncludeTitleRow='1' dateFormatExcel='4'/>
<Generic idx='41' x='216' y='-48' id='FindConnectedGroups'
  longName='find groups'
  nPinInMin='2'
  le='1668377489260'
  keywords='compute connected group undirected graph sna network node telecom'
  image='/FindConnectedGroups.svg'
  author='Frank Vanden Berghen'
  revision='0.04'
  tags='300_Graph Mining'>
<Description>
Find the nodes that belongs to the same connected group in an undirected graph
</Description>
<Parameters>
  <Parameter id='idA' text='ID Node A' type='onecolumn'><meta>0</meta><data>ID</data></Parameter>
  <Parameter id='idB' text='ID node B' type='onecolumn'><meta>0</meta><data>ID_2</data></Parameter>
  <Parameter id='idGroup' text='Arc A-B belongs to group? (optional)' type='onecolumn'><meta>0</meta><data></data></Parameter>
  <Parameter id='isNumberNodes' text='A and B are SMALL numbers (faster)' type='bool'>0</Parameter>
</Parameters>
<Script>
var mr,results,isFirstRun,posR,groups,v,vn,nGroups;

function init()
{
    setOutputRowSize(0,2);
    mr=new Row(2);
    rowSetColumnName(mr,0,"Node");
    rowSetColumnName(mr,1,"Group");
    if (isNumberNodes) rowSetMetaType(mr,0,'K');
    if ((idGroup&lt;0)&amp;&amp;(isNumberNodes)) rowSetMetaType(mr,1,'K');
    results=[]; posR=0;
    groups={}; nGroups=0;
    v={}; vn=[];
    isFirstRun=true;
    return 0;
}


function findHeadOfTree(topA2)
{
    if (typeof v[topA2]=="undefined") return topA2;
    var topA,topA3;
    for(;;)
    {
        topA3=topA2;
        topA=v[topA2];
        if (topA==topA2) return topA;
        topA2=v[topA];
        if (topA==topA2) return topA;
        v[topA3]=topA2;
    }
}

function findHeadOfTreeNumber(topA2)
{
    if (typeof vn[topA2]=="undefined") return topA2;
    var topA,topA3;
    for(;;)
    {
        topA3=topA2;
        topA=vn[topA2];
        if (topA==topA2) return topA;
        topA2=vn[topA];
        if (topA==topA2) return topA;
        vn[topA3]=topA2;
    }
}
function run()
{
    if (!isFirstRun)
    {
        var a=results[posR],topA;
        if (isNumberNodes) topA=findHeadOfTreeNumber(a);
        else topA=findHeadOfTree(a);
        mr.setColumn(0,a);
         if (idGroup>=0) mr.setColumn(1,groups[topA]);
        else mr.setColumn(1,topA);
        mr.write(); 
        writeEOL();
        posR++;
        return posR==results.length;
    }

    isFirstRun=false;
    var a,b,topA,topB;
    if (isNumberNodes)
    {
        for(;;)
        {
            if (userAskedAbort()) return 1;
            var r=getNextRow(0);
            if (r.isNull) break;
    
            a=Number(r.col(idA));
            b=Number(r.col(idB));
    
            topA=findHeadOfTreeNumber(a);
            topB=findHeadOfTreeNumber(b);
            if (topA!=topB)
            {
                if ((a==topA)&amp;&amp;(typeof vn[a]=="undefined")) results.push(a);
                if ((b==topB)&amp;&amp;(typeof vn[b]=="undefined")) results.push(b);
                if ((idGroup>=0)&amp;&amp;(typeof groups[topA]=="undefined")) groups[topA]=r.col(idGroup);
                vn[a]=topA; vn[b]=topA; vn[topB]=topA;
            }
        }
        return results.length==0;
    }
    for(;;)
    {
        if (userAskedAbort()) return 1;
        var r=getNextRow(0);
        if (r.isNull) break;

        a=r.col(idA);
        b=r.col(idB);

        topA=findHeadOfTree(a);
        topB=findHeadOfTree(b);
        if (topA!=topB)
        {
            if ((a==topA)&amp;&amp;(typeof v[a]=="undefined")) results.push(a);
            if ((b==topB)&amp;&amp;(typeof v[b]=="undefined")) results.push(b);
            if ((idGroup>=0)&amp;&amp;(typeof groups[topA]=="undefined")) groups[topA]=r.col(idGroup);
            v[a]=topA; v[b]=topA; v[topB]=topA;
        }
    }
    return results.length==0;
}
</Script>
</Generic>
<inlineTable keyHD='167449898418900309' forceHDBuffering='1' idx='47' x='-312' y='-48'>
<ColumnNames><c>ID</c><c>Client_Name</c><c>Agency</c></ColumnNames>
<Rows>
 <r><c>A</c><c>Cara Delevingne</c><c>X</c></r>
 <r><c>B</c><c>Delevingne Cara</c><c>X</c></r>
 <r><c>C</c><c>Cara Delevine</c><c>Y</c></r>
 <r><c>D</c><c>Angelina Jolie</c><c>Y</c></r>
 <r><c>E</c><c>Jolie, Angy</c><c>Y</c></r>
 <r><c>F</c><c>Sigourney Weaver</c><c>Z</c></r>
 <r><c>G</c><c>Weaver Sigounay&#32;</c><c>X</c></r>
</Rows>
</inlineTable>
<MultiJoin module='DefaultActions' idx='52' x='480' y='-48'>
  <Join pin='1' mainKey='Node' slaveKey='ID' prefix='Origin_'>
    </Join>
  <Join pin='1' mainKey='Group' slaveKey='ID' prefix='New_'>
    </Join>
</MultiJoin>
<ColumnRename module='DefaultActions' idx='54' x='612' y='-48'><before>
<c>Group</c><c>Node</c></before><after>
<c>New_ID</c><c>Original_ID</c></after>
<QuickRename ></QuickRename>
</ColumnRename>
<SelectColumns module='DefaultActions' idx='55' x='744' y='-48' keep='1'>
 <c>Original_ID</c>
 <c>New_ID</c>
 <c>Origin_Agency</c>
 <c>New_Agency</c>
 <c>Origin_Client_Name</c>
 <c>New_Client_Name</c>
</SelectColumns>
<writeExcel module='WriteReport' idx='58' x='912' y='-24' fileNameDestination=':/duplicateCorrectionsForAgencyY.xlsx' writeAllRows='0' testCol='New_Agency' testValue='Y' nrIncludeTitleRow='1' dateFormatExcel='4'/>
<writeExcel module='WriteReport' idx='59' x='936' y='0' fileNameDestination=':/duplicateCorrectionsForAgencyZ.xlsx' writeAllRows='0' testCol='New_Agency' testValue='Z' nrIncludeTitleRow='1' dateFormatExcel='4'/>
<FilterRows module='MathParser' idx='61' x='348' y='-48'>
 <Expression>Node !=  Group</Expression>
 <InputVars>
    <InputVar column='Node' isNumber='0'/>
    <InputVar column='Group' isNumber='0'/>
 </InputVars>
</FilterRows>
<inlineTable idx='62' x='684' y='120'>
<ColumnNames><c>emailFrom</c><c>emailTo</c><c>emailSubject</c><c>emailMessage</c><c>AttachmentIDs</c></ColumnNames>
<Rows>
 <r><c>frank@timi.eu</c><c>marieangelevan@yahoo.fr</c><c>Client Duplicate Corrections&#32;</c><c>&lt;html>Here are the Client Duplicate Corrections for Agency A</c><c>:/duplicateCorrectionsForAgencyX.xlsx</c></r>
 <r><c>frank@timi.eu</c><c>marieangelevan@yahoo.fr</c><c>Client Duplicate Corrections&#32;</c><c>&lt;html>Here are the Client Duplicate Corrections for Agency B</c><c>:/duplicateCorrectionsForAgencyY.xlsx</c></r>
 <r><c>frank@timi.eu</c><c>marieangelevan@yahoo.fr</c><c>Client Duplicate Corrections&#32;</c><c>&lt;html>Here are the Client Duplicate Corrections for Agency C</c><c>:/duplicateCorrectionsForAgencyZ.xlsx</c></r>
</Rows>
</inlineTable>
<SendEmail module='EMailSMS' idx='64' x='936' y='120' cc='' bcc='' p2AttachmentIDCol='AttachmentIDs' includeAllAttachments='0' encryption='2' port='587'>
</SendEmail>
<aggregate idx='65' x='804' y='156' inRAMAlgo='1'>
 <OneAggregate>
  <GroupBy>
   <v name='AttachmentIDs'/>
  </GroupBy>
  <OutputVars>
  </OutputVars>
 </OneAggregate>
</aggregate>
<RunToFinishLine idx='67' x='1080' y='-84' nPinIn='21'/>
<inlineTable keyHD='167449898418900316' forceHDBuffering='1' idx='71' x='348' y='96'>
<ColumnNames><c>ID</c><c>Client_Name</c><c>Agency</c><c>id</c></ColumnNames>
<Rows>
 <r><c>A</c><c>Cara Delevingne</c><c>X</c><c>1</c></r>
 <r><c>B</c><c>Delevingne Cara</c><c>X</c><c>2</c></r>
 <r><c>C</c><c>Cara Delevine</c><c>Y</c><c>3</c></r>
 <r><c>D</c><c>Angelina Jolie</c><c>Y</c><c>4</c></r>
 <r><c>E</c><c>Jolie, Angy</c><c>Y</c><c>5</c></r>
 <r><c>F</c><c>Sigourney Weaver</c><c>Z</c><c>6</c></r>
 <r><c>G</c><c>Weaver Sigounay&#32;</c><c>X</c><c>7</c></r>
</Rows>
</inlineTable>
</ACTIONS>
<CONNECTORS>
<Connection idxSrc='5' idxDest='7'/>
<Connection idxSrc='47' idxDest='5'/>
<Connection idxSrc='47' idxDest='5' idxPinIn='1'/>
<Connection idxSrc='7' idxDest='12'/>
<Connection idxSrc='12' idxDest='41'/>
<Connection idxSrc='54' idxDest='55'/>
<Connection idxSrc='40' idxDest='58'/>
<Connection idxSrc='58' idxDest='59'/>
<Connection idxSrc='52' idxDest='54'/>
<Connection idxSrc='41' idxDest='61'/>
<Connection idxSrc='61' idxDest='52'/>
<Connection idxSrc='55' idxDest='40'/>
<Connection idxSrc='62' idxDest='64'/>
<Connection idxSrc='62' idxDest='65'/>
<Connection idxSrc='64' idxDest='67' idxPinIn='17'/>
<Connection idxSrc='59' idxDest='67' idxPinIn='7'/>
<Connection idxSrc='71' idxDest='52' idxPinIn='1'/>
<Connection idxSrc='65' idxDest='64' idxPinIn='1'/>
</CONNECTORS>
<TEXTANNOTATIONS>
  <Annotation x='-180' y='48' size='14' anchor='5'>fuzzy search 
in full DB</Annotation>
  <Annotation x='-36' y='48' size='14' anchor='7'>flatten all 
solutions</Annotation>
  <Annotation x='204' y='48' size='14' anchor='41'>find connected 
groups</Annotation>
  <Annotation x='360' y='192' size='14' anchor='71'>client DB</Annotation>
  <Annotation x='-300' y='48' size='14' anchor='47'>client DB</Annotation>
  <Annotation x='744' y='48' size='14' anchor='55'>List of all the
Corrections</Annotation>
  <Annotation x='84' y='48' size='14' anchor='12'>Keep only
very similar
names</Annotation>
  <Annotation x='612' y='240' size='14'>Send Emails to All Agencies with the list of corrections</Annotation>
</TEXTANNOTATIONS>
<GROUPBOXANNOTATIONS>
  <Groupbox x='588' y='108' w='468' h='168' color='#afeeee'/>
</GROUPBOXANNOTATIONS>
</ANATELLA>
