/* null dataset for testing SAS' Levenstein-distance function */; data _null_; /* create two version of the same name */; originalName = "Lincoln"; modifiedName = "Linclon"; /* compute Levenstein distance between names */; distanceBetweenNames = complev(originalName,modifiedName); /* output Levenstein distance between names to the log window */; put distanceBetweenNames=; run; /* macro to create dataset matchedDS from datasets ds1 and ds2, where the following conditions hold: (1) the datasets ds1 and ds2 both contain the columns IDNO, NAMEFRST, NAMELAST, and BIRTHDAY; (2) each record in ds1 and ds2 contains a unique value of IDNO, and these values can be compared as either numbers or strings using the less-than operator; (3) the columns NAMEFRST and NAMELAST in ds1 and ds2 have been cleaned so that their values only contain alphabet characters, hyphens, and spaces; and (4) the BIRTHDAY columns in ds1 and ds2 are formatted using the same convention. Under these assumptions, the dataset matchesDS will be created (or overwritten) with all tuples of possible matching records from datasets ds1 and ds2 */; %macro matchByNameAndDOB(ds1,ds2,matchesDS); proc sql; /* create a new dataset called &matchesDS. */; create table &matchesDS. as /* use column names as specified below, where columns are renamed to prevent naming conflicts */; select A.IDNO as ID1, A.NAMEFRST as FNAME1, A.NAMELAST as LNAME1, A.BIRTHDAY as DOB1, B.IDNO as ID2, B.NAMEFRST as FNAME2, B.NAMELAST as LNAME2, B.BIRTHDAY as DOB2, /* create a new column containing the Levenstein distance between pairs of upcased and concatenated firstname-lastname combinations, with an upper limit of distance 3 */; complev(cats(compress(upcase(FNAME1),'- '),compress(upcase(LNAME1),'- ')), cats(compress(upcase(FNAME2),'- '),compress(upcase(LNAME2),'- ')),3) as LEVdistance /* the datasetcalled &matchesDS. should consist of all possible tuples of records from ds1 and ds2 */; from &ds1. as A, &ds2. as B /* where the conditions given below are satisfied and the results are sorted as specified */; where ID1 < ID2 and calculated LEVdistance < 3 and not(missing(FNAME1)) and not(missing(FNAME2)) and not(missing(LNAME1)) and not(missing(LNAME2)) and not(missing(DOB1)) and not(missing(DOB2)) and DOB1 = DOB2 order by ID1, ID2; quit; %mend;