| Date: | Fri, 20 Aug 2004 14:17:56 -0400 |
| Reply-To: | Richard Ristow <wrristow@MINDSPRING.COM> |
| Sender: | "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU> |
| From: | Richard Ristow <wrristow@MINDSPRING.COM> |
| Subject: | Re: group by first name or last name |
| In-Reply-To: | <f5738cf8.0408170750.29b91fb6@posting.google.com> |
| Content-Type: | text/plain; charset="us-ascii"; format=flowed |
|---|
At 11:50 AM 8/17/2004, Art wrote:
>How to group people by their first name OR last name,
>
>1 John Smith
>2 Goerge Smith
>3 Bill Clinton
>4 George Bush
>
>I need to put 1,2,4 into to one group. Is there any way I can do this
>in SAS or SQL?
I just worked this up over on the SPSS side, where the absence of SQL,
and of macros that can test for a result and exit, was a *MAJOR* pain
(and required some compromises). I don't have a working copy of SAS
here, so consider this pseudocode. Particular apologies for any "."
instead of ";" as a statement closure: I've been writing SPSS recently,
and have got in the habit of ".". (And this is old-school SAS:
essentially, PROC SQL is used only when sortin and a DATA step won't do
the job.)
/* Assume input is INPUT.NAMELIST, and contains three */
/* variables: NAME_NUM, FRSTNAME, LASTNAME, where */
/* NAME_NUM is a numeric value that uniquely */
/* identifies a name. */
/* (If the original input does not contain NAME_NUM as */
/* described, it can easily be added in a DATA step.) */
/* I. Identify "direct pairs", i.e. pairs of names */
/* sharing a first name, or sharing a last name. */
/* If a name is paired with more than one other, */
/* pair it with the lowest-numbered possible */
/* other. */
/* I.A. Direct pairs by match of first names */
PROC SORT DATA=INPUT.NAMELIST OUT=BY_FIRST;
BY FRSTNAME NAME_NUM;
RUN;
DATA PAIRFRST(KEEP=LO_NAME HI_NAME);
SET BY_FIRST;
BY FRSTNAME NAME_NUM;
RETAIN LO_NAME;
IF FIRST.FRSTNAME
THEN LO_NAME = NAME_NUM;
ELSE DO;
HI_NAME = NAME_NUM;
OUTPUT;
END;
RUN;
/* I.B. Ditto, by match of last names */
PROC SORT DATA=INPUT.NAMELIST OUT=BY_LAST;
BY LASTNAME NAME_NUM;
RUN;
DATA PAIRLAST(KEEP=LO_NAME HI_NAME);
SET BY_LAST;
BY LASTNAME NAME_NUM;
RETAIN LO_NAME;
IF FIRST.LASTNAME
THEN LO_NAME = NAME_NUM;
ELSE DO;
HI_NAME = NAME_NUM;
OUTPUT;
END;
RUN;
/* I.C. Combine the two, into a single "direct pairs" */
/* dataset. (A name may now not be paired with */
/* the lowest name it matches, but that doesn't. */
/* matter. However, keep only one instance of */
/* each pair identified.) */
DATA DIRECT;
SET PAIRFRST PAIRLAST;
BY LO_NAME HI_NAME.
IF FIRST.HI_NAME THEN OUTPUT;
RUN;
/* II. Transitive closure: If two names are paired */
/* with the same name, they are paired with */
/* each other. */
/* (This is where PROC SQL, and macros to test */
/* output results, are crucial.) */
/* II.A. Initialize the output dataset */
DATA ALL_PAIR;
SET DIRECT;
RUN;
/* II.B. Macro to add pairs implied by known pairs. */
/* (This is commonly a many-to-many join.) */
%macro Nxt_Step(dpt);
%* Macro variable NEWNUMB must be local to the caller;
PROC SQL;
CREATE TABLE NEW_PAIR
AS SELECT
all.LO_NAME as LO_NAME,
drc.HI_NAME as HI_NAME,
FROM
ALL_PAIR as all,
DIRECT as drc
WHERE
all.HI_NAME
=drc.LO_NAME
ORDER BY
LO_NAME, HI_NAME
;
RUN;
QUIT;
DATA ALL_PAIR;
SET ALL_PAIR NEW_PAIR;
BY LO_NAME HI_NAME;
RETAIN N_PAIRS 0;
IF FIRST.HI_NAME
THEN DO;
N_PAIRS = N_PAIRS + 1;
CALL SYMPUT ('NEWNUMB',PUT(N_PAIRS,5.));
OUTPUT;
END;
RUN;
%MEND Nxt_Step;
/* II.C. Loop, until all pairs have been found */
/* This is a looping macro, adding one level of pairs */
/* at each pass */
%macro TRANSIT;
%local depth OLDNUMB NEWNUMB;
%LET OLDNUMB = 0;
%do depth=2 %to 20;
%Nxt_Step
%put &NEWNUMB pairs in chains through length &depth ;
%if &NEWNUMB LE &OLDNUMB %then %goto End_OK;
%let OLDNUMB = &NEWNUMB;
%End;
%put UNSUCCESSFUL -- chain length of 20 insufficient;
%goto Exit_Mac;
%End_OK: %put This completes the search;
%Exit_Mac:
%mend TRANSIT;
%TRANSIT;
/* II.D. Pair each name with the lowest possible name */
PROC SORT DATA=ALL_PAIR OUT=BY_HI;
BY HI_NAME LO_NAME;
RUN;
DATA PAIRINGS;
SET BY_HI;
BY HI_NAME LO_NAME;
IF FIRST.HI_NAME THEN OUTPUT;
RUN;
/* III. In one dataset, the lowest numbers (which */
/* identify the group of names), the old name */
/* number, and the name itself. */
PROC SORT DATA=INPUT.NAMELIST OUT=BY_NUM;
BY NAME_NUM;
RUN;
DATA GROUPS;
MERGE PAIRINGS
(RENAME=(LO_NAME=GRP_NUM HI_NAME= NAME_NUM))
BY_NUM;
BY NAME_NUM;
/* Ungrouped names are 'grouped' with themselves */
IF GRP_NUM = . THEN GRP_NUM = NAME_NUM;
RUN;
|