Date: Thu, 21 Dec 2006 01:55:32 -0500
Reply-To: "Richard A. DeVenezia" <rdevenezia@WILDBLUE.NET>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: "Richard A. DeVenezia" <rdevenezia@WILDBLUE.NET>
Subject: Re: Hashing in exchange for Cartesian matching
"Gerstle, John (CDC/CCID/NCHHSTP) (CTR)" wrote:
> SAS 9.1.3, WinXP, ~500Mb RAM
>
> Have 1.3 million records, with 1.1 million unique cases, each with a
> UID value. Would like to test several code variations against the
> UIDs, looking at concordance rates. The variations revolve around
> changing the suffix of the code to see if we're over or under
> counting cases.
Are the following data and queries representative of the problem?
data field;
do uid = 1 to 1000;
do _n_ = 1 by 1 until (ranuni(123) > .2/1.3);
if _n_ = 1 then
if ranuni(123) < .01 then
code = ceil(ranuni(123)*(uid-1));
else
code = uid;
else
if ranuni(123) < 1/7 then
code = ceil(ranuni(123)*(uid-1));
else
code = uid;
output;
end;
end;
run;
proc sql;
create table uid_duplicates as
select *
from field
group by uid
having count(*) > 1
;
create table uid_duplicates_multi_coded as
select *
from field
group by uid
having count(distinct code) > 1
;
create table uid_duplicates_single_coded as
select *
from field
group by uid
having count(distinct code) = 1 and count(*) > 1
;
create table code_duplicates as
select *
from field
group by code
having count(*) > 1
;
create table code_duplicates_multi_uided as
select *
from field
group by code
having count(distinct uid) > 1
;
/* this table should be the same as uid_duplicates_single_coded */
create table code_duplicates_single_uided as
select *
from field
group by code
having count(distinct uid) = 1 and count(*) > 1
;
quit;
--
Richard A. DeVenezia
http://www.devenezia.com/