Date: Mon, 15 Aug 2011 11:07:31 -0400
Reply-To: Arthur Tabachneck <art297@ROGERS.COM>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: Arthur Tabachneck <art297@ROGERS.COM>
Subject: Re: Unique values.
Robin,
It definitely wasn't the fastest, but I was still impressed with how quickly
proc tabulate produced the desired file.
Just as interesting, though, I was comparing how the various methods worked
with files of different sizes and what the results would be if the file were
indexed.
It was the latter that most captured my attention and I would like to find
out if others get similar results. With my test file, using an indexed
version of the source file, proc sql was the only one that appeared to gain
any benefit and the only other proc that indicated that it was using the
index was proc summary if one changed the class statement to be a by
statement. However, in that case, the resulting performance was dreadful.
The code I ran was:
options fullstimer=1 generic=0;
data source;
length key1 key2 8 id $4 var1 var2 8;
do key1=1 to 100;
do key2=1 to 100;
do _n_ = 1 to 1000;
id = put(_n_,z4.);
/* id=byte(64+_n_);*/
/* id=byte(64+key2);*/
var1=int(ranuni(1)*1000);
var2=int(ranuni(1)*1000);
output;
end;
end;
end;
run;
* set by and var variables for proc transpose ;
* these will be used later for post-processing ;
%let by=key1 key2;
%let var=var1 var2;
data _null_;
if 0 then set source(keep=id);
declare hash h(dataset:'source(keep=id)',ordered:'Y');
h.defineKey('id');
h.defineData('id');
h.defineDone();
h.output(dataset:'ID1');
stop;
run;
proc sql noprint;
create table id2 as select distinct id from source(keep=id);
quit;
proc summary data=source(keep=id) nway;
class id;
output out=id3(keep=id);
run;
proc sort nodupkey data=source(keep=id) out=id4;
by id;
run;
proc freq data=source(keep=id) noprint;
tables id / out=id5(keep=id);
run;
proc iml;
/* read source data */
use source(keep=id); read all var {id}; close source;
/* find unique values */
u = unique(id);
/* if necessary, write results */
create id6 var {u}; append; close id6;
quit;
ods output table=id7(keep=id);
ods listing close;
PROC TABULATE DATA=source(keep=id);
class id;
TABLE id, n ;
run;
ods listing;
data Isource (index=(id));
set source;
run;
data _null_;
if 0 then set Isource(keep=id);
declare hash h(dataset:'Isource(keep=id)',ordered:'Y');
h.defineKey('id');
h.defineData('id');
h.defineDone();
h.output(dataset:'iID1');
stop;
run;
proc sql noprint;
create table iid2 as select distinct id from Isource(keep=id);
quit;
proc summary data=Isource(keep=id) nway;
class id;
output out=iid3(keep=id);
run;
proc summary data=Isource(keep=id);
by id;
output out=iid3a(keep=id);
run;
proc sort nodupkey data=Isource(keep=id) out=iid4;
by id;
run;
proc freq data=Isource(keep=id) noprint;
tables id / out=iid5(keep=id);
run;
proc iml;
/* read source data */
use Isource(keep=id); read all var {id}; close Isource;
/* find unique values */
u = unique(id);
/* if necessary, write results */
create iid6 var {u}; append; close iid6;
quit;
ods output table=iid7(keep=id);
ods listing close;
PROC TABULATE DATA=Isource(keep=id);
class id;
TABLE id, n ;
run;
ods listing;
--------
On Mon, 15 Aug 2011 08:37:24 -0500, Robin R High <rhigh@UNMC.EDU> wrote:
>for what it is worth (sometimes a lot for other reasons and to collect
>summary stats in a dataset), another approach is:
>
>ods output table=id5(keep=id);
>ods listing close;
>PROC TABULATE DATA=source(keep=id);
>CLASS id;
>TABLE id, n ;
>run;
>ods listing;
>
>
>Robin High
>UNMC
>
>
>
>
>
>From:
>"Data _null_;" <iebupdte@GMAIL.COM>
>To:
>SAS-L@LISTSERV.UGA.EDU
>Date:
>08/13/2011 01:54 PM
>Subject:
>Unique values.
>Sent by:
>"SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
>
>
>
>Below are 4 methods that return a data set of unique values of a
>variable (ID). On my computer somewhat to my surprise PROC SUMMARY is
>fastest, by a few seconds over the next fastest PROC FREQ. If anyone
>cares to test please do. If you have another method please add it.
>These four that I could think of.
>
> options fullstimer=1 generic=0;
>data source;
> length key1 key2 8 id $4 var1 var2 8;
> do key1=1 to 1e4;
> do key2=1 to 1e1;
> do _n_ = 1 to 999;
> id = put(_n_,z3.);
>/* id=byte(64+_n_);*/
>/* id=byte(64+key2);*/
> var1=int(ranuni(1)*1000);
> var2=int(ranuni(1)*1000);
> output;
> end;
> end;
> end;
> run;
>* set by and var variables for proc transpose ;
>* these will be used later for post-processing ;
>
>%let by=key1 key2;
>%let var=var1 var2;
>
>data _null_;
> if 0 then set source(keep=id);
> declare hash h(dataset:'source(keep=id)',ordered:'Y');
> h.defineKey('id');
> h.defineData('id');
> h.defineDone();
> h.output(dataset:'ID1');
> stop;
> run;
>proc sql noprint;
> create table id2 as select distinct id from source(keep=id);
> quit;
> run;
>proc summary data=source(keep=id) nway;
> class id;
> output out=id3(keep=id);
> run;
>proc sort nodupkey data=source(keep=id) out=id4;
> by id;
> run;
>proc freq data=source(keep=id) noprint;
> tables id / out=id(keep=id);
> run;