Date: Wed, 19 May 2004 16:40:07 -0400
Reply-To: Sigurd Hermansen <HERMANS1@WESTAT.COM>
Sender: "SAS(r) Discussion" <SAS-L@LISTSERV.UGA.EDU>
From: Sigurd Hermansen <HERMANS1@WESTAT.COM>
Subject: Re: Sliding window self-join
Content-Type: text/plain
Bruce:
This query generates identical counts and, as the row counts increase, works
a lot faster:
* Mergetest.sas;
* generate test data file;
data intbl;
do Person = 1 to 300; /* about 2E6 in actual data */
do Date = 1 to 450; /* average in real data - varies between people */
Var = Person*1000+Date;
output;
end;
end;
run;
* generate expanded file;
Proc sql feedback;
create table newtblx as
select p1.person, p1.date ,
sum( p2.date<p1.date) as Nprior
from intbl as p1 INNER JOIN
intbl as p2
on p1.person = p2.person
where (p2.date - p1.date) between -26 and 26
group by p1.person, p1.date
;
Quit ;
Proc sql feedback;
create table newtbly as
select p1.person, p1.date , count(*)-1 as Nprior
from intbl as p1 INNER JOIN
intbl as p2
on p1.person = p2.person
where (p2.date - p1.date) between -26 and 26
and p2.date<=p1.date
group by p1.person, p1.date
;
Quit ;
proc compare data=newtblx compare=newtbly;
run; quit;
It seems to me that pivoting person groups into indexed arrays would give
you something close to the optimal solution. Whether it's worth it depends
on how you plan to use the program.
Sig
-----Original Message-----
From: Bruce Bradbury [mailto:BruceBrad@INAME.COM]
Sent: Wednesday, May 19, 2004 3:28 AM
To: SAS-L@LISTSERV.UGA.EDU
Subject: Re: Sliding window self-join
I've used Roy's SQL syntax to write a test program. After doing my first
version, I realised just how large the output file in my real application
was going to be. Since I was planning to summarise the data anyway, I
thought that the best way would be to do it within the SQL statement - hence
the program below.
I've been testing this and I've noticed that the SQL statement seems to be
creating a working file that is about as large as the file would have been
if I hadn't used 'group by'. Is this normal? It seems to me that it
shouldn't be necessary, as all the calculations for one person should fit in
memory.
* Mergetest.sas;
* generate test data file;
data intbl;
do Person = 1 to 100; /* about 2E6 in actual data */
do Date = 1 to 150; /* average in real data - varies between people */
Var = Person*1000+Date;
output;
end;
end;
run;
* generate expanded file;
Proc sql feedback;
create table newtbl as
select p1.person, p1.date ,
sum( p2.date<p1.date) as Nprior
from intbl as p1 INNER JOIN
intbl as p2
on p1.person = p2.person
where (p2.date - p1.date) between -26 and 26
group by p1.person, p1.date
;
Quit ;