更新时间:2023-02-26 18:17:42
此纯sql代码在5分钟内(约8核/ 32 gb ram)将大约35000条记录分组。
This pure sql code grouped about 35000 records in 5 minutes (8 cores/32 gb ram). Enjoy.
--table with RELATIONS, idea is to place every related item in a bucket
create table RELATIONS
(
bucket int, -- initially 0
bucketsub int, -- initially 0
relnr1 float,
relnr2 float -- relnr1 = a, relnr2 = b means a and b are related
)
create table ##BucketRelnrs ( relnr float ); --table functions as temp list
declare @bucket int;
declare @bucketsub int;
declare @nrOfUpdates int;
declare @relnr float;
set @bucket=0;
set @relnr=0;
WHILE @relnr>=0 and @bucket<50000 --to prevent the while loop from hanging.
BEGIN
set @bucket = @bucket+1
set @bucketsub=1;
set @relnr = (select isnull(min(relnr1),-1) from RELATIONS where bucket=0); --pick the smallest relnr that has not been assigned a bucket yet
set @nrOfUpdates = (select count(*) from RELATIONS where bucket=0 and (relnr1=@relnr or relnr2=@relnr));
update RELATIONS set bucket=@bucket, bucketsub=@bucketsub where bucket=0 and (relnr1=@relnr or relnr2=@relnr);
set @bucketsub = @bucketsub+1;
WHILE @nrOfUpdates>0 and @bucketsub<=10 --to prevent the inner while loop from hanging, actually determines the number of iterations
BEGIN
--refill temp list with newly found related relnrs
truncate table ##BucketRelnrs;
insert into ##BucketRelnrs
select distinct relnr1 from RELATIONS where bucket=@bucket
union select distinct relnr2 from RELATIONS where bucket=@bucket;
--calculate the number of relations that will be updated next, if zero then stop iteration
set @nrOfUpdates =
(
select count(*) from RELATIONS where bucket=0
and (relnr1 in (select relnr from ##BucketRelnrs) or relnr2 in (select relnr from ##BucketRelnrs))
);
--update the RELATIONS table
update RELATIONS set bucket=@bucket, bucketsub=@bucketsub where bucket=0
and (relnr1 in (select relnr from ##BucketRelnrs) or relnr2 in (select relnr from ##BucketRelnrs));
set @bucketsub = @bucketsub+1;
END
END
drop table ##BucketRelnrs; --clean temp table