lunedì 10 agosto 2015

Oracle:Linear regression through a SQL Query



I want to have a linear approximation directly in an SQL query.

To make it easy: having a generic function f, I want to approximate it with the function:


Being sure that I found the A and B which minimize the global error between f and f' (where f' is the linear approximation of f). 

Let's generate some data: 
 create table zzz_some_data as   
  SELECT    
    'X' groupid,   
    level x,    
    exp(level/10) + sin(level/10 *3.14) + dbms_random.value(0,1) y   
   FROM DUAL   
   CONNECT BY LEVEL <= 20   
 union all   
  SELECT    
    'Y' groupid,   
    level x,    
    exp(level/10) + Cos(level/10 *3.14) + dbms_random.value(0,1) y  
   FROM DUAL   
   CONNECT BY LEVEL <= 20;  

As you can see there is a random factor. At the bottom of the tutorial, you can find the exact data generated. 

Plotting these numbers the functions look like this: 



Running the following query, I can get the value of A and B for the two functions:

 WITH mean_estimates AS  
 (  SELECT GroupID  
      ,AVG(x) AS xmean  
      ,AVG(y) AS ymean  
      ,min(y) as first_seq  
      ,max(y) as last_seq  
   FROM zzz_some_data  
   GROUP BY GroupID  
 ),  
 stdev_estimates AS  
 (  SELECT pd.GroupID  
      ,CASE     
        WHEN SUM(POWER(x - xmean, 2)) = 0 THEN 1   
        when COUNT(*) = 1 then 10000000  
        ELSE SQRT(SUM(POWER(x - xmean, 2)) / (COUNT(*) - 1))   
       END AS xstdev  
      ,CASE  
        when count(*) = 1 then 10000000   
        else SQRT(SUM(POWER(y - ymean, 2)) / (COUNT(*) - 1))     
       end AS ystdev  
   FROM zzz_some_data pd  
   INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID  
   GROUP BY pd.GroupID, pm.xmean, pm.ymean  
 ),  
 standardized_data AS          -- increases numerical stability  
 (  SELECT pd.GroupID  
      ,(x - xmean) / xstdev                  AS xstd  
      ,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd  
   FROM zzz_some_data pd  
   INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID  
   INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID  
 ),  
 standardized_beta_estimates AS  
 (  SELECT GroupID  
      ,CASE WHEN SUM(xstd * xstd) = 0 THEN 0  
         ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END     AS betastd  
   FROM standardized_data pd  
   GROUP BY GroupID  
 ),   
 linear_model as   
 (  
 SELECT   
    pb.GroupID,     
    ymean - xmean * betastd * ystdev / xstdev AS BETA,   
    betastd * ystdev / xstdev         AS ALPHA,   
    first_seq,   
    last_seq  
 FROM standardized_beta_estimates pb  
 INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID  
 INNER JOIN mean_estimates pm ON pm.GroupID = pb.GroupID  
 )  
 select   
  GroupID,   
  alpha,   
  Beta  
 from linear_model  
 order by GroupID desc;  


This query generate the following output: 

 GROUPID   ALPHA    BETA  
 ------- ---------- ----------  
 Y    ,341603277 ,27926809   
 X    ,221725267 1,66190364   



Let's see now the results: 


Here the plot of the approximation of X:


Here the plot of the approximation of Y:








 CREATE TABLE "ZZZ_SOME_DATA"  
  (  
   "GROUPID" CHAR(1 BYTE),  
   "X"    NUMBER,  
   "Y"    NUMBER  
  );  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','1','1,61932317662496064883408392432113402523');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','2','2,77694913213341470441940320202765814948');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','3','2,91090102987016066482064043800309804071');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','4','3,13678897398740391101470575963142792902');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','5','3,31605237538417176477155850638842190865');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','6','3,13667075089211953539069065645455345878');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','7','3,47488861159608290867160313703997232386');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','8','3,42251149078741711848240969510483689103');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','9','3,70603465587716541507204559098228342981');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','10','3,49314419776404379007856528151793653642');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','11','3,24172315449369167895615169724235970383');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','12','2,96881260394832309096782780931487864967');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','13','3,07513520161764733445750922412894102533');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','14','3,81479688098856311171457385615269516843');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','15','4,32475565056326473828436305142365381565');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','16','4,85233459222270714560038164481370508461');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','17','5,61989845355259006300115338181868398089');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','18','6,31700004318003808435939347557223237057');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','19','6,76689263318576069778969371993663187878');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','20','7,82576523994149918219695417536412769495');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','1','2,53714598427679136859871205905510695503');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','2','2,3110817502573158903868891890087776345');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','3','1,94494243937675278618005281210530984129');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','4','2,25982980116506374957923410951751088925');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','5','1,86603427812480463944739874768807857366');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','6','2,05505938632236899572636979590661767446');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','7','2,3258999982985244735180126549118494278');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','8','1,83188925742276377791489099175111599133');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','9','2,41166969667358720655946060775919042819');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','10','1,8104604598253185583746180090281519982');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','11','2,87738150798966416515834453685956594055');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','12','3,43069799388754748665140656518089681805');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','13','3,54673238603787598200769925226968142282');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','14','4,43614682007291136740320048898865794443');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','15','4,87577441430844397983546298110499857896');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','16','6,00804958270558010522626027655493241713');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','17','6,3755271699704708032463599263767041073');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','18','7,12510967938699587673395044064905861434');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','19','8,17916759972655812255130821868093121265');  
 Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','20','9,11344975307640059368329002839752407251');  
 COMMIT;  

Nessun commento:

Posta un commento