I want to have a linear approximation directly in an SQL query.
To make it easy: having a generic function f, I want to approximate it with the function:
Being sure that I found the A and B which minimize the global error between f and f' (where f' is the linear approximation of f).
Let's generate some data:
create table zzz_some_data as
SELECT
'X' groupid,
level x,
exp(level/10) + sin(level/10 *3.14) + dbms_random.value(0,1) y
FROM DUAL
CONNECT BY LEVEL <= 20
union all
SELECT
'Y' groupid,
level x,
exp(level/10) + Cos(level/10 *3.14) + dbms_random.value(0,1) y
FROM DUAL
CONNECT BY LEVEL <= 20;
As you can see there is a random factor. At the bottom of the tutorial, you can find the exact data generated.
Plotting these numbers the functions look like this:
Running the following query, I can get the value of A and B for the two functions:
WITH mean_estimates AS
( SELECT GroupID
,AVG(x) AS xmean
,AVG(y) AS ymean
,min(y) as first_seq
,max(y) as last_seq
FROM zzz_some_data
GROUP BY GroupID
),
stdev_estimates AS
( SELECT pd.GroupID
,CASE
WHEN SUM(POWER(x - xmean, 2)) = 0 THEN 1
when COUNT(*) = 1 then 10000000
ELSE SQRT(SUM(POWER(x - xmean, 2)) / (COUNT(*) - 1))
END AS xstdev
,CASE
when count(*) = 1 then 10000000
else SQRT(SUM(POWER(y - ymean, 2)) / (COUNT(*) - 1))
end AS ystdev
FROM zzz_some_data pd
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
GROUP BY pd.GroupID, pm.xmean, pm.ymean
),
standardized_data AS -- increases numerical stability
( SELECT pd.GroupID
,(x - xmean) / xstdev AS xstd
,CASE ystdev WHEN 0 THEN 0 ELSE (y - ymean) / ystdev END AS ystd
FROM zzz_some_data pd
INNER JOIN stdev_estimates ps ON ps.GroupID = pd.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pd.GroupID
),
standardized_beta_estimates AS
( SELECT GroupID
,CASE WHEN SUM(xstd * xstd) = 0 THEN 0
ELSE SUM(xstd * ystd) / (COUNT(*) - 1) END AS betastd
FROM standardized_data pd
GROUP BY GroupID
),
linear_model as
(
SELECT
pb.GroupID,
ymean - xmean * betastd * ystdev / xstdev AS BETA,
betastd * ystdev / xstdev AS ALPHA,
first_seq,
last_seq
FROM standardized_beta_estimates pb
INNER JOIN stdev_estimates ps ON ps.GroupID = pb.GroupID
INNER JOIN mean_estimates pm ON pm.GroupID = pb.GroupID
)
select
GroupID,
alpha,
Beta
from linear_model
order by GroupID desc;
This query generate the following output:
GROUPID ALPHA BETA
------- ---------- ----------
Y ,341603277 ,27926809
X ,221725267 1,66190364
Let's see now the results:
Here the plot of the approximation of X:
Here the plot of the approximation of Y:
CREATE TABLE "ZZZ_SOME_DATA"
(
"GROUPID" CHAR(1 BYTE),
"X" NUMBER,
"Y" NUMBER
);
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','1','1,61932317662496064883408392432113402523');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','2','2,77694913213341470441940320202765814948');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','3','2,91090102987016066482064043800309804071');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','4','3,13678897398740391101470575963142792902');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','5','3,31605237538417176477155850638842190865');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','6','3,13667075089211953539069065645455345878');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','7','3,47488861159608290867160313703997232386');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','8','3,42251149078741711848240969510483689103');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','9','3,70603465587716541507204559098228342981');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','10','3,49314419776404379007856528151793653642');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','11','3,24172315449369167895615169724235970383');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','12','2,96881260394832309096782780931487864967');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','13','3,07513520161764733445750922412894102533');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','14','3,81479688098856311171457385615269516843');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','15','4,32475565056326473828436305142365381565');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','16','4,85233459222270714560038164481370508461');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','17','5,61989845355259006300115338181868398089');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','18','6,31700004318003808435939347557223237057');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','19','6,76689263318576069778969371993663187878');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('X','20','7,82576523994149918219695417536412769495');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','1','2,53714598427679136859871205905510695503');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','2','2,3110817502573158903868891890087776345');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','3','1,94494243937675278618005281210530984129');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','4','2,25982980116506374957923410951751088925');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','5','1,86603427812480463944739874768807857366');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','6','2,05505938632236899572636979590661767446');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','7','2,3258999982985244735180126549118494278');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','8','1,83188925742276377791489099175111599133');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','9','2,41166969667358720655946060775919042819');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','10','1,8104604598253185583746180090281519982');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','11','2,87738150798966416515834453685956594055');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','12','3,43069799388754748665140656518089681805');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','13','3,54673238603787598200769925226968142282');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','14','4,43614682007291136740320048898865794443');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','15','4,87577441430844397983546298110499857896');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','16','6,00804958270558010522626027655493241713');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','17','6,3755271699704708032463599263767041073');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','18','7,12510967938699587673395044064905861434');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','19','8,17916759972655812255130821868093121265');
Insert into ZZZ_SOME_DATA (GROUPID,X,Y) values ('Y','20','9,11344975307640059368329002839752407251');
COMMIT;