This shows how rpart
decides on splits.
Load the example dataset:
library(sp)
data(meuse)
names(meuse)
## [1] "x" "y" "cadmium" "copper" "lead" "zinc" "elev"
## [8] "dist" "om" "ffreq" "soil" "lime" "landuse" "dist.m"
We want to predict Zn from distance to river and elevation.
We first try to split on distance. The idea is to find the cutpoint at which the residual sum of squares (within-group) is lowest, i.e., the between-group is highest. So, we sort the values and try them all.
We first find the total sum of squares (TSS), i.e., with no model:
(tss <- sum((meuse$zinc - mean(meuse$zinc))^2))
## [1] 20750448
Now try all thresholds; keep the results in a dataframe
(distances <- sort(unique(meuse$dist.m)))
## [1] 10 20 30 40 50 60 70 80 100 110 120 130 140 150
## [15] 160 170 190 200 210 220 240 260 270 280 290 300 310 320
## [29] 330 340 350 360 370 380 390 400 410 420 430 440 450 460
## [43] 470 480 490 500 520 530 540 550 560 570 630 650 660 680
## [57] 690 710 720 750 760 860 1000
(nd <- length(distances))
## [1] 63
results.df <- data.frame(distance=distances, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <- meuse$zinc[meuse$dist.m < distances[i]]
branch.more <- meuse$zinc[meuse$dist.m >= distances[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
print(results.df)
## distance rss.less rss.more rss r.squared
## 1 10 0 2.075045e+07 20750448 0.000000000
## 2 20 1832626 1.483439e+07 16667016 0.196787655
## 3 30 2316309 1.140827e+07 13724576 0.338588909
## 4 40 2333315 1.083309e+07 13166408 0.365487991
## 5 50 2723383 1.083307e+07 13556453 0.346691036
## 6 60 3174655 8.722037e+06 11896692 0.426677814
## 7 70 3236145 8.277919e+06 11514063 0.445117348
## 8 80 6067468 5.433085e+06 11500553 0.445768412
## 9 100 6395320 4.686610e+06 11081930 0.465942608
## 10 110 6692027 3.806981e+06 10499009 0.494034598
## 11 120 6698102 3.566876e+06 10264978 0.505312936
## 12 130 6979739 3.225653e+06 10205392 0.508184486
## 13 140 7127795 3.030296e+06 10158091 0.510464027
## 14 150 7477245 2.876879e+06 10354123 0.501016862
## 15 160 8372655 2.677065e+06 11049720 0.467494875
## 16 170 9071302 2.669654e+06 11740956 0.434183022
## 17 190 9370854 2.629214e+06 12000069 0.421695916
## 18 200 9611896 2.629090e+06 12240986 0.410085694
## 19 210 9714370 2.198424e+06 11912794 0.425901826
## 20 220 9744329 2.102696e+06 11847025 0.429071353
## 21 240 10077872 2.023965e+06 12101837 0.416791517
## 22 260 10687674 1.993349e+06 12681023 0.388879539
## 23 270 11171707 1.870105e+06 13041812 0.371492482
## 24 280 11517144 1.103623e+06 12620767 0.391783369
## 25 290 11644762 1.093727e+06 12738489 0.386110140
## 26 300 12289870 1.085278e+06 13375148 0.355428464
## 27 310 12463583 1.084929e+06 13548511 0.347073775
## 28 320 12752996 1.032326e+06 13785321 0.335661490
## 29 330 13436297 7.633569e+05 14199653 0.315694112
## 30 340 13594447 7.630010e+05 14357448 0.308089713
## 31 350 13917643 7.486246e+05 14666268 0.293207152
## 32 360 13998117 7.335915e+05 14731709 0.290053443
## 33 370 14272616 6.254629e+05 14898079 0.282035779
## 34 380 14453002 6.249539e+05 15077956 0.273367184
## 35 390 14927480 6.186454e+05 15546126 0.250805283
## 36 400 15304443 6.112366e+05 15915680 0.232995820
## 37 410 15770533 6.093446e+05 16379878 0.210625331
## 38 420 16095849 6.035239e+05 16699373 0.195228278
## 39 430 16382791 6.019859e+05 16984777 0.181474176
## 40 440 16455204 5.957830e+05 17050987 0.178283413
## 41 450 16693074 5.957748e+05 17288849 0.166820414
## 42 460 17004729 5.561872e+05 17560916 0.153709048
## 43 470 17256084 5.533319e+05 17809415 0.141733427
## 44 480 17424991 5.486696e+05 17973661 0.133818174
## 45 490 17598302 5.221048e+05 18120407 0.126746210
## 46 500 17709527 5.220175e+05 18231545 0.121390275
## 47 520 18041346 5.212990e+05 18562645 0.105433999
## 48 530 18042140 4.369718e+05 18479111 0.109459623
## 49 540 18254111 4.369278e+05 18691038 0.099246489
## 50 550 18623506 4.314852e+05 19054991 0.081706970
## 51 560 18862636 4.236141e+05 19286250 0.070562221
## 52 570 19011345 4.149336e+05 19426279 0.063813972
## 53 630 19097868 4.148858e+05 19512754 0.059646599
## 54 650 19376465 4.120889e+05 19788554 0.046355319
## 55 660 19556095 4.069047e+05 19963000 0.037948471
## 56 680 19599209 4.031925e+05 20002402 0.036049630
## 57 690 19851207 3.847440e+05 20235951 0.024794481
## 58 710 19863012 3.635260e+05 20226538 0.025248128
## 59 720 19930817 3.635234e+05 20294340 0.021980591
## 60 750 20054249 3.538015e+05 20408050 0.016500717
## 61 760 20261299 5.327500e+02 20261832 0.023547250
## 62 860 20373355 8.866667e+01 20373443 0.018168477
## 63 1000 20625231 0.000000e+00 20625231 0.006034401
(ix.r.squared.max <- which.max(results.df$r.squared))
## [1] 13
print(results.df[ix.r.squared.max,])
## distance rss.less rss.more rss r.squared
## 13 140 7127795 3030296 10158091 0.510464
(distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])
## [1] 0.510464
(d.threshold.1 <- results.df[ix.r.squared.max,"distance"])
## [1] 140
plot(r.squared ~ distance, data=results.df, type="h",
col=ifelse(distance==d.threshold.1,"red","gray"))
The cutoff for distance should be 140 meters. This would explain 51% of the variation in Zn concentration.
But we also need to try the other possible splitting variable:
(elevations <- sort(unique(meuse$elev)))
## [1] 5.180 5.700 5.760 5.800 6.160 6.280 6.320 6.340 6.360 6.420
## [11] 6.480 6.560 6.680 6.740 6.860 6.900 6.920 6.983 6.986 7.000
## [21] 7.020 7.100 7.160 7.200 7.220 7.300 7.307 7.320 7.360 7.400
## [31] 7.440 7.480 7.516 7.536 7.540 7.552 7.564 7.610 7.633 7.640
## [41] 7.653 7.655 7.680 7.702 7.706 7.720 7.740 7.756 7.760 7.780
## [51] 7.784 7.791 7.792 7.800 7.809 7.820 7.822 7.860 7.900 7.909
## [61] 7.932 7.940 7.951 7.971 7.980 8.060 8.070 8.121 8.128 8.176
## [71] 8.180 8.217 8.226 8.231 8.261 8.292 8.328 8.351 8.410 8.429
## [81] 8.463 8.468 8.470 8.472 8.490 8.504 8.507 8.536 8.538 8.540
## [91] 8.575 8.577 8.659 8.668 8.688 8.694 8.704 8.727 8.741 8.743
## [101] 8.760 8.779 8.809 8.815 8.830 8.840 8.867 8.908 8.937 8.973
## [111] 8.976 8.990 9.015 9.020 9.036 9.041 9.043 9.049 9.073 9.095
## [121] 9.097 9.155 9.206 9.280 9.404 9.406 9.420 9.518 9.523 9.528
## [131] 9.555 9.573 9.604 9.634 9.691 9.717 9.720 9.732 9.811 9.879
## [141] 9.924 9.970 10.080 10.136 10.320 10.520
(nd <- length(elevations))
## [1] 146
results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <- meuse$zinc[meuse$elev < elevations[i]]
branch.more <- meuse$zinc[meuse$elev >= elevations[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
# print(results.df)
(ix.r.squared.max <- which.max(results.df$r.squared))
## [1] 32
print(results.df[ix.r.squared.max,])
## elevation rss.less rss.more rss r.squared
## 32 7.48 3362676 10176189 13538864 0.3475387
(elevation.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])
## [1] 0.3475387
(e.threshold.1 <- results.df[ix.r.squared.max,"elevation"])
## [1] 7.48
plot(r.squared ~ elevation, data=results.df, type="h",
col=ifelse(elevation==e.threshold.1,"red","gray"))
The cutoff for elevation should be 7.48 m.a.s.l. This would explain 34.8% of the variation in Zn concentration.
Clearly the distance gives a better split (\(R^2\) = 0.51) than elevation (\(R^2\) = 0.348).
We now split the dataset into those two groups, but only if the increase in \(R^2\) is more than a user-specified threshold. Here there is a very large increase in \(R^2\) (the unsplit \(R^2\) is by definition 0), so we split.
meuse.split1.less <- meuse[meuse$dist.m < d.threshold.1,]
meuse.split1.more <- meuse[meuse$dist.m >= d.threshold.1,]
dim(meuse.split1.less)[1]; dim(meuse.split1.more)[1]
## [1] 51
## [1] 104
We see that the first group (closer to river) has fewer points.
Now rpart
does the same procedure for each group separately; the improvement in \(R^2\) of the finer split is compared to that from the first split.
We try both distance and elevation on the left-hand group.
(distances <- sort(unique(meuse.split1.less$dist.m)))
## [1] 10 20 30 40 50 60 70 80 100 110 120 130
nd <- length(distances)
All of these distances are less then the threshold use for the first split.
We now see how much further splitting on distance improves the fit.
results.df <- data.frame(distance=distances, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <- meuse.split1.less$zinc[meuse.split1.less$dist.m < distances[i]]
branch.more <- meuse.split1.less$zinc[meuse.split1.less$dist.m >= distances[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
print(results.df)
## distance rss.less rss.more rss r.squared
## 1 10 0 7127795.0 7127795 0.6564992
## 2 20 1832626 4830846.3 6663472 0.6788757
## 3 30 2316309 3845385.5 6161695 0.7030573
## 4 40 2333315 3668820.8 6002136 0.7107467
## 5 50 2723383 3550061.7 6273445 0.6976718
## 6 60 3174655 2507435.8 5682091 0.7261702
## 7 70 3236145 2455665.0 5691810 0.7257018
## 8 80 6067468 446060.0 6513528 0.6861018
## 9 100 6395320 417804.0 6813124 0.6716638
## 10 110 6692027 118938.8 6810966 0.6717678
## 11 120 6698102 85938.0 6784040 0.6730654
## 12 130 6979739 14792.0 6994531 0.6629214
ix.r.squared.max <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max,])
## distance rss.less rss.more rss r.squared
## 6 60 3174655 2507436 5682091 0.7261702
distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"]
d.threshold.1.1 <- results.df[ix.r.squared.max,"distance"]
plot(r.squared ~ distance, data=results.df, type="h",
col=ifelse(distance==d.threshold.1.1,"red","gray"))
The cutoff for distance should be 60 meters; however we see it is very close to the next-further distance. This explains 72.62% of the variation in this group.
But we also need to try the other possible splitting variable, elevation.
(elevations <- sort(unique(meuse.split1.less$elev)))
## [1] 5.180 5.700 5.760 6.280 6.340 6.420 6.480 6.680 6.860 6.920 6.983
## [12] 7.000 7.020 7.100 7.160 7.200 7.220 7.300 7.307 7.320 7.360 7.400
## [23] 7.440 7.536 7.540 7.552 7.680 7.702 7.706 7.740 7.784 7.900 7.909
## [34] 7.940 7.980 8.060 8.231 8.261 8.463 8.470 8.490 8.538 8.540 8.704
## [45] 8.779 8.809 8.815 8.908 9.518
nd <- length(elevations)
Although we did not split on elevation, the set of elevations at this closer distance is smaller than the set for all the points.
results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <- meuse.split1.less$zinc[meuse.split1.less$elev < elevations[i]]
branch.more <- meuse.split1.less$zinc[meuse.split1.less$elev >= elevations[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
# print(results.df)
ix.r.squared.max.e <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max.e,])
## elevation rss.less rss.more rss r.squared
## 37 8.231 4351389 498522.8 4849912 0.7662744
elevation.r.squared.max <- results.df[ix.r.squared.max.e,"r.squared"]
e.threshold.1.1 <- results.df[ix.r.squared.max.e,"elevation"]
plot(r.squared ~ elevation, data=results.df, type="h",
col=ifelse(elevation==e.threshold.1.1,"red","gray"))
The cutoff for elevation should be 8.231 meters; however we see it is very close to another similar elevation. This explains 76.63% of the variation in this group.
The two improvements are close, but now splitting this branch on elevation (\(R^2\) = 0.766) gives a better \(R^2\) within this branch than splitting on distance (\(R^2\) = 0.726).
The \(R^2\) reported in the previous code is only for this group; to decide if the split is enough improvement in the overall model we need to compute the \(R^2\) of the proposed split over all the (new) groups, and compare with the \(R^2\) from the first split we’ve already made.
rss.right <- sum((meuse.split1.more$zinc - mean(meuse.split1.more$zinc))^2)
rss.left <- sum((meuse.split1.less$zinc - mean(meuse.split1.less$zinc))^2)
(r2.1 <- 1 - ((rss.right + rss.left)/tss))
## [1] 0.510464
rss.left.split <- sum(results.df[ix.r.squared.max.e,c("rss.less","rss.more")])
(r2.2 <- 1 - ((rss.right + rss.left.split)/tss))
## [1] 0.6202392
The \(R^2\) after this split is 0.62; the \(R^2\) after the first split is 0.51; the improvement is 0.11. Depending on the setting of the complexity parameter (CP), we would decide to split or not. This improvment is not too much, only about 4%, i.e., corresponding to a CP of 0.04.
Make the split:
meuse.split1.less.split2.less <- meuse.split1.less[meuse.split1.less$elev < e.threshold.1.1,]
meuse.split1.less.split2.more <- meuse.split1.less[meuse.split1.less$elev >= e.threshold.1.1,]
dim(meuse.split1.less.split2.less)[1]; dim(meuse.split1.less.split2.more)[1]
## [1] 38
## [1] 13
row.names(meuse.split1.less.split2.less)
## [1] "1" "2" "13" "16" "17" "18" "19" "20" "39" "40" "41"
## [12] "46" "53" "54" "55" "56" "57" "60" "61" "62" "63" "64"
## [23] "65" "66" "67" "79" "80" "81" "87" "88" "89" "90" "123"
## [34] "160" "97" "151" "152" "153"
row.names(meuse.split1.less.split2.more)
## [1] "8" "14" "21" "96" "120" "121" "124" "128" "129" "130" "135"
## [12] "149" "164"
Same procedure for the other group.
(distances <- sort(unique(meuse.split1.more$dist.m)))
## [1] 140 150 160 170 190 200 210 220 240 260 270 280 290 300
## [15] 310 320 330 340 350 360 370 380 390 400 410 420 430 440
## [29] 450 460 470 480 490 500 520 530 540 550 560 570 630 650
## [43] 660 680 690 710 720 750 760 860 1000
(nd <- length(distances))
## [1] 51
results.df <- data.frame(distance=distances, rss.more=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <- meuse.split1.more$zinc[meuse.split1.more$dist.m < distances[i]]
branch.more <- meuse.split1.more$zinc[meuse.split1.more$dist.m >= distances[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
print(results.df)
## distance rss.more rss.more.1 rss r.squared
## 1 140 0.0 3.030296e+06 3030296 0.8539648
## 2 150 14126.0 2.876879e+06 2891005 0.8606775
## 3 160 197781.7 2.677065e+06 2874847 0.8614562
## 4 170 239020.5 2.669654e+06 2908674 0.8598260
## 5 190 254558.9 2.629214e+06 2883773 0.8610260
## 6 200 269820.8 2.629090e+06 2898910 0.8602965
## 7 210 466023.0 2.198424e+06 2664447 0.8715957
## 8 220 485336.5 2.102696e+06 2588033 0.8752782
## 9 240 498229.8 2.023965e+06 2522195 0.8784511
## 10 260 587656.0 1.993349e+06 2581005 0.8756169
## 11 270 697420.0 1.870105e+06 2567525 0.8762665
## 12 280 1157769.9 1.103623e+06 2261393 0.8910195
## 13 290 1166118.2 1.093727e+06 2259846 0.8910941
## 14 300 1287622.5 1.085278e+06 2372900 0.8856458
## 15 310 1310127.6 1.084929e+06 2395056 0.8845781
## 16 320 1366140.2 1.032326e+06 2398466 0.8844138
## 17 330 1584093.6 7.633569e+05 2347450 0.8868723
## 18 340 1605216.1 7.630010e+05 2368217 0.8858715
## 19 350 1659463.0 7.486246e+05 2408088 0.8839501
## 20 360 1660659.2 7.335915e+05 2394251 0.8846169
## 21 370 1716922.0 6.254629e+05 2342385 0.8871164
## 22 380 1750630.0 6.249539e+05 2375584 0.8855165
## 23 390 1829347.0 6.186454e+05 2447992 0.8820270
## 24 400 1907668.1 6.112366e+05 2518905 0.8786096
## 25 410 1982753.7 6.093446e+05 2592098 0.8750823
## 26 420 2041436.9 6.035239e+05 2644961 0.8725348
## 27 430 2084403.7 6.019859e+05 2686390 0.8705382
## 28 440 2085895.2 5.957830e+05 2681678 0.8707653
## 29 450 2113021.2 5.957748e+05 2708796 0.8694584
## 30 460 2167097.9 5.561872e+05 2723285 0.8687602
## 31 470 2203513.9 5.533319e+05 2756846 0.8671428
## 32 480 2214846.6 5.486696e+05 2763516 0.8668214
## 33 490 2243200.1 5.221048e+05 2765305 0.8667352
## 34 500 2257298.7 5.220175e+05 2779316 0.8660599
## 35 520 2300478.8 5.212990e+05 2821778 0.8640136
## 36 530 2333477.5 4.369718e+05 2770449 0.8664872
## 37 540 2361337.9 4.369278e+05 2798266 0.8651467
## 38 550 2425614.3 4.314852e+05 2857100 0.8623114
## 39 560 2468958.3 4.236141e+05 2892572 0.8606019
## 40 570 2502733.7 4.149336e+05 2917667 0.8593926
## 41 630 2511414.2 4.148858e+05 2926300 0.8589765
## 42 650 2545715.6 4.120889e+05 2957804 0.8574583
## 43 660 2569572.4 4.069047e+05 2976477 0.8565584
## 44 680 2569736.3 4.031925e+05 2972929 0.8567294
## 45 690 2621907.7 3.847440e+05 3006652 0.8551042
## 46 710 2628841.3 3.635260e+05 2992367 0.8557926
## 47 720 2633713.9 3.635234e+05 2997237 0.8555579
## 48 750 2659832.5 3.538015e+05 3013634 0.8547678
## 49 760 2920717.3 5.327500e+02 2921250 0.8592199
## 50 860 2943033.2 8.866667e+01 2943122 0.8581659
## 51 1000 3001233.7 0.000000e+00 3001234 0.8553654
ix.r.squared.max <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max,])
## distance rss.more rss.more.1 rss r.squared
## 13 290 1166118 1093727 2259846 0.8910941
(distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])
## [1] 0.8910941
d.threshold.2.1 <- results.df[ix.r.squared.max,"distance"]
plot(r.squared ~ distance, data=results.df, type="h",
col=ifelse(distance==d.threshold.2.1,"red","gray"))
The cutoff for distance should be 290 meters, explaining 89.11% of the variation in this group; however we see it is very close to the next-further distance.
But we also need to try the other possible splitting variable, elevation.
(elevations <- sort(unique(meuse.split1.more$elev)))
## [1] 5.800 6.160 6.320 6.360 6.560 6.740 6.900 6.986 7.480 7.516
## [11] 7.564 7.610 7.633 7.640 7.653 7.655 7.720 7.756 7.760 7.780
## [21] 7.791 7.792 7.800 7.809 7.820 7.822 7.860 7.932 7.951 7.971
## [31] 8.070 8.121 8.128 8.176 8.180 8.217 8.226 8.292 8.328 8.351
## [41] 8.410 8.429 8.468 8.472 8.504 8.507 8.536 8.575 8.577 8.659
## [51] 8.668 8.688 8.694 8.727 8.741 8.743 8.760 8.830 8.840 8.867
## [61] 8.937 8.973 8.976 8.990 9.015 9.020 9.036 9.041 9.043 9.049
## [71] 9.073 9.095 9.097 9.155 9.206 9.280 9.404 9.406 9.420 9.523
## [81] 9.528 9.555 9.573 9.604 9.634 9.691 9.717 9.720 9.732 9.811
## [91] 9.879 9.924 9.970 10.080 10.136 10.320 10.520
nd <- length(elevations)
results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
branch.less <-
meuse.split1.more$zinc[meuse.split1.more$elev < elevations[i]]
branch.more <-
meuse.split1.more$zinc[meuse.split1.more$elev >= elevations[i]]
rss.less <- sum((branch.less-mean(branch.less))^2)
rss.more <- sum((branch.more-mean(branch.more))^2)
rss <- sum(rss.less + rss.more)
results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
}
# print(results.df)
ix.r.squared.max.e <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max.e,])
## elevation rss.less rss.more rss r.squared
## 8 6.986 219496 1234277 1453773 0.9299401
elevation.r.squared.max <- results.df[ix.r.squared.max.e,"r.squared"]
e.threshold.1.2 <- results.df[ix.r.squared.max.e,"elevation"]
plot(r.squared ~ elevation, data=results.df, type="h",
col=ifelse(elevation==e.threshold.1.2,"red","gray"))
The cutoff for elevation should be 6.986 m.a.s.l., explaining 92.99% of the variation in this group; however we see it is very close to the next-further distance.
The two improvements are close, but now splitting this branch on elevation (\(R^2\) = 0.93) gives a better \(R^2\) than splitting on distance (\(R^2\) = 0.891).
Compute the \(R^2\) of the proposed split over all the (new) groups, and compare with the \(R^2\) from the first split we’ve already made. We do not include the split of the left branch.
rss.left.split <- sum(results.df[ix.r.squared.max.e,c("rss.less","rss.more")])
(r2.2 <- 1 - ((rss.right + rss.left.split)/tss))
## [1] 0.783905
The \(R^2\) after this second split is 0.784; the \(R^2\) after the first split is 0.51; the improvement is 0.273. This is a very substantial improvement, so the split would be made.
Make the split:
meuse.split1.more.split2.less <- meuse.split1.more[meuse.split1.more$elev < e.threshold.1.2,]
meuse.split1.more.split2.more <- meuse.split1.more[meuse.split1.more$elev >= e.threshold.1.2,]
dim(meuse.split1.more.split2.less)[1]; dim(meuse.split1.more.split2.more)[1]
## [1] 9
## [1] 95
row.names(meuse.split1.more.split2.less)
## [1] "47" "59" "69" "76" "82" "83" "84" "85" "86"
row.names(meuse.split1.more.split2.more)
## [1] "3" "4" "5" "6" "7" "9" "10" "11" "12" "15" "22"
## [12] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33"
## [23] "34" "35" "37" "38" "42" "43" "44" "45" "48" "49" "50"
## [34] "51" "52" "58" "75" "163" "70" "71" "91" "92" "93" "94"
## [45] "95" "98" "99" "100" "101" "102" "103" "104" "105" "106" "108"
## [56] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119"
## [67] "122" "125" "126" "127" "131" "132" "133" "134" "136" "161" "162"
## [78] "137" "138" "140" "141" "142" "143" "144" "145" "146" "147" "148"
## [89] "150" "154" "155" "156" "157" "158" "159"
We now have four groups at two levels. The predictions for the groups are the averages of the target variable value in the group.
Top level (no split):
mean(meuse$zinc)
## [1] 469.7161
First split:
mean(meuse.split1.less$zinc)
## [1] 843.0196
mean(meuse.split1.more$zinc)
## [1] 286.6538
This is a big difference: closer to the river than 140 m has a much higher mean Zn concentration than further away, about 3x.
Second split, left branch:
mean(meuse.split1.less$zinc)
## [1] 843.0196
(pred.1.1 <- mean(meuse.split1.less.split2.less$zinc))
## [1] 966.6316
(pred.1.2 <- mean(meuse.split1.less.split2.more$zinc))
## [1] 481.6923
At the elevations lower than 6.986 m.a.s.l., close to the river, are the highest Zn concentrations.
Second split, right branch:
mean(meuse.split1.more$zinc)
## [1] 286.6538
(pred.2.1 <- mean(meuse.split1.more.split2.less$zinc))
## [1] 686.6667
(pred.2.2 <- mean(meuse.split1.more.split2.more$zinc))
## [1] 248.7579
Again the elevation is important: at the elevations lower than 6.986 m.a.s.l., but far from the river, are the second-highest Zn concentrations.
Add the predictions for each point and display:
meuse$rf.pred <- 0
meuse[(meuse$dist.m < d.threshold.1) & (meuse$elev < e.threshold.1.1), "rf.pred"] <- pred.1.1
meuse[(meuse$dist.m < d.threshold.1) & (meuse$elev >= e.threshold.1.1), "rf.pred"] <- pred.1.2
meuse[(meuse$dist.m >= d.threshold.1) & (meuse$elev < e.threshold.1.2), "rf.pred"] <- pred.2.1
meuse[(meuse$dist.m >= d.threshold.1) & (meuse$elev >= e.threshold.1.2), "rf.pred"] <- pred.2.2
table(round(meuse$rf.pred,2))
##
## 248.76 481.69 686.67 966.63
## 95 13 9 38
plot(meuse$y ~ meuse$x, cex=1.5*meuse$rf.pred/max(meuse$rf.pred), asp=1)
grid()
data(meuse.riv)
lines(meuse.riv)