This shows how rpart decides on splits.

1 Dataset

Load the example dataset:

library(sp)
data(meuse)
names(meuse)

##  [1] "x"       "y"       "cadmium" "copper"  "lead"    "zinc"    "elev"   
##  [8] "dist"    "om"      "ffreq"   "soil"    "lime"    "landuse" "dist.m"

We want to predict Zn from distance to river and elevation.

2 Top-level split

2.1 Split on distance to river

We first try to split on distance. The idea is to find the cutpoint at which the residual sum of squares (within-group) is lowest, i.e., the between-group is highest. So, we sort the values and try them all.

We first find the total sum of squares (TSS), i.e., with no model:

(tss <- sum((meuse$zinc - mean(meuse$zinc))^2))

## [1] 20750448

Now try all thresholds; keep the results in a dataframe

(distances <- sort(unique(meuse$dist.m)))

##  [1]   10   20   30   40   50   60   70   80  100  110  120  130  140  150
## [15]  160  170  190  200  210  220  240  260  270  280  290  300  310  320
## [29]  330  340  350  360  370  380  390  400  410  420  430  440  450  460
## [43]  470  480  490  500  520  530  540  550  560  570  630  650  660  680
## [57]  690  710  720  750  760  860 1000

(nd <- length(distances))

## [1] 63

results.df <- data.frame(distance=distances, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <- meuse$zinc[meuse$dist.m < distances[i]]
  branch.more <- meuse$zinc[meuse$dist.m >= distances[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
print(results.df)

##    distance rss.less     rss.more      rss   r.squared
## 1        10        0 2.075045e+07 20750448 0.000000000
## 2        20  1832626 1.483439e+07 16667016 0.196787655
## 3        30  2316309 1.140827e+07 13724576 0.338588909
## 4        40  2333315 1.083309e+07 13166408 0.365487991
## 5        50  2723383 1.083307e+07 13556453 0.346691036
## 6        60  3174655 8.722037e+06 11896692 0.426677814
## 7        70  3236145 8.277919e+06 11514063 0.445117348
## 8        80  6067468 5.433085e+06 11500553 0.445768412
## 9       100  6395320 4.686610e+06 11081930 0.465942608
## 10      110  6692027 3.806981e+06 10499009 0.494034598
## 11      120  6698102 3.566876e+06 10264978 0.505312936
## 12      130  6979739 3.225653e+06 10205392 0.508184486
## 13      140  7127795 3.030296e+06 10158091 0.510464027
## 14      150  7477245 2.876879e+06 10354123 0.501016862
## 15      160  8372655 2.677065e+06 11049720 0.467494875
## 16      170  9071302 2.669654e+06 11740956 0.434183022
## 17      190  9370854 2.629214e+06 12000069 0.421695916
## 18      200  9611896 2.629090e+06 12240986 0.410085694
## 19      210  9714370 2.198424e+06 11912794 0.425901826
## 20      220  9744329 2.102696e+06 11847025 0.429071353
## 21      240 10077872 2.023965e+06 12101837 0.416791517
## 22      260 10687674 1.993349e+06 12681023 0.388879539
## 23      270 11171707 1.870105e+06 13041812 0.371492482
## 24      280 11517144 1.103623e+06 12620767 0.391783369
## 25      290 11644762 1.093727e+06 12738489 0.386110140
## 26      300 12289870 1.085278e+06 13375148 0.355428464
## 27      310 12463583 1.084929e+06 13548511 0.347073775
## 28      320 12752996 1.032326e+06 13785321 0.335661490
## 29      330 13436297 7.633569e+05 14199653 0.315694112
## 30      340 13594447 7.630010e+05 14357448 0.308089713
## 31      350 13917643 7.486246e+05 14666268 0.293207152
## 32      360 13998117 7.335915e+05 14731709 0.290053443
## 33      370 14272616 6.254629e+05 14898079 0.282035779
## 34      380 14453002 6.249539e+05 15077956 0.273367184
## 35      390 14927480 6.186454e+05 15546126 0.250805283
## 36      400 15304443 6.112366e+05 15915680 0.232995820
## 37      410 15770533 6.093446e+05 16379878 0.210625331
## 38      420 16095849 6.035239e+05 16699373 0.195228278
## 39      430 16382791 6.019859e+05 16984777 0.181474176
## 40      440 16455204 5.957830e+05 17050987 0.178283413
## 41      450 16693074 5.957748e+05 17288849 0.166820414
## 42      460 17004729 5.561872e+05 17560916 0.153709048
## 43      470 17256084 5.533319e+05 17809415 0.141733427
## 44      480 17424991 5.486696e+05 17973661 0.133818174
## 45      490 17598302 5.221048e+05 18120407 0.126746210
## 46      500 17709527 5.220175e+05 18231545 0.121390275
## 47      520 18041346 5.212990e+05 18562645 0.105433999
## 48      530 18042140 4.369718e+05 18479111 0.109459623
## 49      540 18254111 4.369278e+05 18691038 0.099246489
## 50      550 18623506 4.314852e+05 19054991 0.081706970
## 51      560 18862636 4.236141e+05 19286250 0.070562221
## 52      570 19011345 4.149336e+05 19426279 0.063813972
## 53      630 19097868 4.148858e+05 19512754 0.059646599
## 54      650 19376465 4.120889e+05 19788554 0.046355319
## 55      660 19556095 4.069047e+05 19963000 0.037948471
## 56      680 19599209 4.031925e+05 20002402 0.036049630
## 57      690 19851207 3.847440e+05 20235951 0.024794481
## 58      710 19863012 3.635260e+05 20226538 0.025248128
## 59      720 19930817 3.635234e+05 20294340 0.021980591
## 60      750 20054249 3.538015e+05 20408050 0.016500717
## 61      760 20261299 5.327500e+02 20261832 0.023547250
## 62      860 20373355 8.866667e+01 20373443 0.018168477
## 63     1000 20625231 0.000000e+00 20625231 0.006034401

(ix.r.squared.max <- which.max(results.df$r.squared))

## [1] 13

print(results.df[ix.r.squared.max,])

##    distance rss.less rss.more      rss r.squared
## 13      140  7127795  3030296 10158091  0.510464

(distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])

## [1] 0.510464

(d.threshold.1 <- results.df[ix.r.squared.max,"distance"])

## [1] 140

plot(r.squared ~ distance, data=results.df, type="h",
     col=ifelse(distance==d.threshold.1,"red","gray"))

The cutoff for distance should be 140 meters. This would explain 51% of the variation in Zn concentration.

2.2 Split on elevation

But we also need to try the other possible splitting variable:

(elevations <- sort(unique(meuse$elev)))

##   [1]  5.180  5.700  5.760  5.800  6.160  6.280  6.320  6.340  6.360  6.420
##  [11]  6.480  6.560  6.680  6.740  6.860  6.900  6.920  6.983  6.986  7.000
##  [21]  7.020  7.100  7.160  7.200  7.220  7.300  7.307  7.320  7.360  7.400
##  [31]  7.440  7.480  7.516  7.536  7.540  7.552  7.564  7.610  7.633  7.640
##  [41]  7.653  7.655  7.680  7.702  7.706  7.720  7.740  7.756  7.760  7.780
##  [51]  7.784  7.791  7.792  7.800  7.809  7.820  7.822  7.860  7.900  7.909
##  [61]  7.932  7.940  7.951  7.971  7.980  8.060  8.070  8.121  8.128  8.176
##  [71]  8.180  8.217  8.226  8.231  8.261  8.292  8.328  8.351  8.410  8.429
##  [81]  8.463  8.468  8.470  8.472  8.490  8.504  8.507  8.536  8.538  8.540
##  [91]  8.575  8.577  8.659  8.668  8.688  8.694  8.704  8.727  8.741  8.743
## [101]  8.760  8.779  8.809  8.815  8.830  8.840  8.867  8.908  8.937  8.973
## [111]  8.976  8.990  9.015  9.020  9.036  9.041  9.043  9.049  9.073  9.095
## [121]  9.097  9.155  9.206  9.280  9.404  9.406  9.420  9.518  9.523  9.528
## [131]  9.555  9.573  9.604  9.634  9.691  9.717  9.720  9.732  9.811  9.879
## [141]  9.924  9.970 10.080 10.136 10.320 10.520

(nd <- length(elevations))

## [1] 146

results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <- meuse$zinc[meuse$elev < elevations[i]]
  branch.more <- meuse$zinc[meuse$elev >= elevations[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
# print(results.df)
(ix.r.squared.max <- which.max(results.df$r.squared))

## [1] 32

print(results.df[ix.r.squared.max,])

##    elevation rss.less rss.more      rss r.squared
## 32      7.48  3362676 10176189 13538864 0.3475387

(elevation.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])

## [1] 0.3475387

(e.threshold.1 <- results.df[ix.r.squared.max,"elevation"])

## [1] 7.48

plot(r.squared ~ elevation, data=results.df, type="h",
     col=ifelse(elevation==e.threshold.1,"red","gray"))

The cutoff for elevation should be 7.48 m.a.s.l. This would explain 34.8% of the variation in Zn concentration.

Clearly the distance gives a better split (\(R^2\) = 0.51) than elevation (\(R^2\) = 0.348).

2.3 Make the split

We now split the dataset into those two groups, but only if the increase in \(R^2\) is more than a user-specified threshold. Here there is a very large increase in \(R^2\) (the unsplit \(R^2\) is by definition 0), so we split.

meuse.split1.less <- meuse[meuse$dist.m < d.threshold.1,]
meuse.split1.more <- meuse[meuse$dist.m >= d.threshold.1,]
dim(meuse.split1.less)[1]; dim(meuse.split1.more)[1]

## [1] 51

## [1] 104

We see that the first group (closer to river) has fewer points.

3 Second-level split

Now rpart does the same procedure for each group separately; the improvement in \(R^2\) of the finer split is compared to that from the first split.

3.1 Split of left group

We try both distance and elevation on the left-hand group.

(distances <- sort(unique(meuse.split1.less$dist.m)))

##  [1]  10  20  30  40  50  60  70  80 100 110 120 130

nd <- length(distances)

All of these distances are less then the threshold use for the first split.

We now see how much further splitting on distance improves the fit.

results.df <- data.frame(distance=distances, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <- meuse.split1.less$zinc[meuse.split1.less$dist.m < distances[i]]
  branch.more <- meuse.split1.less$zinc[meuse.split1.less$dist.m >= distances[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
print(results.df)

##    distance rss.less  rss.more     rss r.squared
## 1        10        0 7127795.0 7127795 0.6564992
## 2        20  1832626 4830846.3 6663472 0.6788757
## 3        30  2316309 3845385.5 6161695 0.7030573
## 4        40  2333315 3668820.8 6002136 0.7107467
## 5        50  2723383 3550061.7 6273445 0.6976718
## 6        60  3174655 2507435.8 5682091 0.7261702
## 7        70  3236145 2455665.0 5691810 0.7257018
## 8        80  6067468  446060.0 6513528 0.6861018
## 9       100  6395320  417804.0 6813124 0.6716638
## 10      110  6692027  118938.8 6810966 0.6717678
## 11      120  6698102   85938.0 6784040 0.6730654
## 12      130  6979739   14792.0 6994531 0.6629214

ix.r.squared.max <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max,])

##   distance rss.less rss.more     rss r.squared
## 6       60  3174655  2507436 5682091 0.7261702

distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"]
d.threshold.1.1 <- results.df[ix.r.squared.max,"distance"]
plot(r.squared ~ distance, data=results.df, type="h",
     col=ifelse(distance==d.threshold.1.1,"red","gray"))

The cutoff for distance should be 60 meters; however we see it is very close to the next-further distance. This explains 72.62% of the variation in this group.

But we also need to try the other possible splitting variable, elevation.

(elevations <- sort(unique(meuse.split1.less$elev)))

##  [1] 5.180 5.700 5.760 6.280 6.340 6.420 6.480 6.680 6.860 6.920 6.983
## [12] 7.000 7.020 7.100 7.160 7.200 7.220 7.300 7.307 7.320 7.360 7.400
## [23] 7.440 7.536 7.540 7.552 7.680 7.702 7.706 7.740 7.784 7.900 7.909
## [34] 7.940 7.980 8.060 8.231 8.261 8.463 8.470 8.490 8.538 8.540 8.704
## [45] 8.779 8.809 8.815 8.908 9.518

nd <- length(elevations)

Although we did not split on elevation, the set of elevations at this closer distance is smaller than the set for all the points.

results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <- meuse.split1.less$zinc[meuse.split1.less$elev < elevations[i]]
  branch.more <- meuse.split1.less$zinc[meuse.split1.less$elev >= elevations[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
# print(results.df)
ix.r.squared.max.e <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max.e,])

##    elevation rss.less rss.more     rss r.squared
## 37     8.231  4351389 498522.8 4849912 0.7662744

elevation.r.squared.max <- results.df[ix.r.squared.max.e,"r.squared"]
e.threshold.1.1 <- results.df[ix.r.squared.max.e,"elevation"]
plot(r.squared ~ elevation, data=results.df, type="h",
     col=ifelse(elevation==e.threshold.1.1,"red","gray"))

The cutoff for elevation should be 8.231 meters; however we see it is very close to another similar elevation. This explains 76.63% of the variation in this group.

The two improvements are close, but now splitting this branch on elevation (\(R^2\) = 0.766) gives a better \(R^2\) within this branch than splitting on distance (\(R^2\) = 0.726).

The \(R^2\) reported in the previous code is only for this group; to decide if the split is enough improvement in the overall model we need to compute the \(R^2\) of the proposed split over all the (new) groups, and compare with the \(R^2\) from the first split we’ve already made.

rss.right <- sum((meuse.split1.more$zinc - mean(meuse.split1.more$zinc))^2)
rss.left <- sum((meuse.split1.less$zinc - mean(meuse.split1.less$zinc))^2)
(r2.1 <- 1 - ((rss.right + rss.left)/tss))

## [1] 0.510464

rss.left.split <- sum(results.df[ix.r.squared.max.e,c("rss.less","rss.more")])
(r2.2 <- 1 - ((rss.right + rss.left.split)/tss))

## [1] 0.6202392

The \(R^2\) after this split is 0.62; the \(R^2\) after the first split is 0.51; the improvement is 0.11. Depending on the setting of the complexity parameter (CP), we would decide to split or not. This improvment is not too much, only about 4%, i.e., corresponding to a CP of 0.04.

Make the split:

meuse.split1.less.split2.less <- meuse.split1.less[meuse.split1.less$elev < e.threshold.1.1,]
meuse.split1.less.split2.more <- meuse.split1.less[meuse.split1.less$elev >= e.threshold.1.1,]
dim(meuse.split1.less.split2.less)[1]; dim(meuse.split1.less.split2.more)[1]

## [1] 38

## [1] 13

row.names(meuse.split1.less.split2.less)

##  [1] "1"   "2"   "13"  "16"  "17"  "18"  "19"  "20"  "39"  "40"  "41" 
## [12] "46"  "53"  "54"  "55"  "56"  "57"  "60"  "61"  "62"  "63"  "64" 
## [23] "65"  "66"  "67"  "79"  "80"  "81"  "87"  "88"  "89"  "90"  "123"
## [34] "160" "97"  "151" "152" "153"

row.names(meuse.split1.less.split2.more)

##  [1] "8"   "14"  "21"  "96"  "120" "121" "124" "128" "129" "130" "135"
## [12] "149" "164"

3.2 Split of right group

Same procedure for the other group.

(distances <- sort(unique(meuse.split1.more$dist.m)))

##  [1]  140  150  160  170  190  200  210  220  240  260  270  280  290  300
## [15]  310  320  330  340  350  360  370  380  390  400  410  420  430  440
## [29]  450  460  470  480  490  500  520  530  540  550  560  570  630  650
## [43]  660  680  690  710  720  750  760  860 1000

(nd <- length(distances))

## [1] 51

results.df <- data.frame(distance=distances, rss.more=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <- meuse.split1.more$zinc[meuse.split1.more$dist.m < distances[i]]
  branch.more <- meuse.split1.more$zinc[meuse.split1.more$dist.m >= distances[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
print(results.df)

##    distance  rss.more   rss.more.1     rss r.squared
## 1       140       0.0 3.030296e+06 3030296 0.8539648
## 2       150   14126.0 2.876879e+06 2891005 0.8606775
## 3       160  197781.7 2.677065e+06 2874847 0.8614562
## 4       170  239020.5 2.669654e+06 2908674 0.8598260
## 5       190  254558.9 2.629214e+06 2883773 0.8610260
## 6       200  269820.8 2.629090e+06 2898910 0.8602965
## 7       210  466023.0 2.198424e+06 2664447 0.8715957
## 8       220  485336.5 2.102696e+06 2588033 0.8752782
## 9       240  498229.8 2.023965e+06 2522195 0.8784511
## 10      260  587656.0 1.993349e+06 2581005 0.8756169
## 11      270  697420.0 1.870105e+06 2567525 0.8762665
## 12      280 1157769.9 1.103623e+06 2261393 0.8910195
## 13      290 1166118.2 1.093727e+06 2259846 0.8910941
## 14      300 1287622.5 1.085278e+06 2372900 0.8856458
## 15      310 1310127.6 1.084929e+06 2395056 0.8845781
## 16      320 1366140.2 1.032326e+06 2398466 0.8844138
## 17      330 1584093.6 7.633569e+05 2347450 0.8868723
## 18      340 1605216.1 7.630010e+05 2368217 0.8858715
## 19      350 1659463.0 7.486246e+05 2408088 0.8839501
## 20      360 1660659.2 7.335915e+05 2394251 0.8846169
## 21      370 1716922.0 6.254629e+05 2342385 0.8871164
## 22      380 1750630.0 6.249539e+05 2375584 0.8855165
## 23      390 1829347.0 6.186454e+05 2447992 0.8820270
## 24      400 1907668.1 6.112366e+05 2518905 0.8786096
## 25      410 1982753.7 6.093446e+05 2592098 0.8750823
## 26      420 2041436.9 6.035239e+05 2644961 0.8725348
## 27      430 2084403.7 6.019859e+05 2686390 0.8705382
## 28      440 2085895.2 5.957830e+05 2681678 0.8707653
## 29      450 2113021.2 5.957748e+05 2708796 0.8694584
## 30      460 2167097.9 5.561872e+05 2723285 0.8687602
## 31      470 2203513.9 5.533319e+05 2756846 0.8671428
## 32      480 2214846.6 5.486696e+05 2763516 0.8668214
## 33      490 2243200.1 5.221048e+05 2765305 0.8667352
## 34      500 2257298.7 5.220175e+05 2779316 0.8660599
## 35      520 2300478.8 5.212990e+05 2821778 0.8640136
## 36      530 2333477.5 4.369718e+05 2770449 0.8664872
## 37      540 2361337.9 4.369278e+05 2798266 0.8651467
## 38      550 2425614.3 4.314852e+05 2857100 0.8623114
## 39      560 2468958.3 4.236141e+05 2892572 0.8606019
## 40      570 2502733.7 4.149336e+05 2917667 0.8593926
## 41      630 2511414.2 4.148858e+05 2926300 0.8589765
## 42      650 2545715.6 4.120889e+05 2957804 0.8574583
## 43      660 2569572.4 4.069047e+05 2976477 0.8565584
## 44      680 2569736.3 4.031925e+05 2972929 0.8567294
## 45      690 2621907.7 3.847440e+05 3006652 0.8551042
## 46      710 2628841.3 3.635260e+05 2992367 0.8557926
## 47      720 2633713.9 3.635234e+05 2997237 0.8555579
## 48      750 2659832.5 3.538015e+05 3013634 0.8547678
## 49      760 2920717.3 5.327500e+02 2921250 0.8592199
## 50      860 2943033.2 8.866667e+01 2943122 0.8581659
## 51     1000 3001233.7 0.000000e+00 3001234 0.8553654

ix.r.squared.max <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max,])

##    distance rss.more rss.more.1     rss r.squared
## 13      290  1166118    1093727 2259846 0.8910941

(distance.r.squared.max <- results.df[ix.r.squared.max,"r.squared"])

## [1] 0.8910941

d.threshold.2.1 <- results.df[ix.r.squared.max,"distance"]
plot(r.squared ~ distance, data=results.df, type="h",
     col=ifelse(distance==d.threshold.2.1,"red","gray"))

The cutoff for distance should be 290 meters, explaining 89.11% of the variation in this group; however we see it is very close to the next-further distance.

But we also need to try the other possible splitting variable, elevation.

(elevations <- sort(unique(meuse.split1.more$elev)))

##  [1]  5.800  6.160  6.320  6.360  6.560  6.740  6.900  6.986  7.480  7.516
## [11]  7.564  7.610  7.633  7.640  7.653  7.655  7.720  7.756  7.760  7.780
## [21]  7.791  7.792  7.800  7.809  7.820  7.822  7.860  7.932  7.951  7.971
## [31]  8.070  8.121  8.128  8.176  8.180  8.217  8.226  8.292  8.328  8.351
## [41]  8.410  8.429  8.468  8.472  8.504  8.507  8.536  8.575  8.577  8.659
## [51]  8.668  8.688  8.694  8.727  8.741  8.743  8.760  8.830  8.840  8.867
## [61]  8.937  8.973  8.976  8.990  9.015  9.020  9.036  9.041  9.043  9.049
## [71]  9.073  9.095  9.097  9.155  9.206  9.280  9.404  9.406  9.420  9.523
## [81]  9.528  9.555  9.573  9.604  9.634  9.691  9.717  9.720  9.732  9.811
## [91]  9.879  9.924  9.970 10.080 10.136 10.320 10.520

nd <- length(elevations)
results.df <- data.frame(elevation=elevations, rss.less=rep(0,nd), rss.more=rep(0,nd), rss=rep(0,nd), r.squared=rep(0,nd))
for (i in 1:nd) {
  branch.less <-
    meuse.split1.more$zinc[meuse.split1.more$elev < elevations[i]]
  branch.more <-
    meuse.split1.more$zinc[meuse.split1.more$elev >= elevations[i]]
  rss.less <- sum((branch.less-mean(branch.less))^2)
  rss.more <- sum((branch.more-mean(branch.more))^2)
  rss <- sum(rss.less + rss.more)
  results.df[i,2:5] <- c(rss.less, rss.more, rss, 1-rss/tss)
  }
# print(results.df)
ix.r.squared.max.e <- which.max(results.df$r.squared)
print(results.df[ix.r.squared.max.e,])

##   elevation rss.less rss.more     rss r.squared
## 8     6.986   219496  1234277 1453773 0.9299401

elevation.r.squared.max <- results.df[ix.r.squared.max.e,"r.squared"]
e.threshold.1.2 <- results.df[ix.r.squared.max.e,"elevation"]
plot(r.squared ~ elevation, data=results.df, type="h",
     col=ifelse(elevation==e.threshold.1.2,"red","gray"))

The cutoff for elevation should be 6.986 m.a.s.l., explaining 92.99% of the variation in this group; however we see it is very close to the next-further distance.

The two improvements are close, but now splitting this branch on elevation (\(R^2\) = 0.93) gives a better \(R^2\) than splitting on distance (\(R^2\) = 0.891).

Compute the \(R^2\) of the proposed split over all the (new) groups, and compare with the \(R^2\) from the first split we’ve already made. We do not include the split of the left branch.

rss.left.split <- sum(results.df[ix.r.squared.max.e,c("rss.less","rss.more")])
(r2.2 <- 1 - ((rss.right + rss.left.split)/tss))

## [1] 0.783905

The \(R^2\) after this second split is 0.784; the \(R^2\) after the first split is 0.51; the improvement is 0.273. This is a very substantial improvement, so the split would be made.

Make the split:

meuse.split1.more.split2.less <- meuse.split1.more[meuse.split1.more$elev < e.threshold.1.2,]
meuse.split1.more.split2.more <- meuse.split1.more[meuse.split1.more$elev >= e.threshold.1.2,]
dim(meuse.split1.more.split2.less)[1]; dim(meuse.split1.more.split2.more)[1]

## [1] 9

## [1] 95

row.names(meuse.split1.more.split2.less)

## [1] "47" "59" "69" "76" "82" "83" "84" "85" "86"

row.names(meuse.split1.more.split2.more)

##  [1] "3"   "4"   "5"   "6"   "7"   "9"   "10"  "11"  "12"  "15"  "22" 
## [12] "23"  "24"  "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33" 
## [23] "34"  "35"  "37"  "38"  "42"  "43"  "44"  "45"  "48"  "49"  "50" 
## [34] "51"  "52"  "58"  "75"  "163" "70"  "71"  "91"  "92"  "93"  "94" 
## [45] "95"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "108"
## [56] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119"
## [67] "122" "125" "126" "127" "131" "132" "133" "134" "136" "161" "162"
## [78] "137" "138" "140" "141" "142" "143" "144" "145" "146" "147" "148"
## [89] "150" "154" "155" "156" "157" "158" "159"

4 Predictions for the groups

We now have four groups at two levels. The predictions for the groups are the averages of the target variable value in the group.

Top level (no split):

mean(meuse$zinc)

## [1] 469.7161

First split:

mean(meuse.split1.less$zinc)

## [1] 843.0196

mean(meuse.split1.more$zinc)

## [1] 286.6538

This is a big difference: closer to the river than 140 m has a much higher mean Zn concentration than further away, about 3x.

Second split, left branch:

mean(meuse.split1.less$zinc)

## [1] 843.0196

(pred.1.1 <- mean(meuse.split1.less.split2.less$zinc))

## [1] 966.6316

(pred.1.2 <- mean(meuse.split1.less.split2.more$zinc))

## [1] 481.6923

At the elevations lower than 6.986 m.a.s.l., close to the river, are the highest Zn concentrations.

Second split, right branch:

mean(meuse.split1.more$zinc)

## [1] 286.6538

(pred.2.1 <- mean(meuse.split1.more.split2.less$zinc))

## [1] 686.6667

(pred.2.2 <- mean(meuse.split1.more.split2.more$zinc))

## [1] 248.7579

Again the elevation is important: at the elevations lower than 6.986 m.a.s.l., but far from the river, are the second-highest Zn concentrations.

5 Map

Add the predictions for each point and display:

meuse$rf.pred <- 0
meuse[(meuse$dist.m < d.threshold.1) & (meuse$elev < e.threshold.1.1), "rf.pred"] <- pred.1.1
meuse[(meuse$dist.m < d.threshold.1) & (meuse$elev >= e.threshold.1.1), "rf.pred"] <- pred.1.2
meuse[(meuse$dist.m >= d.threshold.1) & (meuse$elev < e.threshold.1.2), "rf.pred"] <- pred.2.1
meuse[(meuse$dist.m >= d.threshold.1) & (meuse$elev >= e.threshold.1.2), "rf.pred"] <- pred.2.2
table(round(meuse$rf.pred,2))

## 
## 248.76 481.69 686.67 966.63 
##     95     13      9     38

plot(meuse$y ~ meuse$x, cex=1.5*meuse$rf.pred/max(meuse$rf.pred), asp=1)
grid()
data(meuse.riv)
lines(meuse.riv)

Regression Trees – How splits are determined

D G Rossiter

2019-08-04