Purpose
DB - Fox - Regression. The usage of + - * / ^ in lm is going to be different. .Simulate a Multi reg '' .+
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] + indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3])
Residuals:
Min 1Q Median 3Q Max
-5.69660 -1.10317 0.02548 1.11955 5.30918
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.9875 0.1014 19.60 <2e-16 ***
indep[, 2] 2.8206 0.1320 21.37 <2e-16 ***
indep[, 3] 4.1764 0.1340 31.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.709 on 1974 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF, p-value: < 2.2e-16 |
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] - indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] - indep[, 3])
Residuals:
Min 1Q Median 3Q Max
-7.19413 -1.43267 -0.07039 1.44780 6.05450
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.07416 0.09307 43.78 <2e-16 ***
indep[, 2] 2.81388 0.16121 17.45 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.087 on 1975 degrees of freedom
Multiple R-squared: 0.1336, Adjusted R-squared: 0.1332
F-statistic: 304.7 on 1 and 1975 DF, p-value: < 2.2e-16 |
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] * indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] * indep[, 3])
Residuals:
Min 1Q Median 3Q Max
-5.69500 -1.10398 0.02508 1.12103 5.30697
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.99090 0.15574 12.783 <2e-16 ***
indep[, 2] 2.81386 0.26792 10.502 <2e-16 ***
indep[, 3] 4.16967 0.26972 15.459 <2e-16 ***
indep[, 2]:indep[, 3] 0.01331 0.46198 0.029 0.977
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.709 on 1973 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4186
F-statistic: 475.2 on 3 and 1973 DF, p-value: < 2.2e-16 |
/
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2]/indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2]/indep[, 3])
Residuals:
Min 1Q Median 3Q Max
-6.45644 -1.17190 0.04854 1.16028 5.65789
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.09045 0.08069 50.694 <2e-16 ***
indep[, 2] -0.31556 0.18579 -1.698 0.0896 .
indep[, 2]:indep[, 3] 6.21142 0.24296 25.566 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.81 on 1974 degrees of freedom
Multiple R-squared: 0.3491, Adjusted R-squared: 0.3485
F-statistic: 529.5 on 2 and 1974 DF, p-value: < 2.2e-16 |
^
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ I(indep[, 2]^2))
> summary(fit)
Call:
lm(formula = dep ~ I(indep[, 2]^2))
Residuals:
Min 1Q Median 3Q Max
-7.07126 -1.46190 -0.08996 1.45188 6.24808
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.57679 0.07029 65.11 <2e-16 ***
I(indep[, 2]^2) 2.70063 0.15678 17.23 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.091 on 1975 degrees of freedom
Multiple R-squared: 0.1306, Adjusted R-squared: 0.1302
F-statistic: 296.7 on 1 and 1975 DF, p-value: < 2.2e-16
> fit <- lm(dep ~ (indep[, 2]^2))
> summary(fit)
Call:
lm(formula = dep ~ (indep[, 2]^2))
Residuals:
Min 1Q Median 3Q Max
-7.19413 -1.43267 -0.07039 1.44780 6.05450
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.07416 0.09307 43.78 <2e-16 ***
indep[, 2] 2.81388 0.16121 17.45 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.087 on 1975 degrees of freedom
Multiple R-squared: 0.1336, Adjusted R-squared: 0.1332
F-statistic: 304.7 on 1 and 1975 DF, p-value: < 2.2e-16 |
To remove a few observations from the old model
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] + indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3])
Residuals:
Min 1Q Median 3Q Max
-5.69660 -1.10317 0.02548 1.11955 5.30918
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.9875 0.1014 19.60 <2e-16 ***
indep[, 2] 2.8206 0.1320 21.37 <2e-16 ***
indep[, 3] 4.1764 0.1340 31.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.709 on 1974 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF, p-value: < 2.2e-16
> summary(update(fit, subset = -10))
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3], subset = -10)
Residuals:
Min 1Q Median 3Q Max
-5.69632 -1.10368 0.02294 1.12023 5.31033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.9861 0.1015 19.57 <2e-16 ***
indep[, 2] 2.8219 0.1321 21.37 <2e-16 ***
indep[, 3] 4.1772 0.1340 31.17 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.709 on 1973 degrees of freedom
Multiple R-squared: 0.4194, Adjusted R-squared: 0.4188
F-statistic: 712.7 on 2 and 1973 DF, p-value: < 2.2e-16 |
Want to do a standardized regression coeff
> z <- cbind(dep, indep)
> z <- scale(z)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4])
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4])
Residuals:
Min 1Q Median 3Q Max
-2.54100 -0.49208 0.01136 0.49938 2.36819
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.868e-19 1.714e-02 5.76e-17 1
z[, 3] 3.664e-01 1.715e-02 21.37 <2e-16 ***
z[, 4] 5.346e-01 1.715e-02 31.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7623 on 1974 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF, p-value: < 2.2e-16 |
Want to remove intercept from the model.
> z <- cbind(dep, indep)
> z <- scale(z)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] - 1)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] - 1)
Residuals:
Min 1Q Median 3Q Max
-2.54100 -0.49208 0.01136 0.49938 2.36819
Coefficients:
Estimate Std. Error t value Pr(>|t|)
z[, 3] 0.36644 0.01714 21.37 <2e-16 ***
z[, 4] 0.53463 0.01714 31.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7621 on 1975 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4189
F-statistic: 713.5 on 2 and 1975 DF, p-value: < 2.2e-16
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + 0)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + 0)
Residuals:
Min 1Q Median 3Q Max
-2.54100 -0.49208 0.01136 0.49938 2.36819
Coefficients:
Estimate Std. Error t value Pr(>|t|)
z[, 3] 0.36644 0.01714 21.37 <2e-16 ***
z[, 4] 0.53463 0.01714 31.18 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7621 on 1975 degrees of freedom
Multiple R-squared: 0.4195, Adjusted R-squared: 0.4189
F-statistic: 713.5 on 2 and 1975 DF, p-value: < 2.2e-16 |
Change the dummy variable coding
> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> contrasts(z$fac)
b c d
a 0 0 0
b 1 0 0
c 0 1 0
d 0 0 1 |
Change the base coding to
> contrasts(z$fac) <- contr.treatment(levels(z$fac), base = 2)
> contrasts(z$fac)
a c d
a 1 0 0
b 0 0 0
c 0 1 0
d 0 0 1 |
Run a reg with factor a as base
> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + z$fac)
Residuals:
Min 1Q Median 3Q Max
-5.75244 -1.15317 -0.03433 1.18877 5.13664
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.14092 0.11970 17.886 <2e-16 ***
z[, 3] 2.99510 0.13357 22.423 <2e-16 ***
z[, 4] 3.87625 0.13317 29.107 <2e-16 ***
z$facb -0.03152 0.10841 -0.291 0.771
z$facc -0.01576 0.10838 -0.145 0.884
z$facd -0.09286 0.10838 -0.857 0.392
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.713 on 1994 degrees of freedom
Multiple R-squared: 0.4063, Adjusted R-squared: 0.4048
F-statistic: 272.9 on 5 and 1994 DF, p-value: < 2.2e-16 |
Run a reg with factor b as base
> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> contrasts(z$fac) <- contr.treatment(levels(z$fac), base = 2)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + z$fac)
Residuals:
Min 1Q Median 3Q Max
-5.75244 -1.15317 -0.03433 1.18877 5.13664
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.10940 0.12157 17.351 <2e-16 ***
z[, 3] 2.99510 0.13357 22.423 <2e-16 ***
z[, 4] 3.87625 0.13317 29.107 <2e-16 ***
z$faca 0.03152 0.10841 0.291 0.771
z$facc 0.01576 0.10840 0.145 0.884
z$facd -0.06135 0.10838 -0.566 0.571
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.713 on 1994 degrees of freedom
Multiple R-squared: 0.4063, Adjusted R-squared: 0.4048
F-statistic: 272.9 on 5 and 1994 DF, p-value: < 2.2e-16 |
See the anova of the model
> anova(fit)
Analysis of Variance Table
Response: z[, 1]
Df Sum Sq Mean Sq F value Pr(>F)
z[, 3] 1 1516.2 1516.2 516.4704 <2e-16 ***
z[, 4] 1 2487.2 2487.2 847.2168 <2e-16 ***
z$fac 3 2.5 0.8 0.2813 0.839
Residuals 1994 5853.8 2.9
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 |
- Best thing is to test the goodness of fit relating to two models
> fit1 <- lm(z[, 1] ~ z[, 3] + z[, 4])
> fit2 <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> anova(fit1, fit2)
Analysis of Variance Table
Model 1: z[, 1] ~ z[, 3] + z[, 4]
Model 2: z[, 1] ~ z[, 3] + z[, 4] + z$fac
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1997 5856.3
2 1994 5853.8 3 2.5 0.2813 0.839 |
Basic Hypo testing
> library(car)
> attach(Duncan)
The following object(s) are masked from Duncan ( position 3 ) :
education income prestige type
The following object(s) are masked from Duncan ( position 4 ) :
education income prestige type
The following object(s) are masked from package:robustbase :
education
> fit <- lm(prestige ~ income + education)
> linear.hypothesis(fit, c(0, 1, -1))
Linear hypothesis test
Hypothesis:
income - education = 0
Model 1: prestige ~ income + education
Model 2: restricted model
Res.Df RSS Df Sum of Sq F Pr(>F)
1 42 7506.7
2 43 7518.9 -1 -12.2 0.0682 0.7952
> linear.hypothesis(fit, c(0, -1, 1))
Linear hypothesis test
Hypothesis:
-income + education = 0
Model 1: prestige ~ income + education
Model 2: restricted model
Res.Df RSS Df Sum of Sq F Pr(>F)
1 42 7506.7
2 43 7518.9 -1 -12.2 0.0682 0.7952
> fit <- lm(prestige ~ income)
> linear.hypothesis(fit, c(0, 1))
Linear hypothesis test
Hypothesis:
income = 0
Model 1: prestige ~ income
Model 2: restricted model
Res.Df RSS Df Sum of Sq F Pr(>F)
1 43 13023
2 44 43688 -1 -30665 101.25 7.144e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> linear.hypothesis(fit, diag(2), c(0, 1))
Linear hypothesis test
Hypothesis:
(Intercept) = 0
income = 1
Model 1: prestige ~ income
Model 2: restricted model
Res.Df RSS Df Sum of Sq F Pr(>F)
1 43 13022.8
2 45 14718.0 -2 -1695.2 2.7987 0.07201 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 |
Check for elliptical plots between two normal distributions
> data.ellipse(rnorm(1000), rnorm(1000), levels = c(0.5, 0.75,
+ 0.9, 0.95)) |
Check for elliptical plots between a normal and a uniform distributions
> data.ellipse(rnorm(1000), runif(1000), levels = c(0.5, 0.75,
+ 0.9, 0.95)) |
Confidence plots for estimates
> fit <- lm(prestige ~ income + education)
> confidence.ellipse(fit, levels = c(0.5, 0.75, 0.9, 0.95)) |
Arguments for lm
> args(lm)
function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
NULL |
Use of Identity
> fit <- lm(prestige ~ I(income + education))
> summary(fit)
Call:
lm(formula = prestige ~ I(income + education))
Residuals:
Min 1Q Median 3Q Max
-29.6049 -6.7078 0.1235 6.9794 33.2895
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.06319 4.22540 -1.435 0.159
I(income + education) 0.56927 0.03958 14.382 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.22 on 43 degrees of freedom
Multiple R-squared: 0.8279, Adjusted R-squared: 0.8239
F-statistic: 206.8 on 1 and 43 DF, p-value: < 2.2e-16
> fit <- lm(prestige ~ I(income * education))
> summary(fit)
Call:
lm(formula = prestige ~ I(income * education))
Residuals:
Min 1Q Median 3Q Max
-30.559 -10.845 -1.661 5.826 49.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.728e+01 3.425e+00 5.044 8.75e-06 ***
I(income * education) 1.120e-02 9.385e-04 11.934 3.10e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 15.35 on 43 degrees of freedom
Multiple R-squared: 0.7681, Adjusted R-squared: 0.7627
F-statistic: 142.4 on 1 and 43 DF, p-value: 3.102e-15
> fit <- lm(prestige ~ (income * education))
> summary(fit)
Call:
lm(formula = prestige ~ (income * education))
Residuals:
Min 1Q Median 3Q Max
-29.1898 -6.2758 0.4532 7.0933 32.7585
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.250934 7.378546 -1.118 0.26998
income 0.655621 0.197148 3.326 0.00187 **
education 0.606030 0.192363 3.150 0.00304 **
income:education -0.001237 0.003386 -0.365 0.71673
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.51 on 41 degrees of freedom
Multiple R-squared: 0.8287, Adjusted R-squared: 0.8162
F-statistic: 66.13 on 3 and 41 DF, p-value: 9.286e-16
> fit <- lm(prestige ~ (income:education))
> summary(fit)
Call:
lm(formula = prestige ~ (income:education))
Residuals:
Min 1Q Median 3Q Max
-30.559 -10.845 -1.661 5.826 49.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.728e+01 3.425e+00 5.044 8.75e-06 ***
income:education 1.120e-02 9.385e-04 11.934 3.10e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 15.35 on 43 degrees of freedom
Multiple R-squared: 0.7681, Adjusted R-squared: 0.7627
F-statistic: 142.4 on 1 and 43 DF, p-value: 3.102e-15 |