DataScience Classroomnotes 24/Feb/2022

Linear Regression with the well-known diamonds dataset (contd.)

  • Tidy Model approach

Loading libraries

library(ggplot2)
library(tidyverse)
library(tidymodels)
library(skimr)
library(corrr)
diamonds
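
skimr is loaded above but not used later in these notes; if a quick column-by-column overview of the dataset is wanted, it can be produced like this (a minimal sketch):

skimr::skim(diamonds)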

Let's start by working only with the numerical variables

diamonds %>%
  select(where(~ !is.factor(.x))) %>%
  corrr::correlate() %>%
  corrr::fashion()

Let's build a correlation network plot

diamonds %>%
  select(where(~ !is.factor(.x))) %>%
  corrr::correlate() %>%
  corrr::network_plot()

Finding the correlation between the other variables and price

diamonds %>%
  select(where(~ !is.factor(.x))) %>%
  corrr::correlate() %>%
  corrr::focus(price) %>%
  arrange(desc(price))

Fetch variable names with correlation greater than 0.5

high_corr_variables <- diamonds %>%
  select(where(~ !is.factor(.x))) %>%
  corrr::correlate() %>%
  corrr::focus(price) %>%
  arrange(desc(price)) %>%
  filter(price > 0.5) %>%
  pull(term)

high_corr_variables

Let's get the data with only these variables and price

data <- diamonds %>%
  select(all_of(high_corr_variables), price)

data

Split the training and testing data

data_split <- initial_split(data, strata = price)
data_training <- training(data_split)
data_testing <- testing(data_split)
data_training
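
By default initial_split() keeps about three quarters of the rows for training; the actual proportions can be verified quickly (assuming the default prop = 3/4):

nrow(data_training) / nrow(data)
nrow(data_testing) / nrow(data)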

Building a model using lm

model <- lm(price ~ ., data=data_training)

Summary of the model

summary(model)

Summary using broom

broom::tidy(model)
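
broom::tidy() returns the coefficient table; for the model-level statistics (R-squared, sigma, AIC and so on) in a single row, broom::glance() can be used as well:

broom::glance(model)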

Consider all the variables of diamonds

complete_data <- diamonds
complete_data_split <- initial_split(complete_data, strata = price)
complete_data_training <- training(complete_data_split)
complete_data_testing <- testing(complete_data_split)
complete_model <- lm(price ~ ., data=complete_data_training)
summary(complete_model)

broom::tidy(complete_model)

The tidymodels approach

Split the data

tidy_data_split <- initial_split(diamonds, strata = price)
tidy_data_train <- training(tidy_data_split)
tidy_data_test <- testing(tidy_data_split)

Data Preprocessing

data_rec <- recipe(price ~ ., data = tidy_data_train) %>%
  step_dummy(all_nominal()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  prep()

juiced <- juice(data_rec)
juiced
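
In newer versions of recipes, juice() is superseded; the same preprocessed training data can also be obtained from bake() by passing new_data = NULL (an equivalent call, shown for reference):

bake(data_rec, new_data = NULL)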

Create a model using lm

lm_model <- parsnip::linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

Fitting the model

lm_fit <- parsnip::fit(lm_model, price ~ ., juiced)
lm_fit
glance(lm_fit)
broom::tidy(lm_fit)
results_train <- lm_fit %>%
  predict(new_data = juiced) %>%
  mutate(truth = tidy_data_train$price)

results_test <- lm_fit %>%
  predict(new_data = bake(data_rec, tidy_data_test)) %>%
  mutate(truth = tidy_data_test$price)

results <- results_train %>%
  mutate(type = "train") %>%
  bind_rows(
    results_test %>%
      mutate(type = "test")
  )
results
results %>%
  group_by(type) %>%
  yardstick::rmse(truth, .pred)
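
If more than RMSE is of interest, yardstick's metric_set() can bundle several regression metrics and evaluate them in one call (a sketch using rmse, rsq and mae):

reg_metrics <- yardstick::metric_set(rmse, rsq, mae)
results %>%
  group_by(type) %>%
  reg_metrics(truth, .pred)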

Prediction vs truth

ggplot(results, aes(truth, .pred)) +
  geom_point() +
  geom_abline(color="red", size=2) +
  coord_fixed()
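
To see the train and test predictions in separate panels, the same plot can be faceted on type (an optional variation of the plot above):

ggplot(results, aes(truth, .pred)) +
  geom_point(alpha = 0.3) +
  geom_abline(color = "red", size = 2) +
  facet_wrap(~ type) +
  coord_fixed()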
