Data Cleaning

Load the packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytuesdayR)

Import the data

library(tidyverse)
library(tidytuesdayR)
# Download the TidyTuesday release published on 2024-05-21.
df <- tidytuesdayR::tt_load("2024-05-21")
---- Compiling #TidyTuesday Information for 2024-05-21 ----
--- There is 1 file available ---


── Downloading files ───────────────────────────────────────────────────────────

  1 of 1: "emissions.csv"
# Keep only the emissions table from the downloaded release.
df <- df$emissions

Explore the data

First few rows

# Preview the first rows of the raw data.
df |> head()
# A tibble: 6 × 7
   year parent_entity     parent_type commodity production_value production_unit
  <dbl> <chr>             <chr>       <chr>                <dbl> <chr>          
1  1962 Abu Dhabi Nation… State-owne… Oil & NGL            0.913 Million bbl/yr 
2  1962 Abu Dhabi Nation… State-owne… Natural …            1.84  Bcf/yr         
3  1963 Abu Dhabi Nation… State-owne… Oil & NGL            1.83  Million bbl/yr 
4  1963 Abu Dhabi Nation… State-owne… Natural …            4.42  Bcf/yr         
5  1964 Abu Dhabi Nation… State-owne… Oil & NGL            7.3   Million bbl/yr 
6  1964 Abu Dhabi Nation… State-owne… Natural …           17.3   Bcf/yr         
# ℹ 1 more variable: total_emissions_MtCO2e <dbl>

Column details

# List every column with its type and first few values.
df |> glimpse()
Rows: 12,551
Columns: 7
$ year                   <dbl> 1962, 1962, 1963, 1963, 1964, 1964, 1965, 1965,…
$ parent_entity          <chr> "Abu Dhabi National Oil Company", "Abu Dhabi Na…
$ parent_type            <chr> "State-owned Entity", "State-owned Entity", "St…
$ commodity              <chr> "Oil & NGL", "Natural Gas", "Oil & NGL", "Natur…
$ production_value       <dbl> 0.91250, 1.84325, 1.82500, 4.42380, 7.30000, 17…
$ production_unit        <chr> "Million bbl/yr", "Bcf/yr", "Million bbl/yr", "…
$ total_emissions_MtCO2e <dbl> 0.3638848, 0.1343552, 0.7277697, 0.3224525, 2.9…

Data Cleaning

Clean up column names

# Give the columns shorter, clearer names (new_name = old_name).
# `year` and `commodity` already have suitable names, so they are not listed.
df1 <- rename(
  df,
  company = parent_entity,
  owner_type = parent_type,
  quantity = production_value,
  unit = production_unit,
  emissions = total_emissions_MtCO2e
)

glimpse(df1)
Rows: 12,551
Columns: 7
$ year       <dbl> 1962, 1962, 1963, 1963, 1964, 1964, 1965, 1965, 1966, 1966,…
$ company    <chr> "Abu Dhabi National Oil Company", "Abu Dhabi National Oil C…
$ owner_type <chr> "State-owned Entity", "State-owned Entity", "State-owned En…
$ commodity  <chr> "Oil & NGL", "Natural Gas", "Oil & NGL", "Natural Gas", "Oi…
$ quantity   <dbl> 0.91250, 1.84325, 1.82500, 4.42380, 7.30000, 17.32655, 10.9…
$ unit       <chr> "Million bbl/yr", "Bcf/yr", "Million bbl/yr", "Bcf/yr", "Mi…
$ emissions  <dbl> 0.3638848, 0.1343552, 0.7277697, 0.3224525, 2.9110786, 1.26…

Select columns

# Drop the `unit` column; it is not used in the analysis.
# NOTE(review): units differ by commodity (e.g. "Million bbl/yr" vs "Bcf/yr"),
# so `quantity` values are not directly comparable across commodities.
df2 <- select(df1, !unit)

glimpse(df2)
Rows: 12,551
Columns: 6
$ year       <dbl> 1962, 1962, 1963, 1963, 1964, 1964, 1965, 1965, 1966, 1966,…
$ company    <chr> "Abu Dhabi National Oil Company", "Abu Dhabi National Oil C…
$ owner_type <chr> "State-owned Entity", "State-owned Entity", "State-owned En…
$ commodity  <chr> "Oil & NGL", "Natural Gas", "Oil & NGL", "Natural Gas", "Oi…
$ quantity   <dbl> 0.91250, 1.84325, 1.82500, 4.42380, 7.30000, 17.32655, 10.9…
$ emissions  <dbl> 0.3638848, 0.1343552, 0.7277697, 0.3224525, 2.9110786, 1.26…

Round quantity and emissions to two decimal places

# Round both measures to two decimal places so the values are easier to read.
df3 <- df2 |>
  mutate(across(c(quantity, emissions), \(x) round(x, 2)))

glimpse(df3)
Rows: 12,551
Columns: 6
$ year       <dbl> 1962, 1962, 1963, 1963, 1964, 1964, 1965, 1965, 1966, 1966,…
$ company    <chr> "Abu Dhabi National Oil Company", "Abu Dhabi National Oil C…
$ owner_type <chr> "State-owned Entity", "State-owned Entity", "State-owned En…
$ commodity  <chr> "Oil & NGL", "Natural Gas", "Oil & NGL", "Natural Gas", "Oi…
$ quantity   <dbl> 0.91, 1.84, 1.83, 4.42, 7.30, 17.33, 10.95, 25.07, 13.50, 2…
$ emissions  <dbl> 0.36, 0.13, 0.73, 0.32, 2.91, 1.26, 4.37, 1.83, 5.39, 2.18,…

String all the cleaning steps together

# Full cleaning pipeline, written as a single chain from the raw data:
#   1. drop the production unit column (not used in the analysis),
#   2. rename the remaining columns to shorter names (new_name = old_name),
#   3. round quantity and emissions to two decimal places.
# Rounding is done inside the pipe so that every cleaning step is part of the
# same expression, rather than being applied afterwards with `$<-`.
df_cleaned <- df |>
  select(!production_unit) |>
  rename(
    company = parent_entity,
    owner_type = parent_type,
    quantity = production_value,
    emissions = total_emissions_MtCO2e
  ) |>
  mutate(across(c(quantity, emissions), \(x) round(x, 2)))

glimpse(df_cleaned)
Rows: 12,551
Columns: 6
$ year       <dbl> 1962, 1962, 1963, 1963, 1964, 1964, 1965, 1965, 1966, 1966,…
$ company    <chr> "Abu Dhabi National Oil Company", "Abu Dhabi National Oil C…
$ owner_type <chr> "State-owned Entity", "State-owned Entity", "State-owned En…
$ commodity  <chr> "Oil & NGL", "Natural Gas", "Oil & NGL", "Natural Gas", "Oi…
$ quantity   <dbl> 0.91, 1.84, 1.83, 4.42, 7.30, 17.33, 10.95, 25.07, 13.50, 2…
$ emissions  <dbl> 0.36, 0.13, 0.73, 0.32, 2.91, 1.26, 4.37, 1.83, 5.39, 2.18,…

Save the cleaned data

# Save the cleaned data for later steps. Create the output directory first,
# because save() fails when the target directory does not exist.
if (!dir.exists("data")) {
  dir.create("data", recursive = TRUE)
}
save(df_cleaned, file = "data/df_cleaned.RData")