library(rio) # Load the package for future use
library(dplyr) # Load the package for future use
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) # Load the package for future use
library(readr) # Load the package for future use
linkGit= "https://github.com/FundamentalsRudraksh/final_hw_part2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx" # Get the link from GITHUB
HDI_raw=rio::import(file = linkGit, col_names = FALSE) # Creates an object that stores the data
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
new_names <- paste(as.character(HDI_raw[3, ]), as.character(HDI_raw[4, ]), 
                   as.character(HDI_raw[5, ]), as.character(HDI_raw[6, ]),
                   as.character(HDI_raw[7, ]), sep = " ") # Combines rows 3,4,5,6,7 into a single row as they all are part of the column header/name

names(HDI_raw) <- make.names(trimws(new_names), unique = TRUE) # Sets the combined column name from above as the new column names

HDI_clean <- HDI_raw[-c(1:8), ] # Deletes rows 1 to 8 as they are not necessary

names(HDI_clean) <- gsub("(^NA|NA$|NA\\.|\\.NA|\\.NA\\.)", "", names(HDI_clean)) # Removes any kind of NA from column names to simplify them 

HDI_clean[c(" ", "1", "a", "a 1", "2", "b")] <- NULL # Drops these columns which were created due to merging the rows earlier as they do not contain any useful info are are mostly NA

HDI_clean[, 4] <- NULL # Drops columns 4 which was just NA with no column name

names(HDI_clean) <- gsub("^SDG[0-9.]+", "", names(HDI_clean)) # Removes SDG3, SDG4.3 etc from column names as they are of no use 

names(HDI_clean) <- gsub("\\.", " ", names(HDI_clean)) # Replaces any . with spaces to ensure proper formatting and readability for the column names

names(HDI_clean) <- gsub(" +", " ", names(HDI_clean)) # Combines multiple spaces into one space to ensure proper formatting and readability for the column names

names(HDI_clean) # Shows the names of the columns for checking
##  [1] "HDI rank"                                          
##  [2] "Country"                                           
##  [3] "Human Development Index HDI Value 2023"            
##  [4] "Life expectancy at birth years 2023"               
##  [5] "Expected years of schooling years 2023"            
##  [6] "Mean years of schooling years 2023"                
##  [7] "a 1"                                               
##  [8] "Gross national income GNI per capita 2021 PPP 2023"
##  [9] "GNI per capita rank minus HDI rank2023"            
## [10] "HDI rank2022"
str(HDI_clean) # Shows the type and some entries for each column
## 'data.frame':    271 obs. of  10 variables:
##  $ HDI rank                                          : chr  "1" "2" "2" "4" ...
##  $ Country                                           : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ Human Development Index HDI Value 2023            : chr  "0.97199999999999998" "0.97" "0.97" "0.96199999999999997" ...
##  $ Life expectancy at birth years 2023               : chr  "82.691000000000003" "83.308000000000007" "83.953999999999994" "81.933000000000007" ...
##  $ Expected years of schooling years 2023            : chr  "18.850589750000001" "18.792850489999999" "16.667530060000001" "18.704010010000001" ...
##  $ Mean years of schooling years 2023                : chr  "13.908926279999999" "13.117962179999999" "13.94912109" "13.027320599999999" ...
##  $ a 1                                               : chr  "d" "e" "e" "e" ...
##  $ Gross national income GNI per capita 2021 PPP 2023: chr  "69116.937359999996" "112710.0211" "81948.901769999997" "76007.856690000001" ...
##  $ GNI per capita rank minus HDI rank2023            : chr  "12" "0" "5" "4" ...
##  $ HDI rank2022                                      : chr  "3" "1" "2" "4" ...
HDI_clean <- HDI_clean[!is.na(HDI_clean$`HDI rank`), ] # Keeps countries with HDI rank
HDI_final <- na.omit(HDI_clean) # Removes countries with NA or missing values
HDI_final[-which(names(HDI_final) == "Country")] <- 
  lapply(HDI_final[-which(names(HDI_final) == "Country")], function(x) readr::parse_number(as.character(x))) # Keeps Country column as a char and changes the data type for all the other columns to numeric
## Warning: 128 parsing failures.
## row col expected actual
##   1  -- a number      d
##   2  -- a number      e
##   3  -- a number      e
##   4  -- a number      e
##   5  -- a number      e
## ... ... ........ ......
## See problems(...) for more details.
## Warning: 1 parsing failure.
## row col expected actual
## 127  -- a number     ..
str(HDI_final) # Shows the data type and initial values for the columns
## 'data.frame':    128 obs. of  10 variables:
##  $ HDI rank                                          : num  1 2 2 4 5 5 8 10 11 12 ...
##  $ Country                                           : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ Human Development Index HDI Value 2023            : num  0.972 0.97 0.97 0.962 0.959 0.959 0.955 0.951 0.949 0.948 ...
##  $ Life expectancy at birth years 2023               : num  82.7 83.3 84 81.9 81.4 ...
##  $ Expected years of schooling years 2023            : num  18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean years of schooling years 2023                : num  13.9 13.1 13.9 13 14.3 ...
##  $ a 1                                               : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "problems")= tibble [128 × 4] (S3: tbl_df/tbl/data.frame)
##   .. ..$ row     : int [1:128] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ col     : int [1:128] NA NA NA NA NA NA NA NA NA NA ...
##   .. ..$ expected: chr [1:128] "a number" "a number" "a number" "a number" ...
##   .. ..$ actual  : chr [1:128] "d" "e" "e" "e" ...
##  $ Gross national income GNI per capita 2021 PPP 2023: num  69117 112710 81949 76008 64053 ...
##  $ GNI per capita rank minus HDI rank2023            : num  12 0 5 4 13 10 6 9 -6 10 ...
##  $ HDI rank2022                                      : num  3 1 2 4 6 4 7 13 10 11 ...
##   ..- attr(*, "problems")= tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
##   .. ..$ row     : int 127
##   .. ..$ col     : int NA
##   .. ..$ expected: chr "a number"
##   .. ..$ actual  : chr ".."
##  - attr(*, "na.action")= 'omit' Named int [1:65] 7 8 13 14 15 16 19 31 32 38 ...
##   ..- attr(*, "names")= chr [1:65] "15" "16" "21" "22" ...
means <- colMeans( HDI_final[, c(
    "Life expectancy at birth years 2023",
    "Expected years of schooling years 2023",
    "Mean years of schooling years 2023",
    "Gross national income GNI per capita 2021 PPP 2023"
  )], na.rm = TRUE) # Calculates column wise mean for each of the following columns

means # Displays the mean for each column computed above
##                Life expectancy at birth years 2023 
##                                          73.393461 
##             Expected years of schooling years 2023 
##                                          13.765561 
##                 Mean years of schooling years 2023 
##                                           9.295992 
## Gross national income GNI per capita 2021 PPP 2023 
##                                       25167.360498
readr::write_csv(HDI_final, "Final_HDI.csv") # Creates a csv file with everything cleaned and updated for the original file