The goal of this project is to write functions that manipulate and process census data sets. It also defines methods for the generic plot() function so that the returned data can be plotted automatically.
# Keep the area name, the STCOU code and every column whose name ends in "D";
# Area_name is renamed to area_name as part of the selection.
censusData <- select(rawData, area_name = Area_name, STCOU, ends_with("D"))
censusData
## # A tibble: 3,198 × 12
## area_…¹ STCOU EDU01…² EDU01…³ EDU01…⁴ EDU01…⁵ EDU01…⁶ EDU01…⁷ EDU01…⁸ EDU01…⁹
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 UNITED… 00000 4.00e7 4.00e7 4.03e7 4.07e7 4.14e7 4.21e7 4.27e7 4.34e7
## 2 ALABAMA 01000 7.34e5 7.28e5 7.30e5 7.28e5 7.26e5 7.26e5 7.28e5 7.31e5
## 3 Autaug… 01001 6.83e3 6.9 e3 6.92e3 6.85e3 7.01e3 7.14e3 7.15e3 7.38e3
## 4 Baldwi… 01003 1.64e4 1.65e4 1.68e4 1.71e4 1.75e4 1.80e4 1.87e4 1.94e4
## 5 Barbou… 01005 5.07e3 5.10e3 5.07e3 5.16e3 5.17e3 5.25e3 5.14e3 5.11e3
## 6 Bibb, … 01007 3.56e3 3.51e3 3.57e3 3.62e3 3.65e3 3.56e3 3.56e3 3.55e3
## 7 Blount… 01009 7.32e3 7.22e3 7.20e3 7.21e3 7.16e3 7.15e3 7.21e3 7.3 e3
## 8 Bulloc… 01011 2.01e3 1.98e3 1.98e3 1.98e3 1.98e3 2.02e3 2.01e3 2.01e3
## 9 Butler… 01013 4.64e3 4.58e3 4.61e3 4.59e3 4.54e3 4.48e3 4.44e3 4.35e3
## 10 Calhou… 01015 2.09e4 2.09e4 2.09e4 2.08e4 2.08e4 2.05e4 2.05e4 2.01e4
## # … with 3,188 more rows, 2 more variables: EDU010195D <dbl>, EDU010196D <dbl>,
## # and abbreviated variable names ¹area_name, ²EDU010187D, ³EDU010188D,
## # ⁴EDU010189D, ⁵EDU010190D, ⁶EDU010191D, ⁷EDU010192D, ⁸EDU010193D,
## # ⁹EDU010194D
# Reshape to long form: columns 3 to 12 collapse into item_ID / US_total pairs.
longCensusData <- pivot_longer(
  censusData,
  cols = 3:12,
  names_to = "item_ID",
  values_to = "US_total"
)
longCensusData
## # A tibble: 31,980 × 4
## area_name STCOU item_ID US_total
## <chr> <chr> <chr> <dbl>
## 1 UNITED STATES 00000 EDU010187D 40024299
## 2 UNITED STATES 00000 EDU010188D 39967624
## 3 UNITED STATES 00000 EDU010189D 40317775
## 4 UNITED STATES 00000 EDU010190D 40737600
## 5 UNITED STATES 00000 EDU010191D 41385442
## 6 UNITED STATES 00000 EDU010192D 42088151
## 7 UNITED STATES 00000 EDU010193D 42724710
## 8 UNITED STATES 00000 EDU010194D 43369917
## 9 UNITED STATES 00000 EDU010195D 43993459
## 10 UNITED STATES 00000 EDU010196D 44715737
## # … with 31,970 more rows
# Characters 8-9 of item_ID hold a two-digit year. Convert it arithmetically
# instead of going through as.Date(.., "%y"): that route fills the missing
# month and day from today's date, so the result depended on when the code ran.
# The century pivot matches "%y": 00-68 -> 2000-2068, 69-99 -> 1969-1999.
# Non-numeric suffixes become NA, as they did before.
longCensusData$year <- suppressWarnings(
  as.integer(substr(longCensusData$item_ID, 8, 9))
)
longCensusData$year <- 1900 + longCensusData$year +
  100 * (longCensusData$year <= 68L)
# Characters 1-7 of item_ID identify the measurement.
longCensusData$measurement <- substr(longCensusData$item_ID, 1, 7)
longCensusData
## # A tibble: 31,980 × 6
## area_name STCOU item_ID US_total year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101
## # … with 31,970 more rows
# Rows whose area_name contains ", XX" (comma, space, two word characters)
# are treated as counties; everything else is non-county data.
county <- grep(pattern = ", \\w\\w", longCensusData$area_name)
county_data <- longCensusData[county, ]
# Select the complement by positive index: `x[-county, ]` would return zero
# rows when `county` is empty, because -integer(0) selects nothing.
non_county_data <- longCensusData[
  setdiff(seq_len(nrow(longCensusData)), county),
]
# Prepend an S3 class to each subset.
class(county_data) <- c("county", class(county_data))
class(non_county_data) <- c("state", class(non_county_data))
county_data
## # A tibble: 31,450 × 6
## area_name STCOU item_ID US_total year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101
## # … with 31,440 more rows
# Show the class vector of the county subset.
class(county_data)
## [1] "county" "tbl_df" "tbl" "data.frame"
# Print the non-county subset.
non_county_data
## # A tibble: 530 × 6
## area_name STCOU item_ID US_total year measurement
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101
## # … with 520 more rows
# Show the class vector of the non-county subset.
class(non_county_data)
## [1] "state" "tbl_df" "tbl" "data.frame"
5. The county tibble has a single variable, area_name, that holds both the county and the state. To identify the state of every county, we add a new variable containing the last two characters of area_name (the state abbreviation), extracted with str_sub().
# The state abbreviation is the last two characters of area_name.
county_data[["state"]] <- str_sub(county_data[["area_name"]], start = -2)
county_data
## # A tibble: 31,450 × 7
## area_name STCOU item_ID US_total year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 31,440 more rows
# Member states of each division.
New_England <- c(
  "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island",
  "Vermont"
)
Mid_Atlantic <- c("New Jersey", "New York", "Pennsylvania")
East_North_Central <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
West_North_Central <- c(
  "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota",
  "South Dakota"
)
South_Atlantic <- c(
  "Delaware", "Florida", "Georgia", "Maryland", "North Carolina",
  "South Carolina", "Virginia", "DISTRICT OF COLUMBIA", "West Virginia"
)
East_South_Central <- c("Alabama", "Kentucky", "Mississippi", "Tennessee")
West_South_Central <- c("Arkansas", "Louisiana", "Oklahoma", "Texas")
Pacific <- c("Alaska", "California", "Hawaii", "Oregon", "Washington")
Mountain <- c(
  "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah",
  "Wyoming"
)

# Case-insensitive lookup of each area_name; rows that match no division
# (for example "UNITED STATES") are labelled "ERROR".
non_county_data <- non_county_data %>%
  mutate(
    division = case_when(
      tolower(area_name) %in% tolower(New_England) ~ "New England",
      tolower(area_name) %in% tolower(Mid_Atlantic) ~ "Mid Atlantic",
      tolower(area_name) %in% tolower(East_North_Central) ~ "East North Central",
      tolower(area_name) %in% tolower(West_North_Central) ~ "West North Central",
      tolower(area_name) %in% tolower(South_Atlantic) ~ "South Atlantic",
      tolower(area_name) %in% tolower(East_South_Central) ~ "East South Central",
      tolower(area_name) %in% tolower(West_South_Central) ~ "West South Central",
      tolower(area_name) %in% tolower(Pacific) ~ "Pacific",
      tolower(area_name) %in% tolower(Mountain) ~ "Mountain",
      TRUE ~ "ERROR"
    )
  )
non_county_data
## # A tibble: 530 × 7
## area_name STCOU item_ID US_total year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 520 more rows
#' Select the relevant columns of a raw sheet and reshape it to long form
#'
#' Keeps `Area_name` (renamed to `area_name`), `STCOU` and every column whose
#' name ends in "D", then pivots those "D" columns into `item_ID` / value
#' pairs.
#'
#' @param sheet2 A data frame with `Area_name`, `STCOU` and "D"-suffixed
#'   columns.
#' @param colName Name of the value column created by the pivot.
#' @return The long-form tibble.
function_step_1_2 <- function(sheet2, colName = "enrollment_value") {
  # The original chain ended in a dangling `%>%`, which piped the result into
  # `return(censusDataSecond)`: a two-argument return() that also referenced
  # an object not yet assigned. Here the pipeline value is returned directly.
  sheet2 %>%
    select(Area_name, STCOU, ends_with("D")) %>%
    rename("area_name" = Area_name) %>%
    # Pivot everything except the two id columns rather than the hard-coded
    # positions 3:12, so sheets with a different number of "D" columns work.
    pivot_longer(
      cols = -c(area_name, STCOU),
      names_to = "item_ID",
      values_to = colName
    )
}
#' Derive `year` and `measurement` from `item_ID`
#'
#' Characters 8-9 of `item_ID` are read as a two-digit year and characters 1-7
#' as the measurement code.
#'
#' @param censusDataSecond A data frame with a character `item_ID` column.
#' @return `censusDataSecond` with a numeric `year` column and a character
#'   `measurement` column added.
function_step_3 <- function(censusDataSecond) {
  # Convert the two-digit year arithmetically. The previous
  # as.Date(.., "%y") route filled the missing month and day from today's
  # date, so the result depended on when the code ran. The century pivot
  # matches "%y": 00-68 -> 2000-2068, 69-99 -> 1969-1999. Non-numeric
  # suffixes become NA, as they did before.
  two_digit_year <- suppressWarnings(
    as.integer(substr(censusDataSecond$item_ID, 8, 9))
  )
  censusDataSecond$year <- 1900 + two_digit_year + 100 * (two_digit_year <= 68L)
  censusDataSecond$measurement <- substr(censusDataSecond$item_ID, 1, 7)
  censusDataSecond
}
#' Add a `state` column to county data
#'
#' @param county_data A data frame with an `area_name` column.
#' @return `county_data` with `state` set to the last two characters of
#'   `area_name`.
function_step_5 <- function(county_data) {
  county_data[["state"]] <- str_sub(county_data[["area_name"]], start = -2)
  county_data
}
#' Add a `division` column to non-county data
#'
#' Each `area_name` is matched case-insensitively against the member states of
#' nine divisions; rows that match none of them get the label "ERROR".
#'
#' @param non_county_data A data frame with an `area_name` column.
#' @return `non_county_data` with a character `division` column added.
function_step_6 <- function(non_county_data) {
  divisions <- list(
    "New England" = c(
      "Connecticut", "Maine", "Massachusetts", "New Hampshire",
      "Rhode Island", "Vermont"
    ),
    "Mid Atlantic" = c("New Jersey", "New York", "Pennsylvania"),
    "East North Central" = c(
      "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin"
    ),
    "West North Central" = c(
      "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota",
      "South Dakota"
    ),
    "South Atlantic" = c(
      "Delaware", "Florida", "Georgia", "Maryland", "North Carolina",
      "South Carolina", "Virginia", "DISTRICT OF COLUMBIA", "West Virginia"
    ),
    "East South Central" = c("Alabama", "Kentucky", "Mississippi", "Tennessee"),
    "West South Central" = c("Arkansas", "Louisiana", "Oklahoma", "Texas"),
    "Pacific" = c("Alaska", "California", "Hawaii", "Oregon", "Washington"),
    "Mountain" = c(
      "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico",
      "Utah", "Wyoming"
    )
  )

  # Flatten the list into parallel vectors: lower-cased state name -> label.
  member_states <- tolower(unlist(divisions, use.names = FALSE))
  division_labels <- rep(names(divisions), times = lengths(divisions))

  row_match <- match(tolower(non_county_data$area_name), member_states)
  division_of_row <- division_labels[row_match]
  division_of_row[is.na(row_match)] <- "ERROR"

  non_county_data %>%
    mutate(division = division_of_row)
}
#' Split long-form data into county and non-county parts and annotate each
#'
#' Rows whose `area_name` contains ", XX" (comma, space, two word characters)
#' are treated as counties. The county part gets class "county" and is passed
#' to `function_step_5()`; the remainder gets class "state" and is passed to
#' `function_step_6()`.
#'
#' @param censusDataSecond A data frame with an `area_name` column.
#' @return An unnamed list: element 1 is the county data, element 2 the
#'   non-county data.
function_Step_4_5_6 <- function(censusDataSecond) {
  # A logical mask replaces grep() indices: with no county rows,
  # `x[-integer(0), ]` would return zero rows and drop all non-county data.
  is_county <- grepl(pattern = ", \\w\\w", censusDataSecond$area_name)
  county_data <- censusDataSecond[is_county, ]
  non_county_data <- censusDataSecond[!is_county, ]

  class(county_data) <- c("county", class(county_data))
  class(non_county_data) <- c("state", class(non_county_data))

  list(function_step_5(county_data), function_step_6(non_county_data))
}
#' Read one sheet and run it through every processing step
#'
#' Calls `read_csv_function()`, `function_step_1_2()`, `function_step_3()` and
#' `function_Step_4_5_6()` in that order.
#'
#' @param url Location passed to `read_csv_function()`.
#' @param colName Name of the value column, passed to `function_step_1_2()`.
#' @return The value returned by `function_Step_4_5_6()`.
my_Wrapper_function <- function(url, colName = "US_total_enrollment") {
  raw_sheet <- read_csv_function(url)
  long_sheet <- function_step_1_2(raw_sheet, colName)
  parsed_sheet <- function_step_3(long_sheet)
  function_Step_4_5_6(parsed_sheet)
}
# Run the full pipeline on the EDU01b sheet and show the resulting list.
wrapperOutput <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv",
  colName = "US_total_enrollment"
)
wrapperOutput
## [[1]]
## # A tibble: 31,450 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010197D 8099 1997 EDU0101 AL
## 2 Autauga, AL 01001 EDU010198D 8211 1998 EDU0101 AL
## 3 Autauga, AL 01001 EDU010199D 8489 1999 EDU0101 AL
## 4 Autauga, AL 01001 EDU010200D 8912 2000 EDU0102 AL
## 5 Autauga, AL 01001 EDU010201D 8626 2001 EDU0102 AL
## 6 Autauga, AL 01001 EDU010202D 8762 2002 EDU0102 AL
## 7 Autauga, AL 01001 EDU015203D 9105 2003 EDU0152 AL
## 8 Autauga, AL 01001 EDU015204D 9200 2004 EDU0152 AL
## 9 Autauga, AL 01001 EDU015205D 9559 2005 EDU0152 AL
## 10 Autauga, AL 01001 EDU015206D 9652 2006 EDU0152 AL
## # … with 31,440 more rows
##
## [[2]]
## # A tibble: 530 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010197D 44534459 1997 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010198D 46245814 1998 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010199D 46368903 1999 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010200D 46818690 2000 EDU0102 ERROR
## 5 UNITED STATES 00000 EDU010201D 47127066 2001 EDU0102 ERROR
## 6 UNITED STATES 00000 EDU010202D 47606570 2002 EDU0102 ERROR
## 7 UNITED STATES 00000 EDU015203D 48506317 2003 EDU0152 ERROR
## 8 UNITED STATES 00000 EDU015204D 48693287 2004 EDU0152 ERROR
## 9 UNITED STATES 00000 EDU015205D 48978555 2005 EDU0152 ERROR
## 10 UNITED STATES 00000 EDU015206D 49140702 2006 EDU0152 ERROR
## # … with 520 more rows
The code chunk below calls my_Wrapper_function() to read and parse the two different files.
# Parse both EDU01 sheets with the shared wrapper.
fileOneData <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv",
  colName = "US_total_enrollment"
)
fileTwoData <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv",
  colName = "US_total_enrollment"
)
The following function combines the two data sets created above: the county tibbles are stacked together, and so are the non-county tibbles.
#' Stack two wrapper results row-wise
#'
#' @param fileOneData,fileTwoData Two-element lists whose first element is
#'   county data and whose second element is non-county data.
#' @return An unnamed list: element 1 is the combined county data, element 2
#'   the combined non-county data.
combineFunction <- function(fileOneData, fileTwoData) {
  combine_county_data <- bind_rows(fileOneData[[1]], fileTwoData[[1]])
  # Use `[[2]]` on both inputs. The original took `fileOneData[2]`, a
  # one-element list rather than the data frame itself, unlike every other
  # access in this function.
  combine_non_county_data <- bind_rows(fileOneData[[2]], fileTwoData[[2]])
  list(combine_county_data, combine_non_county_data)
}
# Combine the two parsed sheets and show the result.
combineDf <- combineFunction(
  fileOneData = fileOneData,
  fileTwoData = fileTwoData
)
combineDf
## [[1]]
## # A tibble: 62,900 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 62,890 more rows
##
## [[2]]
## # A tibble: 1,060 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 1,050 more rows
Given below is a chunk of code that creates the plot function for state data. It shows the mean enrollment value of each division across the years.
#' Plot the yearly mean of a variable for each division
#'
#' Rows whose `division` is "ERROR" are excluded. The remaining rows are
#' averaged by `year` and `division` and drawn as one line per division.
#'
#' @param df A data frame with `year` and `division` columns and the column
#'   named by `var_name`.
#' @param var_name Name (string) of the numeric column to average.
#' @param ... Unused; accepted so the signature is compatible with the
#'   `plot()` generic.
#' @return A ggplot object.
plot.state <- function(df, var_name = "US_total_enrollment", ...) {
  division_means <- df %>%
    # Dropping the "ERROR" rows first gives the same result as filtering the
    # summary, since `division` is a grouping variable.
    filter(division != "ERROR") %>%
    group_by(year, division) %>%
    # `.data[[var_name]]` can only resolve to a column of the data, unlike
    # get(), which could also find an unrelated object of the same name.
    # `.groups = "drop"` returns an ungrouped tibble without the message.
    summarise(mean_value = mean(.data[[var_name]]), .groups = "drop")

  ggplot(division_means, aes(x = year, y = mean_value, color = division)) +
    geom_line() +
    labs(x = "Year", y = "Enrollment_Mean")
}
Given below is a chunk of code that creates the plot function for county data. It takes arguments that specify the state, whether to show the counties with the highest ("top") or lowest ("bottom") mean values, and how many counties to include (5 by default).
#' Plot a variable over time for the top or bottom counties of one state
#'
#' Counties of `stateVar` are ranked by the mean of `var_name`. The
#' `numberOfEntries` counties with the highest ("top") or lowest ("bottom")
#' means are kept and drawn as one line per county.
#'
#' @param df A data frame with `state`, `area_name` and `year` columns and the
#'   column named by `var_name`.
#' @param stateVar State abbreviation to filter on.
#' @param var_name Name (string) of the numeric column to plot.
#' @param topBottomVar Either "top" or "bottom". Any other value is now an
#'   error; previously it silently plotted every county in the state.
#' @param numberOfEntries Number of counties to keep.
#' @param ... Unused; accepted so the signature is compatible with the
#'   `plot()` generic.
#' @return A ggplot object.
plot.county <- function(df, stateVar = "AL", var_name = "US_total_enrollment",
                        topBottomVar = "top", numberOfEntries = 5, ...) {
  topBottomVar <- match.arg(topBottomVar, c("top", "bottom"))

  # `.data[[var_name]]` can only resolve to a column of the data, unlike
  # get(), which could also find an unrelated object of the same name.
  county_means <- df %>%
    filter(state == stateVar) %>%
    group_by(area_name) %>%
    summarise(mean_value = mean(.data[[var_name]]), .groups = "drop")

  ranked <- if (topBottomVar == "top") {
    arrange(county_means, desc(mean_value))
  } else {
    arrange(county_means, mean_value)
  }
  selected <- head(ranked, n = numberOfEntries)

  # Go back to the yearly rows of the selected counties for plotting.
  final_df <- df %>%
    filter(area_name %in% selected$area_name)

  ggplot(
    final_df,
    aes(x = year, y = .data[[var_name]], color = area_name)
  ) +
    geom_line() +
    labs(x = "Year", y = "Enrollment_Count")
}
The chunk of code below reads the data from two .csv files, processes each with my_Wrapper_function(), and then combines the results using combineFunction().
# Parse the EDU01a sheet and show its county data.
firstDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv",
  colName = "US_total_enrollment"
)
firstDF[[1]]
## # A tibble: 31,450 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 31,440 more rows
# Show the non-county data of the first sheet.
firstDF[[2]]
## # A tibble: 530 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 520 more rows
# Parse the EDU01b sheet and show its county data.
secondDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv",
  colName = "US_total_enrollment"
)
secondDF[[1]]
## # A tibble: 31,450 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010197D 8099 1997 EDU0101 AL
## 2 Autauga, AL 01001 EDU010198D 8211 1998 EDU0101 AL
## 3 Autauga, AL 01001 EDU010199D 8489 1999 EDU0101 AL
## 4 Autauga, AL 01001 EDU010200D 8912 2000 EDU0102 AL
## 5 Autauga, AL 01001 EDU010201D 8626 2001 EDU0102 AL
## 6 Autauga, AL 01001 EDU010202D 8762 2002 EDU0102 AL
## 7 Autauga, AL 01001 EDU015203D 9105 2003 EDU0152 AL
## 8 Autauga, AL 01001 EDU015204D 9200 2004 EDU0152 AL
## 9 Autauga, AL 01001 EDU015205D 9559 2005 EDU0152 AL
## 10 Autauga, AL 01001 EDU015206D 9652 2006 EDU0152 AL
## # … with 31,440 more rows
# Show the non-county data of the second sheet.
secondDF[[2]]
## # A tibble: 530 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010197D 44534459 1997 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010198D 46245814 1998 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010199D 46368903 1999 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010200D 46818690 2000 EDU0102 ERROR
## 5 UNITED STATES 00000 EDU010201D 47127066 2001 EDU0102 ERROR
## 6 UNITED STATES 00000 EDU010202D 47606570 2002 EDU0102 ERROR
## 7 UNITED STATES 00000 EDU015203D 48506317 2003 EDU0152 ERROR
## 8 UNITED STATES 00000 EDU015204D 48693287 2004 EDU0152 ERROR
## 9 UNITED STATES 00000 EDU015205D 48978555 2005 EDU0152 ERROR
## 10 UNITED STATES 00000 EDU015206D 49140702 2006 EDU0152 ERROR
## # … with 520 more rows
# Combine both parsed sheets, then pull out the county (1st) and
# non-county (2nd) elements of the result.
combinedDF_1_2 <- combineFunction(
  fileOneData = firstDF,
  fileTwoData = secondDF
)
countyDF_1_2 <- combinedDF_1_2[[1L]]
stateDF_1_2 <- combinedDF_1_2[[2L]]
countyDF_1_2
## # A tibble: 62,900 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 1987 EDU0101 AL
## 2 Autauga, AL 01001 EDU010188D 6900 1988 EDU0101 AL
## 3 Autauga, AL 01001 EDU010189D 6920 1989 EDU0101 AL
## 4 Autauga, AL 01001 EDU010190D 6847 1990 EDU0101 AL
## 5 Autauga, AL 01001 EDU010191D 7008 1991 EDU0101 AL
## 6 Autauga, AL 01001 EDU010192D 7137 1992 EDU0101 AL
## 7 Autauga, AL 01001 EDU010193D 7152 1993 EDU0101 AL
## 8 Autauga, AL 01001 EDU010194D 7381 1994 EDU0101 AL
## 9 Autauga, AL 01001 EDU010195D 7568 1995 EDU0101 AL
## 10 Autauga, AL 01001 EDU010196D 7834 1996 EDU0101 AL
## # … with 62,890 more rows
# Print the combined non-county data.
stateDF_1_2
## # A tibble: 1,060 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 EDU010187D 40024299 1987 EDU0101 ERROR
## 2 UNITED STATES 00000 EDU010188D 39967624 1988 EDU0101 ERROR
## 3 UNITED STATES 00000 EDU010189D 40317775 1989 EDU0101 ERROR
## 4 UNITED STATES 00000 EDU010190D 40737600 1990 EDU0101 ERROR
## 5 UNITED STATES 00000 EDU010191D 41385442 1991 EDU0101 ERROR
## 6 UNITED STATES 00000 EDU010192D 42088151 1992 EDU0101 ERROR
## 7 UNITED STATES 00000 EDU010193D 42724710 1993 EDU0101 ERROR
## 8 UNITED STATES 00000 EDU010194D 43369917 1994 EDU0101 ERROR
## 9 UNITED STATES 00000 EDU010195D 43993459 1995 EDU0101 ERROR
## 10 UNITED STATES 00000 EDU010196D 44715737 1996 EDU0101 ERROR
## # … with 1,050 more rows
The chunk of code below plots the state data frame and then plots the county data frame under several conditions: PA (top 7 and bottom 4), the default arguments, and MN (top 10). The plots show how the enrollment values vary by year, which can be used to study state-wise variation.
# State-level plot, followed by county-level plots for PA (top 7, bottom 4),
# the default arguments, and MN (top 10).
plot(stateDF_1_2, var_name = "US_total_enrollment")
plot(
  countyDF_1_2,
  stateVar = "PA",
  var_name = "US_total_enrollment",
  topBottomVar = "top",
  numberOfEntries = 7
)
plot(
  countyDF_1_2,
  stateVar = "PA",
  var_name = "US_total_enrollment",
  topBottomVar = "bottom",
  numberOfEntries = 4
)
plot(countyDF_1_2)
plot(
  countyDF_1_2,
  stateVar = "MN",
  var_name = "US_total_enrollment",
  topBottomVar = "top",
  numberOfEntries = 10
)
The below chunk of code will read and process the data from 4 csv files using my_Wrapper_function() and will combine all the 4 tibbles using combineFunction.
# Parse the four PST01 sheets (a-d) with the shared wrapper.
thirdDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv",
  colName = "US_total_enrollment"
)
fourthDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv",
  colName = "US_total_enrollment"
)
fifthDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv",
  colName = "US_total_enrollment"
)
SixthDF <- my_Wrapper_function(
  url = "https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv",
  colName = "US_total_enrollment"
)

# Combine pairwise, then combine the two pairs into one final result.
combineDF_3_4 <- combineFunction(fileOneData = thirdDF, fileTwoData = fourthDF)
combineDF_5_6 <- combineFunction(fileOneData = fifthDF, fileTwoData = SixthDF)
combineFinal <- combineFunction(
  fileOneData = combineDF_3_4,
  fileTwoData = combineDF_5_6
)

# Element 1 is the county data, element 2 the non-county data.
countyDF <- combineFinal[[1L]]
stateDF <- combineFinal[[2L]]
combineFinal
## [[1]]
## # A tibble: 125,800 × 7
## area_name STCOU item_ID US_total_enrollment year measurement state
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Autauga, AL 01001 PST015171D 25508 1971 PST0151 AL
## 2 Autauga, AL 01001 PST015172D 27166 1972 PST0151 AL
## 3 Autauga, AL 01001 PST015173D 28463 1973 PST0151 AL
## 4 Autauga, AL 01001 PST015174D 29266 1974 PST0151 AL
## 5 Autauga, AL 01001 PST015175D 29718 1975 PST0151 AL
## 6 Autauga, AL 01001 PST015176D 29896 1976 PST0151 AL
## 7 Autauga, AL 01001 PST015177D 30462 1977 PST0151 AL
## 8 Autauga, AL 01001 PST015178D 30882 1978 PST0151 AL
## 9 Autauga, AL 01001 PST015179D 32055 1979 PST0151 AL
## 10 Autauga, AL 01001 PST025181D 31985 1981 PST0251 AL
## # … with 125,790 more rows
##
## [[2]]
## # A tibble: 2,120 × 7
## area_name STCOU item_ID US_total_enrollment year measurement division
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED STATES 00000 PST015171D 206827028 1971 PST0151 ERROR
## 2 UNITED STATES 00000 PST015172D 209283904 1972 PST0151 ERROR
## 3 UNITED STATES 00000 PST015173D 211357490 1973 PST0151 ERROR
## 4 UNITED STATES 00000 PST015174D 213341552 1974 PST0151 ERROR
## 5 UNITED STATES 00000 PST015175D 215465246 1975 PST0151 ERROR
## 6 UNITED STATES 00000 PST015176D 217562728 1976 PST0151 ERROR
## 7 UNITED STATES 00000 PST015177D 219759860 1977 PST0151 ERROR
## 8 UNITED STATES 00000 PST015178D 222095080 1978 PST0151 ERROR
## 9 UNITED STATES 00000 PST015179D 224567234 1979 PST0151 ERROR
## 10 UNITED STATES 00000 PST025181D 229466391 1981 PST0251 ERROR
## # … with 2,110 more rows
The chunk of code below plots the state data frame and then plots the county data frame under several conditions: CT (top 6), NC (bottom 10), the default arguments, and MN (top 4). It generates plots of the values against year, and of the division means against year, which will be helpful for further analysis.
# State-level plot, followed by county-level plots for CT (top 6),
# NC (bottom 10), the default arguments, and MN (top 4).
plot(stateDF)
plot(
  countyDF,
  stateVar = "CT",
  topBottomVar = "top",
  numberOfEntries = 6
)
plot(
  countyDF,
  stateVar = "NC",
  topBottomVar = "bottom",
  numberOfEntries = 10
)
plot(countyDF)
plot(
  countyDF,
  stateVar = "MN",
  topBottomVar = "top",
  numberOfEntries = 4
)