Reading Multiple csvs as 1 data frame

October 27, 2018

In an earlier post I wrote about having to break a single csv into multiple csvs. In other scenarios, one data set may be provided as multiple csvs.

Thankfully, purrr has a beautiful function called map_df() which turns this into a two-liner. The process has essentially three steps:

Create a vector of all .csv files that should be merged together.
Read each file using readr::read_csv()
Combine all of the resulting data frames into one.

map_df() maps (applies) a function to each element of an object and row-binds all of the outputs into a single data frame.

For this example I will use the csvs I created in a previous tutorial utilizing a dataset from the Quantitative Social Science book.

# Get the full paths of all csv files in the chunk directory.
# `pattern` is a regular expression: the trailing `$` anchors the match to the
# end of the file name, so files that merely contain ".csv" (for example
# "data.csv.bak") are not picked up.
# Note: names are returned in alphabetical order, so "_10" sorts before "_2".
file_names <- list.files(
  "../../static/data/chunk_data",
  pattern = "\\.csv$",
  full.names = TRUE
)
file_names

##  [1] "../../static/data/chunk_data/social_chunked_1.csv" 
##  [2] "../../static/data/chunk_data/social_chunked_10.csv"
##  [3] "../../static/data/chunk_data/social_chunked_11.csv"
##  [4] "../../static/data/chunk_data/social_chunked_12.csv"
##  [5] "../../static/data/chunk_data/social_chunked_13.csv"
##  [6] "../../static/data/chunk_data/social_chunked_2.csv" 
##  [7] "../../static/data/chunk_data/social_chunked_3.csv" 
##  [8] "../../static/data/chunk_data/social_chunked_4.csv" 
##  [9] "../../static/data/chunk_data/social_chunked_5.csv" 
## [10] "../../static/data/chunk_data/social_chunked_6.csv" 
## [11] "../../static/data/chunk_data/social_chunked_7.csv" 
## [12] "../../static/data/chunk_data/social_chunked_8.csv" 
## [13] "../../static/data/chunk_data/social_chunked_9.csv"

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read each csv with read_csv() and row-bind the results into one tibble
all_csvs <- file_names %>%
  map_df(read_csv)

## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )

## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )
## Parsed with column specification:
## cols(
##   sex = col_character(),
##   yearofbirth = col_double(),
##   primary2004 = col_double(),
##   messages = col_character(),
##   primary2006 = col_double(),
##   hhsize = col_double()
## )

# Preview the first six rows of the combined data
head(all_csvs, n = 6)

## # A tibble: 6 x 6
##   sex    yearofbirth primary2004 messages   primary2006 hhsize
##   <chr>        <dbl>       <dbl> <chr>            <dbl>  <dbl>
## 1 male          1941           0 Civic Duty           0      2
## 2 female        1947           0 Civic Duty           0      2
## 3 male          1951           0 Hawthorne            1      3
## 4 female        1950           0 Hawthorne            1      3
## 5 female        1982           0 Hawthorne            1      3
## 6 male          1981           0 Control              0      3