Data Frames in R

Data frames is the most widely used data structure in R. It is somewhat similar to matrix(in terms of structure) but the difference between the two is that – matrix can only support one time of data type in one matrix else they will be coerced – the concept of coercion is explained here. Whereas in data frames – multiple data types can exist in one table.

Like we have witness in this series of lectures on basics of R – we will follow the same path and this too shall be a hands on exercise. Please follow along:

# Creating Data Frames using vectors
x = c(1,2,3)
y = c(‘a’,’b’,’c’)
z = c(TRUE, FALSE, TRUE)

df = data.frame(x,y,z)
df

# Creating Data Frames from matrix
x = matrix(c(1:16), ncol = 4)
x = as.data.frame(x)
x

# Creating data frames directly using data.frames function

df = data.frame(name = ‘nehal’, marks = ’10’)
df

# Blank Data Frame
BlankDF = data.frame(col1 = numeric(10), col2 = character(10), col3 = numeric(10))
# Editing the blank data frame

TempDF <- edit(BlankDF)
BlankDF <- TempDF
fix(BlankDF)

# Accessing columns of Data Frames
a = BlankDF[1]
class(a)

a = BlankDF[[1]]
class(a)

# Observe the difference between the usage of ‘[]’ and ‘[[]]’
# ‘[]’ – Preserves the output as data frame as seen from above
# ‘[[]]’ – Outputs in the simplest form, here in the form of characters

# Accessing data frame using ‘$’ – does not preserve it as data frame
a = BlankDF$col2
class(a)

# ———————————Adding rows and columns to DF————

# Adding columns using data.frame function
data.frame(BlankDF, SSN = c(1:2))

# Adding columns using cbind function
cbind(BlankDF, SSN = c(2:11))

# Adding Rows using rbind function
rbind(BlankDF, list(‘Ramesh’, 9887, 123))

# Adding Rows using data.frame function – appends X to each col name and replicates
data.frame(BlankDF, list(‘Ramesh’, 9887, 123))

# ———————–Accessing columns in df———————-

abc = c(1:5)
bde = letters[1:5]
bec = c(2:6)

df = data.frame(abc, bde, bec)
df

# Concept of partial matching
df$a

# In case of conflict
df$b

# Accessing columns using subset function
subset(df, select = a)

# Partial matching in subset does not work
subset(df, select = abc)

# Suppose we want to output every column except abc
subset(df, select = -abc)
# ——————subsetting data frames ———————-

df = mtcars

# Lets see the structure of the dataset

str(df)

# Lets see important statistics of the data frame
summary(df)

# vs, am,gear,carb has a lot of similar values, lets inspect
view(df)

# Lets find out the unique values for these columns
apply(df[8:11],2,unique)

# Subset all the cars with gear 4
subset(df, gear == 4)

# Suppose we want mpg, cyl and disp info only for gears = 4;
subset(df, select = c(mpg, cyl, disp), subset=(gear == 4))

# Suppose we want mpg, cyl and disp info only for carb > 3 and am = 1
subset(df, select = c(mpg, cyl, disp), subset = (carb > 3 & am == 1))

# ————————-Transforming Data Frames—————————————

EmpName = letters[1:10]
AmountSold = c(54,55,32,34,89,78,43,100,23,34)
Cost = c(10,11,8,9,14,12,8,16,9,10)
Revenue = c(2480, 2890, 4200, 4619, 1200, 4590, 1900, 1888, 4783, 1300)

SalesInfo = data.frame(EmpName, AmountSold, Cost, Revenue)

# To obtain profit we can subtract cost from Revenue, add a new column profit

SalesInfo$Profit = SalesInfo$Revenue – SalesInfo$Cost

# Even better way to do this using transform function

SalesInfo = transform(SalesInfo, Profit = Revenue – Cost)

# To obtain margin

transform(SalesInfo, Margin = round((Profit/Revenue) * 100))

# —————Merging Data Frames —————————

buildingNumber = c(2,9,3,1,5)
CompanyName = c(‘Apple’, ‘Microsoft’, ‘Huron Consulting’, ‘McKinsey’, ‘BCG’)
info = data.frame(buildingNumber, CompanyName)

info1 = data.frame(CompanyName = c(‘Citi Bank’, ‘HSBC’, ‘Goldmann Sachs’, ‘MakeMyTrip’, ‘Adobe’), buildingNumber = c(1,2,3,4,5)
, EmpCount = c(100,101,88,23,67))
# Merge above two data frames using the common column – buildingNumber
merge(info, info1, by = ‘buildingNumber’)

# Merge two columns in R if the matching column names are different

building = c(2,9,3,1,5)
CompanyName = c(‘Apple’, ‘Microsoft’, ‘Huron Consulting’, ‘McKinsey’, ‘BCG’)
info = data.frame(building, CompanyName)

info1 = data.frame(CompanyName = c(‘Citi Bank’, ‘HSBC’, ‘Goldmann Sachs’, ‘MakeMyTrip’, ‘Adobe’), buildingNumber = c(1,2,3,4,5)
, EmpCount = c(100,101,88,23,67))

merge(info, info1, by.x = ‘building’, by.y = ‘buildingNumber’)

# What if we try to match columns with different data types

building = c(2,9,3,1,5)
CompanyName = c(‘Apple’, ‘Microsoft’, ‘Huron Consulting’, ‘McKinsey’, ‘BCG’)
info = data.frame(building, CompanyName)

info1 = data.frame(CompanyName = c(‘Citi Bank’, ‘HSBC’, ‘Goldmann Sachs’, ‘MakeMyTrip’, ‘Adobe’), buildingNumber = c(‘1′,’2′,’3′,’4′,’5’)
, EmpCount = c(100,101,88,23,67))

merge(info, info1, by.x = ‘building’, by.y = ‘buildingNumber’)

# Inner Join without specifying the common column

buildings <- data.frame(location=c(1, 2, 3), name=c(“building1”, “building2”, “building3”))

data <- data.frame(survey=c(1,1,1,2,2,2), location=c(1,2,3,2,3,1),
efficiency=c(51,64,70,71,80,58))

merge(buildings, data)

# Left, Right, OuterJoin

df = data.frame(ItemID = c(1:6), ItemName = c(rep(‘Lily’, 3), rep(‘Rose’, 3)))
df1 = data.frame(ItemID = c(1,3,5, 7), City = c(‘Mum’,’Delhi’,’Noida’, ‘Kolkata’))

merge(df, df1, by = ‘ItemID’, all.x = TRUE)

# Right Outer Join
merge(df, df1, by = ‘ItemID’, all.y = TRUE)

# Full Outer Join
merge(df, df1, by = ‘ItemID’, all = TRUE)

# Cross Join
merge(df, df1, by = NULL)

Leave a Reply

Your email address will not be published. Required fields are marked *