Warning: Very Geeky
If you’re here because you want some nice datasets to use, then head here.
I wanted to see if it was possible to write some code in R that would randomly create a dataset suitable to perform a t.test, cor.test or prop.test on.
I’ve not got much coding experience but I thought it was worth a punt. So below I share what I’ve come up with…
First copy and paste the following code into R Studio:
######## Functions 2.0
### First run these libraries
# Install dplyr package
install.packages("dplyr")
# Load dplyr package
library("dplyr")
### Now run everything between the #########------------
#########------------
### Cor test
# Create a random dataset suitable for cor.test() and write it to csv.
#
# dataset_name        csv file name (include the .csv extension)
# first_column_name   name for the x column
# second_coloumn_name name for the y column (spelling kept for compatibility)
# mean_x, sd_x        normal distribution parameters for the x column
# max_y, min_y        allowed range for the y column
# m, c                gradient and intercept of the underlying line y = m*x + c
# R                   "Strong" or "Weak" correlation; anything else gives none
# output_dir          folder the csv is written to (must end with a slash)
#
# Prints the dataset and the cor.test() result; returns the htest object
# (invisibly, via print).
cor_test_data_creator <- function(dataset_name = "mydata.csv",
                                  first_column_name = "First",
                                  second_coloumn_name = "Second",
                                  mean_x = 50,
                                  sd_x = 3,
                                  max_y = 100,
                                  min_y = 0,
                                  m = 1,
                                  c = 5,
                                  R = "Strong",
                                  output_dir = "C:\\Users\\YOU\\Desktop\\") {
  # Random sample size
  n <- sample(30:110, 1)
  # x data
  x <- round(rnorm(n, mean = mean_x, sd = sd_x), 1)
  # sd for the "no correlation" / out-of-bounds draws; abs() guards against
  # a negative sd when the gradient m is negative (rnorm would error)
  fallback_sd <- abs(sd_x * m + c)
  if (R == "Strong" || R == "Weak") {
    # Multiplicative noise band around the line y = m*x + c
    if (R == "Strong") {
      fuzz_min <- 0.94
      fuzz_max <- 1.06
    } else {
      fuzz_min <- 0.82
      fuzz_max <- 1.18
    }
    # y data: fuzz each x individually, then validate against [min_y, max_y]
    y <- vector(mode = "numeric", length(x))
    for (i in seq_along(x)) {
      y[i] <- round((x[i] * runif(1, fuzz_min, fuzz_max)) * m +
                      c * runif(1, fuzz_min, fuzz_max), 1)
      # Out-of-range points get a fresh draw around the line's centre
      # (note: the replacement itself is not guaranteed to be in range)
      if (y[i] < min_y || y[i] > max_y) {
        y[i] <- round(rnorm(1, mean = mean(x) * m + c, sd = fallback_sd), 1)
        print("Out of bounds")
      }
    }
    print(y)
  } else {
    # Any other value of R ("None" in the examples): y independent of x
    y <- round(rnorm(n, mean = mean(x) * m + c, sd = fallback_sd), 1)
  }
  ## dataframe
  df <- data.frame(x, y)
  names(df) <- c(first_column_name, second_coloumn_name)
  # create csv file
  file_location <- paste0(output_dir, dataset_name)
  write.csv(df, file_location, row.names = FALSE)
  print(df)
  print(cor.test(x, y))
}
### Z-test
# Create a random two-column categorical dataset suitable for prop.test()
# and write it to csv.
#
# dataset_name         csv file name (include the .csv extension)
# first_column_name    name for column 1
# second_coloumn_name  name for column 2 (spelling kept for compatibility)
# category1, category2 the two labels each cell can take
# Expected             proportion of category1 expected in column 1
# Different            TRUE if column 2's proportion should be shifted
# output_dir           folder the csv is written to (must end with a slash)
#
# Returns the htest object from prop.test().
z_test_data_creator <- function(dataset_name = "mydata.csv",
                                first_column_name = "First",
                                second_coloumn_name = "Second",
                                category1 = "Male",
                                category2 = "Female",
                                Expected = 0.5,
                                Different = TRUE,
                                output_dir = "C:\\Users\\YOU\\Desktop\\") {
  # Sample size
  n <- sample(30:60, 1)
  print(n)
  # If different, push column 2's proportion up or down
  up <- sample(c(TRUE, FALSE), 1)
  fuzzy <- if (up) 1.8 else 0.2
  # Build one column of n cells with roughly k of n being category1, by
  # sampling from an oversized pool (keeps the original's fuzziness)
  draw_column <- function(k) {
    # Clamp k to [0, n]: the original crashed when fuzzy * Expected
    # exceeded 1 (e.g. Expected = 0.6 with fuzzy = 1.8) because the
    # category2 pool size went negative
    k <- min(max(k, 0), n)
    pool <- c(rep(category1, 10 * k), rep(category2, 10 * (n - k)))
    sample(pool, n, replace = TRUE)
  }
  # Number expected
  expected_number <- round(Expected * n, 0)
  print("expected:")
  print(expected_number)
  # First column
  x <- draw_column(expected_number)
  # Second column
  if (Different) {
    new_expected_number <- round(fuzzy * Expected * n, 0)
    print(new_expected_number)
    print(n)
    y <- draw_column(new_expected_number)
  } else {
    y <- draw_column(expected_number)
  }
  ### Making the data frame
  df <- data.frame(x, y)
  names(df) <- c(first_column_name, second_coloumn_name)
  # check
  print(df)
  # create csv file
  file_location <- paste0(output_dir, dataset_name)
  print(file_location)
  write.csv(df, file_location, row.names = FALSE)
  print(table(x))
  print(table(y))
  print(n)
  # Compare the proportion of category1 in each column. Counting explicitly
  # fixes the original's table(x)[1], which picks the alphabetically first
  # level and can refer to DIFFERENT categories in x and y when one level
  # is absent from a column.
  prop.test(
    x = c(sum(x == category1), sum(y == category1)),
    n = c(n, n))
}
### t-test
# Create two independent normally-distributed columns suitable for t.test()
# and write them to csv.
#
# dataset_name        csv file name (include the .csv extension)
# first_column_name   name for column 1
# mean_x, sd_x        distribution of column 1
# second_coloumn_name name for column 2 (spelling kept for compatibility)
# mean_y, sd_y        distribution of column 2
# output_dir          folder the csv is written to (must end with a slash);
#                     default now matches the other two creator functions
#                     instead of the author's personal path
#
# Prints the dataset and the t.test() result; returns the htest object
# (invisibly, via print).
t_test_data_creator <- function(dataset_name = "mydata.csv",
                                first_column_name = "First",
                                mean_x = 50, sd_x = 2,
                                second_coloumn_name = "Second",
                                mean_y = 50,
                                sd_y = 2,
                                output_dir = "C:\\Users\\YOU\\Desktop\\") {
  ### Create the two columns
  # Sample size (shared, so the two columns line up in the csv)
  n <- sample(30:110, 1)
  # The two sets
  x <- round(rnorm(n, mean = mean_x, sd = sd_x), 1)
  y <- round(rnorm(n, mean = mean_y, sd = sd_y), 1)
  # create the dataframe
  df <- data.frame(x, y)
  names(df) <- c(first_column_name, second_coloumn_name)
  # check
  print(df)
  # create csv file
  file_location <- paste0(output_dir, dataset_name)
  print(file_location)
  write.csv(df, file_location, row.names = FALSE)
  print(t.test(x, y))
}
#########------------
In the code above you’ll need to replace the file path "C:\\Users\\YOU\\Desktop\\" with the file path for where you want the datasets to be created. Note it should be in quotes, and also note the double slashes \\.
Here are some examples of how to use the functions:
# Cor tests
# args() is just a reminder of the argument order before calling
args(cor_test_data_creator)
# dataset, col1, col2, mean_x, sd_x, max_y, min_y, gradient, intercept, strength
cor_test_data_creator("car_value.csv", "age_months", "value", 60, 15, 35000, 1000, -218.75, 25000, "Weak")
cor_test_data_creator("charity_walk.csv", "Distance_(miles)", "Money_Raised", 12, 3, 200, 10, 7.91, 5, "Weak")
cor_test_data_creator("temperature_test.csv", "Temperature", "Test_Score", 20, 7.5, 100, 0, 5, -25, "None")
### Z tests
args(z_test_data_creator)
# dataset, col1, col2, category1, category2, expected proportion, different?
z_test_data_creator("twenty_twenty_vision.csv", "Female", "Male", "20-20", "Not 20-20", 0.35, FALSE)
z_test_data_creator("smokers.csv", "Edinburgh", "Kirkcaldy", "Smoker", "Non-smoker", 0.22, TRUE)
z_test_data_creator("football_fans.csv", "Fan", "Not Fan", "Male", "Female", 0.6, FALSE)
### t-test
args(t_test_data_creator)
# dataset, col1, mean1, sd1, col2, mean2, sd2
t_test_data_creator("handspan.csv", "Piano_Players", 24.6, 1.16, "Not_Piano_Player", 22.1, 1.2)
t_test_data_creator("red_snapper.csv", "Male", 60, 19.5, "Female", 60, 18.5)
For correlation you include the name of the dataset (with .csv!), the name of column 1, the name of column 2, the mean for column 1, the standard deviation for column 1, the maximum value for column 2, the minimum value, the gradient of the line of best fit, the intercept for the line of best fit, and finally whether the correlation should be “Strong”, “Weak” or “None”.
For example the average car age was 60 months with a SD of 15. I wanted the value between £35,000 and £1000. The gradient and intercept come from a back of the envelope calculation to make those facts work, using imagined data-points.
For the z-test you include the dataset name, column 1 name, column 2 name, category 1, category 2, the proportion you’re expecting to see category 1 appear, and TRUE/FALSE depending on whether you want a difference between the proportions or not.
For example with the 20-20 vision we have columns called “Female” and “Male”, we expected 35% of the population to have 20-20 vision, and we don’t want there to be a difference.
For the t test you take the dataset name, column 1 name, mean, standard deviation, column 2 name, mean, standard deviation. With the piano example I gave the musicians a boost in order to make the t-test have a p-value less than 5%.
I’m sure the code has many faults. But it was quite a fun coding challenge.