Randomly Create Datasets

Warning: Very Geeky

If you’re here because you want some nice datasets to use, then head here.

I wanted to see if it was possible to write some code in R that would randomly create a dataset suitable for running a t.test, cor.test or prop.test on.

I’ve not got much coding experience but I thought it was worth a punt. So below I share what I’ve come up with…

First, copy and paste the following code into RStudio:

######## Functions 2.0


### First install and load this package

# Install dplyr package (only needs doing once)
install.packages("dplyr")   

# Load dplyr package (it provides between(), used below)    
library("dplyr") 

### Now run everything between the #########------------ markers


#########------------

### Cor test


cor_test_data_creator <- function(dataset_name = "mydata.csv",
                                  first_column_name="First", 
                                  second_column_name="Second", 
                                  mean_x = 50, 
                                  sd_x = 3, 
                                  max_y=100, 
                                  min_y=0, 
                                  m = 1, 
                                  c = 5, 
                                  R = "Strong") {
  #Sample size
  n <- sample(30:110, 1)
  
  
  #x data
  x <- round(rnorm(n, mean=mean_x, sd=sd_x),1)
  
  # strong or weak?
  if (R == "Strong"){
    fuzz_max <- 1.06
    fuzz_min <- 0.94
  } else if (R == "Weak"){
    fuzz_max <- 1.18
    fuzz_min <- 0.82
  } else {
    # No correlation requested: generate y independently of x
    # abs() guards against a negative sd when m is negative
    y <- round(rnorm(n, mean=mean(x)*m+c, sd=abs(sd_x*m+c)),1)
    
    ## dataframe
    df <- data.frame(first_column_name = x, second_column_name = y)
    names(df)[1] <- first_column_name
    names(df)[2] <- second_column_name
    
    #create csv file
    file_location <- paste("C:\\Users\\YOU\\Desktop\\", dataset_name, sep="")
    write.csv(df, file_location, row.names=FALSE)
    print(cor.test(x,y))
    return(invisible(df))
  }
  
  
  # y data: scale x by the gradient m and intercept c, with random "fuzz"
  length_of_y <- length(x)
  
  y <- vector(mode = "numeric", length_of_y)
  
  for (i in 1:length_of_y){
    y[i] <- round((x[i]*runif(1, fuzz_min, fuzz_max))*m+c*runif(1, fuzz_min, fuzz_max),1)
    # validate: resample any value that falls outside [min_y, max_y]
    if (!between(y[i], min_y, max_y)){  # between() is from dplyr
      y[i] <- round(rnorm(1, mean=mean(x)*m+c, sd=abs(sd_x*m+c)),1)
      print("Out of bounds")
    }
  }
  print(y)
  
  ## dataframe
  
  df <- data.frame(first_column_name = x, second_column_name = y)
  names(df)[1] <- first_column_name
  names(df)[2] <- second_column_name
  
  #create csv file
  file_location <- paste("C:\\Users\\YOU\\Desktop\\", dataset_name, sep="")
  
  
  write.csv(df, file_location, row.names=FALSE)
  print(df)
  print(cor.test(x,y))
  
}

### Z-test

z_test_data_creator <- function(dataset_name = "mydata.csv",
                                first_column_name="First", 
                                second_column_name="Second", 
                                category1 = "Male", 
                                category2 = "Female", 
                                Expected = 0.5, 
                                Different = TRUE) {
  #Sample size
  n <- sample(30:60, 1)
  print(n)
  
  # If the proportions should differ, randomly push the second one up or down
  up <- sample(c(TRUE, FALSE), 1)
  if (up){
    fuzzy <- 1.8
  } else {
    fuzzy <- 0.2
  }
  
  # Number expected in category 1
  expected_number <- round(Expected*n,0)
  print("expected:")
  print(expected_number)
  
  # Create a pool of category labels in roughly the expected proportions
  yes_pool <- rep(category1, 10*expected_number)
  no_pool <- rep(category2, 10*(n-expected_number))
  
  pool <- c(yes_pool, no_pool)
  
  
  #First column
  x <- sample(pool, n, replace=TRUE)
  
  # second column
  
  if (Different==FALSE){
    y <- sample(pool, n, replace=TRUE)
  } else {
    # Shift the expected count up or down, capped at n so the pool sizes stay valid
    new_expected_number <- min(round(fuzzy*Expected*n,0), n)
    print(new_expected_number)
    print(n)
    yes_pool <- rep(category1, 10*new_expected_number)
    no_pool <- rep(category2, 10*(n-new_expected_number))
    
    pool <- c(yes_pool, no_pool)
    
    y <- sample(pool, n, replace=TRUE)
  }
  
  # Making the data frame
  df <- data.frame(first_column_name = x, second_column_name = y)
  names(df)[1] <- first_column_name
  names(df)[2] <- second_column_name
  
  #check
  print(df)
  
  #create csv file
  file_location <- paste("C:\\Users\\YOU\\Desktop\\", dataset_name, sep="")
  print(file_location)
  
  write.csv(df, file_location, row.names=FALSE)
  print(table(x))
  print(table(y))
  print(n)
  # Compare the proportion of category1 in each column
  print(prop.test(
    x = c(sum(x == category1), sum(y == category1)), 
    n = c(n, n)))
}

### t-test



t_test_data_creator <- function(dataset_name = "mydata.csv", 
                                first_column_name="First", 
                                mean_x=50, 
                                sd_x=2, 
                                second_column_name="Second", 
                                mean_y=50, 
                                sd_y=2) {
  ### Create the two columns
  
  #Sample size
  n <- sample(30:110, 1)
  # The two sets
  x <- round(rnorm(n, mean=mean_x, sd=sd_x),1)
  y <- round(rnorm(n, mean=mean_y, sd=sd_y),1)
  
  #create the dataframe
  df <- data.frame(first_column_name = x, second_column_name = y)
  names(df)[1] <- first_column_name
  names(df)[2] <- second_column_name
  
  #check
  print(df)
  
  #create csv file
  file_location <- paste("C:\\Users\\YOU\\Desktop\\", dataset_name, sep="")
  print(file_location)
  
  write.csv(df, file_location, row.names=FALSE)
  print(t.test(x,y))
}




#########------------

In the code above you'll need to replace C:\\Users\\YOU\\Desktop\\ with the file path for where you want the datasets to be created. Note it should be in quotes, and on Windows the backslashes have to be doubled (\\).
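If you'd rather not hard-code a Windows path at all, one alternative (a minimal sketch using only base R, assuming you're happy for the files to land in your current working directory) is to build the location with file.path():

# Sketch: write to the working directory instead of a hard-coded Desktop path
# (portable across Windows/Mac/Linux; no doubled backslashes needed)
file_location <- file.path(getwd(), dataset_name)
write.csv(df, file_location, row.names=FALSE)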

Here are some examples of how to use the functions:


# Cor tests

args(cor_test_data_creator)

cor_test_data_creator("car_value.csv", "age_months", "value", 60, 15, 35000, 1000, -218.75, 25000, "Weak")

cor_test_data_creator("charity_walk.csv", "Distance_(miles)", "Money_Raised", 12, 3, 200, 10, 7.91, 5, "Weak")

cor_test_data_creator("temperature_test.csv", "Temperature", "Test_Score", 20, 7.5, 100, 0, 5, -25, "None")

### Z tests

args(z_test_data_creator)

z_test_data_creator("twenty_twenty_vision.csv", "Female", "Male", "20-20", "Not 20-20", 0.35, FALSE)

z_test_data_creator("smokers.csv", "Edinburgh", "Kirkcaldy", "Smoker", "Non-smoker", 0.22, TRUE)

z_test_data_creator("football_fans.csv", "Fan", "Not Fan", "Male", "Female", 0.6, FALSE)

### t-test

args(t_test_data_creator)

t_test_data_creator("handspan.csv", "Piano_Players", 24.6, 1.16, "Not_Piano_Player", 22.1, 1.2)

t_test_data_creator("red_snapper.csv", "Male", 60, 19.5, "Female", 60, 18.5)

For correlation you include the name of the dataset (with .csv!), the name of column 1, the name of column 2, the mean for column 1, the standard deviation for column 1, the maximum value for column 2, the minimum value for column 2, the gradient of the line of best fit, the intercept of the line of best fit, and finally the strength of the correlation: "Strong", "Weak", or anything else (e.g. "None") for no correlation.
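If the argument order is hard to keep straight, you can also write the call with named arguments; this is just the charity_walk example from above restated:

cor_test_data_creator(dataset_name = "charity_walk.csv",
                      first_column_name = "Distance_(miles)",
                      second_column_name = "Money_Raised",
                      mean_x = 12, sd_x = 3,
                      max_y = 200, min_y = 10,
                      m = 7.91, c = 5,
                      R = "Weak")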

For example, the average car age was 60 months with an SD of 15, and I wanted the value to sit between £35,000 and £1,000. The gradient and intercept come from a back-of-the-envelope calculation to make those facts work, using imagined data-points.
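As a sketch of the sort of envelope sums involved (these two data points are imagined, purely for illustration, but they do reproduce the gradient and intercept used in the car_value call above):

# Two imagined data points: a new car worth £25,000,
# and a 96-month-old car worth £4,000 (assumptions for illustration only)
x1 <- 0;  y1 <- 25000
x2 <- 96; y2 <- 4000

m <- (y2 - y1) / (x2 - x1)  # gradient: -218.75
c <- y1 - m * x1            # intercept: 25000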

For the z-test you include the dataset name, column 1 name, column 2 name, category 1, category 2, the proportion you expect category 1 to appear with, and TRUE/FALSE depending on whether you want a difference between the two columns' proportions or not.

For example, with the 20-20 vision data we have columns called "Female" and "Male", we expect 35% of the population to have 20-20 vision, and we don't want there to be a difference.
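Written out with named arguments, that call looks like this:

z_test_data_creator(dataset_name = "twenty_twenty_vision.csv",
                    first_column_name = "Female",
                    second_column_name = "Male",
                    category1 = "20-20",
                    category2 = "Not 20-20",
                    Expected = 0.35,
                    Different = FALSE)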

For the t-test you give the dataset name, column 1 name, its mean and standard deviation, then column 2 name, its mean and standard deviation. With the piano example I gave the musicians a boost in order to push the t-test's p-value below 5%.
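And the named-argument version of the piano call:

t_test_data_creator(dataset_name = "handspan.csv",
                    first_column_name = "Piano_Players",
                    mean_x = 24.6, sd_x = 1.16,
                    second_column_name = "Not_Piano_Player",
                    mean_y = 22.1, sd_y = 1.2)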

I’m sure the code has many faults. But it was quite a fun coding challenge.