library(ldsR)
library(purrr)
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(fs)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
Files shipped with the ldsR package
Three sets of files are made available through the ldsR package (see the sketch after this list for locating them in an installed copy):

1. eur_w_ld_chr - used for univariate LD score regression (estimating heritability and genetic correlations)
2. w_hm3.snplist.parquet - used for the munge function. This file exists to recreate the output of munge.py.
3. 1000G_Phase3_weights_hm3_no_MHC.parquet - a second set of regression weights, used for partitioned heritability
How were the reference files used by ldsR constructed?
There are a lot of different versions of the original LD score files floating around the internet. For example, the tutorial on the ldsc GitHub references the eur_w_ld_chr folder, which as far as I can tell is no longer available online.
The readme from the previous location https://alkesgroup.broadinstitute.org/LDSCORE/README_new_data_location.txt references the S-LDSC directory from Steven Gazal at Zenodo.
However, the files corresponding to univariate LD score regression, 1000G_Phase3_ldscores.tgz, are not identical to the previous standard, eur_w_ld_chr. I have therefore made the version of eur_w_ld_chr that our group had on hand available at Zenodo. This file is shipped with the ldsR package.
setwd(tempdir())
# Define the URL and destination file
url <- "https://zenodo.org/records/14993076/files/eur_w_ld_chr.tgz"
destfile <- "eur_w_ld_chr.tgz"
# Download the file
download.file(url, destfile, mode = "wb")
# Extract the contents
untar(destfile, exdir = "eur_w_ld_chr")
# Remove the tar file after extraction
file.remove(destfile)
# contains LD split by chr.
list.files("eur_w_ld_chr/eur_w_ld_chr")
# order matters here for the jackknife, so read the chromosomes in their intended order
ld_files <- paste0("eur_w_ld_chr/eur_w_ld_chr/", c(1:22), ".l2.ldscore.gz")
ld <- map(ld_files, \(x) read_delim_arrow(x, delim = "\t")) |>
list_rbind()
# Verify that they are ordered by physical location.
all(dplyr::arrange(ld, CHR, BP)$SNP == ld$SNP)
# get M
m_files <- paste0("eur_w_ld_chr/eur_w_ld_chr/", c(1:22), ".l2.M_5_50")
m <- map_dbl(m_files, \(x) readLines(x) |> as.numeric()) |> sum()
# m : 1173569
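# Sanity check (sketch): the summed M_5_50 count should match the value noted above.
stopifnot(m == 1173569)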
dplyr::select(ld, SNP, L2) |>
arrow::write_parquet("inst/extdata/eur_w_ld.parquet")
# The ldsc munge function actually restricts the regression SNPs even further, to the set of SNPs in w_hm3.snplist
df <- arrow::read_tsv_arrow("eur_w_ld_chr/eur_w_ld_chr/w_hm3.snplist") |> dplyr::as_tibble()
arrow::write_parquet(df, "inst/extdata/w_hm3.snplist.parquet")
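# Illustration only (not part of the file construction above): mirroring how
# munge restricts regression SNPs, keep only rows of a toy, hypothetical
# summary-statistics tibble whose SNP ids appear in the HapMap3 list.
hm3 <- arrow::read_parquet("inst/extdata/w_hm3.snplist.parquet")
sumstats <- dplyr::tibble(SNP = c("rs3094315", "rs_not_in_hm3"), Z = c(1.2, -0.4))
dplyr::semi_join(sumstats, hm3, by = "SNP")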
Partitioned heritability (multivariate LD score regression)
For partitioned heritability, cell-type analysis, and other applications with multiple annotations, another set of regression weights is the standard choice. Here we turn to the files from Steven Gazal's Zenodo record.
setwd(tempdir())
# Define the URL and destination file
url <- "https://zenodo.org/records/7768714/files/1000G_Phase3_weights_hm3_no_MHC.tgz"
destfile <- "1000G_Phase3_weights_hm3_no_MHC.tgz"
# Download the file
download.file(url, destfile, mode = "wb")
# Extract the contents
untar(destfile, exdir = "1000G_Phase3_weights_hm3_no_MHC")
# Remove the tar file after extraction
file.remove(destfile)
# contains LD split by chr.
list.files("1000G_Phase3_weights_hm3_no_MHC/1000G_Phase3_weights_hm3_no_MHC")
# again, order matters
ld_files <- paste0("1000G_Phase3_weights_hm3_no_MHC/1000G_Phase3_weights_hm3_no_MHC/", "weights.hm3_noMHC.", c(1:22),".l2.ldscore.gz")
ld <- map(ld_files, \(x) read_delim_arrow(x, delim = "\t")) |>
list_rbind()
all(dplyr::arrange(ld, CHR, BP)$SNP == ld$SNP)
arrow::write_parquet(dplyr::select(ld, SNP, L2), "inst/extdata/1000G_Phase3_weights_hm3_no_MHC.parquet")
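# Quick check (sketch): the weights parquet round-trips with the expected columns.
chk <- arrow::read_parquet("inst/extdata/1000G_Phase3_weights_hm3_no_MHC.parquet")
stopifnot(identical(names(chk), c("SNP", "L2")))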
Partitioned heritability, baseline annotations
setwd("~/")
# Define the URL and destination file
options(timeout = max(3000, getOption("timeout")))
downloaded <- TRUE
if(!downloaded) {
url <- "https://zenodo.org/records/7768714/files/1000G_Phase3_baseline_v1.2_ldscores.tgz"
destfile <- "1000G_Phase3_baseline_v1.2_ldscores.tgz"
# Download the file
download.file(url, destfile, mode = "wb")
# Extract the contents
untar(destfile, exdir = "baseline_v1.2")
# Remove the tar file after extraction
file.remove(destfile)
}
dir <- "~/baseline_v1.2/"
naming <- fs::dir_ls(dir, glob = "*ldscore.gz")[1] |>
fs::path_file() |>
stringr::str_remove(".1.l2.ldscore.gz")
paste0(naming, ".1.l2.ldscore.gz")
stopifnot(fs::file_exists(fs::path(dir, paste0(naming, ".1.l2.ldscore.gz"))))
ld_paths <- fs::path(dir, paste0(naming,".", c(1:22), ".l2.ldscore.gz"))
m50_paths <- fs::path(dir, paste0(naming,".", c(1:22), ".l2.M_5_50"))
m_paths <- fs::path(dir, paste0(naming,".", c(1:22), ".l2.M"))
annot_paths <- fs::path(dir, paste0(naming,".", c(1:22), ".annot.gz"))
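# Extend the existence check above from chromosome 1 to the full set of
# per-chromosome files (sketch).
stopifnot(
  all(fs::file_exists(ld_paths)),
  all(fs::file_exists(m50_paths)),
  all(fs::file_exists(m_paths)),
  all(fs::file_exists(annot_paths))
)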
ld <- ld_paths |>
purrr::map(arrow::read_tsv_arrow) |>
purrr::list_rbind()
ld <- select(ld, -CHR,-BP)
m50 <- m50_paths |>
purrr::map(\(x) read_tsv_arrow(x, col_names = FALSE)) |>
map(\(x) tidyr::pivot_longer(x, everything()) |> pull(value)) |>
reduce(`+`)
m <- m_paths |>
purrr::map(\(x) read_tsv_arrow(x, col_names = FALSE)) |>
map(\(x) tidyr::pivot_longer(x, everything()) |> pull(value)) |>
reduce(`+`)
annot <- dplyr::tibble(annot = colnames(ld)[-1], m50 = m50, m = m)
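# Consistency check (sketch): one M and one M_5_50 value per annotation column;
# ld still holds SNP plus one L2 column per annotation at this point.
stopifnot(length(m50) == ncol(ld) - 1, length(m) == ncol(ld) - 1)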
annot_ref <-
annot_paths |>
purrr::map(\(x) arrow::read_tsv_arrow(x)) |>
purrr::list_rbind()
annot_ref <- select(annot_ref, -CHR, -CM, -BP)
outdir <- "~/baseline_model-v1.2" |> dir_create()
arrow::write_parquet(annot, fs::path(outdir, "annot.parquet"))
arrow::write_parquet(ld, fs::path(outdir, "ld.parquet"))
arrow::write_parquet(annot_ref, fs::path(outdir, "annot_ref.parquet"))