Simple steps to import dataset and start querying it.
Prerequisites
- Scala 2.12
- Coursier provides a simple way to install Scala and manage Scala environments and artifacts. Once `cs` is installed, run `cs install scala:2.12.15`. Coursier can also install an appropriate JVM if none is found.
- Apache Spark. Either 3.1 or 3.2 would work.
- Docker & Docker Compose
- VEP (optional)
- gRPCurl (optional)
Annotations
This is optional. Dnaerys can parse standard VEP annotations, with CADD, ClinVar & gnomAD annotations if desired. For SIFT & PolyPhen both prediction and binary values are used.
A typical annotation command line (the last 3 lines are optional and use custom CADD, ClinVar & gnomAD databases, not the ones shipped with VEP):
/path/vep \
--format vcf \
-i /path/vcf.gz \
--vcf \
-o /path/annotated.vcf \
--verbose \
--cache \
--dir /path/vep/cache \
--offline \
--fork XX \
--force_overwrite \
--assembly GRCh38 \
--terms SO \
--variant_class \
--no_stats \
--gencode_basic \
--biotype \
--sift b \
--polyphen b \
--plugin CADD,/path/cadd/GRCh38/whole_genome_SNVs.tsv.gz \
--custom /path/clinvar/GRCh38/clinvar.vcf.gz,clinvar,vcf,exact,0,CLNSIG \
--custom /path/gnomad/r3/gnomad.genomes.r3.0.snv.tsv.gz,gnomAD,vcf,exact,0,AF
`clinvar` is the custom prefix added to the `CLNSIG` field, which is used for annotations (in ClinVar's VCF). The resulting annotation field in the output VCF is `clinvar_CLNSIG`.
`gnomAD_AF` is the field in the output VCF for gnomAD AF annotations.
ETL
Submit a job to Spark running in local mode
$ spark-submit \
--master local[*] \
--conf spark.local.dir=/tmppath \
--conf spark.driver.maxResultSize=XX \
--driver-memory=XX \
--packages=io.projectglow:glow-spark3_2.12:1.2.1 \
--class org.dnaerys.etl \
/path/dnaerys-ctl.jar \
--path /path/to/input_dir/with/vcfs \
--path2save /path/to/dnaerys_dataset \
--sinfo /path/samples.csv \
--notes "XXX w/ VEP annotations" \
--cohort <cohort_name> \
--rings 2 \
--grch38 \
--vep \
--cadd \
--clinsig clinvar_CLNSIG \
--gnomad gnomAD_AF \
--skipnotannotated
- if VCFs are not annotated (or you want to skip annotations) remove the last 5 lines
- `--conf` and `--driver-memory` are optional
License
Show the license
$ scala -cp /path/dnaerys-ctl.jar org.dnaerys.license --show
Accept the license
$ scala -cp /path/dnaerys-ctl.jar org.dnaerys.license --accept --path /path/dnaerys_dataset
Cluster Installation
docker pull dnaerys/dnaerys
Starting Cluster
Run Docker Compose with configuration below to start a cluster with 2 nodes
version: '3.5'
networks:
cluster-network:
services:
node0:
networks:
- cluster-network
image: dnaerys/dnaerys
ports:
- '8001:8000'
- '8081:8081'
volumes:
- /path/to/dnaerys_dataset:/dnaerys/dataset
shm_size: '1gb'
environment:
CLUSTER_HOST: node0
CLUSTER_SEED_HOST: node0
JAVA_OPTS: '--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED'
RING_ID: ring0
node1:
networks:
- cluster-network
image: dnaerys/dnaerys
ports:
- '8002:8000'
volumes:
- /path/to/dnaerys_dataset:/dnaerys/dataset
shm_size: '1gb'
environment:
CLUSTER_HOST: node1
CLUSTER_SEED_HOST: node0
JAVA_OPTS: '--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED'
RING_ID: ring1
Querying
A side note: gRPCurl is a handy CLI util to interact with gRPC services. We'll use it in examples below.
- All variants in TP53
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "assembly":"GRCh37"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion
- All transcript variants with high impact in TP53
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "ann": {"ftypes":["TRANSCRIPT"], "impact":["HIGH"]}, "assembly":"GRCh37"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion
- All pathogenic homozygous variants in sample_name in BRCA2
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"chr":"13", "start":"32315086", "end":"32400268", "hom":"true", "samples":"<sample_name>", "ann": {"clnsgn":["PATHOGENIC"]}, "assembly":"GRCh37"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegionInVirtualCohort
- All samples with pathogenic homozygous variants in transcripts in TP53
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "ann": {"ftypes":["TRANSCRIPT"], "clnsgn":["PATHOGENIC"]}, "assembly":"GRCh37"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/SelectSamplesInRegion
Querying: QC
- Reported vs observed sex mismatch check for all samples in cohort
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"cohort_name":"<cohort_name>"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/SexMismatchCheck
- Identical twins and sample duplications in all possible pairs in cohort
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"cohort_name":"<cohort_name>", "degree":"TWINS_MONOZYGOTIC"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/Kinship
- All related pairs up to 3rd degree
grpcurl \
-plaintext \
-proto dnaerys.proto \
-d '{"cohort_name":"<cohort_name>", "degree":"THIRD_DEGREE"}' \
localhost:8001 \
org.dnaerys.cluster.grpc.DnaerysService/Kinship
Querying w/ authz
JWT-based authorization has to be enabled before starting a node, either in `application.conf` or by passing the environment variables `GRPC_AUTHZ_METHOD`, `GRPC_JWT_ALGO` and `GRPC_JWT_VALIDATION_KEY`:
version: '3.5'
networks:
cluster-network:
services:
node0:
networks:
- cluster-network
image: dnaerys/dnaerys
ports:
- '8001:8000'
- '8081:8081'
volumes:
- /path/to/dnaerys_dataset:/dnaerys/dataset
shm_size: '1gb'
environment:
CLUSTER_HOST: node0
CLUSTER_SEED_HOST: node0
JAVA_OPTS: '--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED'
RING_ID: ring0
GRPC_AUTHZ_METHOD: 'JWT'
GRPC_JWT_ALGO: 'RS256'
GRPC_JWT_VALIDATION_KEY: '<key>'
node1:
networks:
- cluster-network
image: dnaerys/dnaerys
ports:
- '8002:8000'
volumes:
- /path/to/dnaerys_dataset:/dnaerys/dataset
shm_size: '1gb'
environment:
CLUSTER_HOST: node1
CLUSTER_SEED_HOST: node0
JAVA_OPTS: '--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED'
RING_ID: ring1
GRPC_AUTHZ_METHOD: 'JWT'
GRPC_JWT_ALGO: 'HS256'
GRPC_JWT_VALIDATION_KEY: '<key>'
grpcurl \
-rpc-header "jwt: <JWT>" \
-plaintext \
-proto dnaerys.proto \
-d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "assembly":"GRCh37"}' \
localhost:8002 \
org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion