Simple steps to import a dataset and start querying it.


Prerequisites


Annotations

This is optional. Dnaerys can parse standard VEP annotations, together with CADD, ClinVar & gnomAD annotations if desired. For SIFT & PolyPhen, both the prediction and the score values are used.

A typical annotation command line (the last 3 lines are optional and use custom CADD, ClinVar & gnomAD databases, not the ones shipped with VEP):

# Annotate a VCF with VEP using the offline GRCh38 cache and write VCF output.
# '--sift b' and '--polyphen b' request both the prediction term and the score.
# The last three options are optional. '--custom ...,vcf,...' expects a
# tabix-indexed VCF file, so the gnomAD source must be a VCF, not a TSV.
/path/vep \
  --format vcf \
  -i /path/vcf.gz \
  --vcf \
  -o /path/annotated.vcf \
  --verbose \
  --cache \
  --dir /path/vep/cache \
  --offline \
  --fork XX \
  --force_overwrite \
  --assembly GRCh38 \
  --terms SO \
  --variant_class \
  --no_stats \
  --gencode_basic \
  --biotype \
  --sift b \
  --polyphen b \
  --plugin CADD,/path/cadd/GRCh38/whole_genome_SNVs.tsv.gz \
  --custom /path/clinvar/GRCh38/clinvar.vcf.gz,clinvar,vcf,exact,0,CLNSIG \
  --custom /path/gnomad/r3/gnomad.genomes.r3.0.sites.vcf.bgz,gnomAD,vcf,exact,0,AF

ETL

Submit a job to Spark running in local mode

# Run the Dnaerys ETL (org.dnaerys.etl) on Spark in local mode.
# Options before the jar configure Spark; options after it go to the ETL job.
# XX and <cohort_name> are placeholders to fill in.
$ spark-submit \
    --class org.dnaerys.etl \
    --master local[*] \
    --packages io.projectglow:glow-spark3_2.12:1.2.1 \
    --driver-memory XX \
    --conf spark.driver.maxResultSize=XX \
    --conf spark.local.dir=/tmppath \
    /path/dnaerys-ctl.jar \
    --path /path/to/input_dir/with/vcfs \
    --path2save /path/to/dnaerys_dataset \
    --sinfo /path/samples.csv \
    --notes "XXX w/ VEP annotations" \
    --cohort <cohort_name> \
    --rings 2 \
    --grch38 \
    --vep \
    --cadd \
    --clinsig clinvar_CLNSIG \
    --gnomad gnomAD_AF \
    --skipnotannotated

License

Show the license

$ scala -classpath /path/dnaerys-ctl.jar org.dnaerys.license --show

Accept the license

# --path should be the dataset directory written by the ETL step (its --path2save value).
$ scala -cp /path/dnaerys-ctl.jar org.dnaerys.license --accept --path /path/to/dnaerys_dataset

Cluster Installation

docker image pull dnaerys/dnaerys

Starting Cluster

Run Docker Compose with the configuration below to start a cluster with 2 nodes

# Docker Compose definition of a two-node cluster on a shared network.
version: '3.5'

networks:
  cluster-network:

services:

  # node0 is the seed node: both nodes set CLUSTER_SEED_HOST to node0.
  node0:
    image: dnaerys/dnaerys
    networks:
      - cluster-network
    # Container port 8000 is published on host port 8001.
    ports:
      - '8001:8000'
      - '8081:8081'
    # Both nodes mount the same dataset directory.
    volumes:
      - /path/to/dnaerys_dataset:/dnaerys/dataset
    shm_size: '1gb'
    environment:
      CLUSTER_HOST: node0
      CLUSTER_SEED_HOST: node0
      RING_ID: ring0
      # Folded scalar: the lines below are joined with single spaces into
      # one option string.
      JAVA_OPTS: >-
        --add-opens java.base/java.lang=ALL-UNNAMED
        --add-opens java.base/java.lang.invoke=ALL-UNNAMED
        --add-opens=java.base/java.net=ALL-UNNAMED
        --add-opens=java.base/java.nio=ALL-UNNAMED
        --add-opens=java.base/java.time=ALL-UNNAMED
        --add-opens=java.base/java.util=ALL-UNNAMED
        --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED
        --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
        --add-opens=java.base/sun.util.calendar=ALL-UNNAMED

  # node1 joins the cluster through the seed node and serves ring1.
  node1:
    image: dnaerys/dnaerys
    networks:
      - cluster-network
    # Container port 8000 is published on host port 8002.
    ports:
      - '8002:8000'
    volumes:
      - /path/to/dnaerys_dataset:/dnaerys/dataset
    shm_size: '1gb'
    environment:
      CLUSTER_HOST: node1
      CLUSTER_SEED_HOST: node0
      RING_ID: ring1
      JAVA_OPTS: >-
        --add-opens java.base/java.lang=ALL-UNNAMED
        --add-opens java.base/java.lang.invoke=ALL-UNNAMED
        --add-opens=java.base/java.net=ALL-UNNAMED
        --add-opens=java.base/java.nio=ALL-UNNAMED
        --add-opens=java.base/java.time=ALL-UNNAMED
        --add-opens=java.base/java.util=ALL-UNNAMED
        --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED
        --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
        --add-opens=java.base/sun.util.calendar=ALL-UNNAMED

Querying

A side note: gRPCurl is a handy CLI util to interact with gRPC services. We'll use it in examples below.

# The examples below query a dataset imported as GRCh38; the coordinates are
# GRCh38 coordinates, so the request assembly must be GRCh38 as well.

# Variants (hom and het) in chr17:7661779-7687538.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "assembly":"GRCh38"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion
# Same region, restricted by annotation: feature type TRANSCRIPT, impact HIGH.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "ann": {"ftypes":["TRANSCRIPT"], "impact":["HIGH"]}, "assembly":"GRCh38"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion
# Hom variants in chr13:32315086-32400268 for the given sample(s), with
# clinical significance PATHOGENIC.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"chr":"13", "start":"32315086", "end":"32400268", "hom":"true", "samples":"<sample_name>", "ann": {"clnsgn":["PATHOGENIC"]}, "assembly":"GRCh38"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegionInVirtualCohort
# Samples matching hom, TRANSCRIPT, PATHOGENIC filters in chr17:7661779-7687538.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "ann": {"ftypes":["TRANSCRIPT"], "clnsgn":["PATHOGENIC"]}, "assembly":"GRCh38"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/SelectSamplesInRegion

Querying: QC

# Replace <cohort_name> with the cohort name. It must stay inside double
# quotes so that the -d payload is valid JSON.

# Sex mismatch check for the cohort.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"cohort_name":"<cohort_name>"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/SexMismatchCheck
# Kinship query with degree TWINS_MONOZYGOTIC.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"cohort_name":"<cohort_name>", "degree":"TWINS_MONOZYGOTIC"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/Kinship
# Kinship query with degree THIRD_DEGREE.
grpcurl \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"cohort_name":"<cohort_name>", "degree":"THIRD_DEGREE"}' \
  localhost:8001 \
  org.dnaerys.cluster.grpc.DnaerysService/Kinship

Querying w/ authz

JWT-based authorization must be enabled before starting a node - either in application.conf or by passing the environment variables GRPC_AUTHZ_METHOD, GRPC_JWT_ALGO and GRPC_JWT_VALIDATION_KEY

# Docker Compose definition of a two-node cluster with JWT-based
# authorization enabled through environment variables.
version: '3.5'

networks:
  cluster-network:

services:

  # node0 is the seed node: both nodes set CLUSTER_SEED_HOST to node0.
  node0:
    networks:
      - cluster-network
    image: dnaerys/dnaerys
    ports:
      - '8001:8000'
      - '8081:8081'
    volumes:
      - /path/to/dnaerys_dataset:/dnaerys/dataset
    shm_size: '1gb'
    environment:
      CLUSTER_HOST: node0
      CLUSTER_SEED_HOST: node0
      # Folded scalar: the lines below are joined with single spaces into
      # one option string.
      JAVA_OPTS: >-
        --add-opens java.base/java.lang=ALL-UNNAMED
        --add-opens java.base/java.lang.invoke=ALL-UNNAMED
        --add-opens=java.base/java.net=ALL-UNNAMED
        --add-opens=java.base/java.nio=ALL-UNNAMED
        --add-opens=java.base/java.time=ALL-UNNAMED
        --add-opens=java.base/java.util=ALL-UNNAMED
        --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED
        --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
        --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
      RING_ID: ring0
      # The JWT settings are kept identical on both nodes so that a token
      # is validated the same way whichever node receives the request.
      # NOTE(review): confirm HS256 is the intended algorithm for <key>.
      GRPC_AUTHZ_METHOD: 'JWT'
      GRPC_JWT_ALGO: 'HS256'
      GRPC_JWT_VALIDATION_KEY: '<key>'

  node1:
    networks:
      - cluster-network
    image: dnaerys/dnaerys
    ports:
      - '8002:8000'
    volumes:
      - /path/to/dnaerys_dataset:/dnaerys/dataset
    shm_size: '1gb'
    environment:
      CLUSTER_HOST: node1
      CLUSTER_SEED_HOST: node0
      JAVA_OPTS: >-
        --add-opens java.base/java.lang=ALL-UNNAMED
        --add-opens java.base/java.lang.invoke=ALL-UNNAMED
        --add-opens=java.base/java.net=ALL-UNNAMED
        --add-opens=java.base/java.nio=ALL-UNNAMED
        --add-opens=java.base/java.time=ALL-UNNAMED
        --add-opens=java.base/java.util=ALL-UNNAMED
        --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED
        --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
        --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
      RING_ID: ring1
      # Same JWT settings as node0.
      GRPC_AUTHZ_METHOD: 'JWT'
      GRPC_JWT_ALGO: 'HS256'
      GRPC_JWT_VALIDATION_KEY: '<key>'
# Same region query as in the unauthenticated examples, with the JWT passed
# as an RPC header. Every continued line must end with a backslash.
grpcurl \
  -rpc-header "jwt: <JWT>" \
  -plaintext \
  -proto dnaerys.proto \
  -d '{"chr":"17", "start":"7661779", "end":"7687538", "hom":"true", "het":"true", "assembly":"GRCh38"}' \
  localhost:8002 \
  org.dnaerys.cluster.grpc.DnaerysService/SelectVariantsInRegion