---
title: "7. The Serotype Database API: Serotype Definition and Sequence Matching"
author: "William Lane MD, PhD, A(ACHI)"
format: 
  html:
    code-fold: false
    toc: true
    theme: cosmo
execute:
  warning: false
  message: false
---

In this document, we'll demonstrate how to query the Serotype Database API using two additional endpoints:

1. `serotypeSequences` - To retrieve amino acid definitions for serotypes.
2. `serotypeBestSequenceMatch` - To find serotypes matching specific amino acid patterns.

We'll explore several use cases:

1. Finding serotype definitions for a specific locus (e.g., locus A).
2. Getting the amino acid definition for a specific serotype (e.g., A0201).
3. Filtering serotypes by specific amino acid properties.
4. Finding the best sequence matches based on multiple amino acid criteria.

## 7.1 Setup Packages

```{r}
#| label: setup-packages-7

# Purpose: install (if needed) and attach every package this document uses,
# load variables from a local .env file when one is present, and settle the
# `filter()` name conflict in favour of dplyr.

# Clear everything
rm(list = ls())

# Install required packages if not already installed
options(repos = c(CRAN = "https://cloud.r-project.org"))
required_packages <- c(
  "httr", "jsonlite", "conflicted", "dplyr", "knitr", "kableExtra", "dotenv"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load packages
library(httr)
library(jsonlite)
library(conflicted)
library(dplyr)
library(knitr)
library(kableExtra)
library(dotenv)

# Load environment variables from .env, but only when the file exists:
# the API key may instead come from the real environment or from the manual
# override, and a missing .env file should not abort the document.
if (file.exists(".env")) {
  load_dot_env()
}

# Resolve conflicts
conflict_prefer("filter", "dplyr")
```

## 7.2 Set API Key

To query the Serotype Database API, you will need an API key, which is available for free by signing up for an account at <https://www.serotype.org/user>.

```{r}
#| label: set-api-key-7

# Purpose: resolve the API key (environment variable first, manual override
# second) and define the GraphQL endpoint URL used by every query below.

# Check for the API key in environment variables (NA when the variable is unset)
apiKey <- Sys.getenv("SEROTYPE_API_KEY", unset = NA)

# Allow manual override of the API key by user here
apiKeyOverride <- ""  # Set this to your API key manually if not using environment variables

# Use the override if provided, otherwise use the environment variable value
if (!is.null(apiKeyOverride) && nzchar(apiKeyOverride)) {
  apiKey <- apiKeyOverride
}

# Make a missing key visible instead of silently sending an unusable header.
# This is a notice rather than an error so the document still renders.
if (is.na(apiKey) || !nzchar(apiKey)) {
  cat(
    "No API key found: set the SEROTYPE_API_KEY environment variable",
    "or fill in apiKeyOverride. Queries below are likely to return no data.\n"
  )
}

# Set API URL
url <- "https://serotype.org/api/graphql"
```

## 7.3 Using serotypeSequences

The `serotypeSequences` endpoint allows you to query the amino acid definitions that define specific serotypes.

### 7.3.1 All Serotype Definitions for Locus A

This query returns all serotype definitions for locus A:

```{r}
#| label: all-locus-a-definitions-7

# Purpose: fetch every serotype definition for locus A and show, for the
# first 10 serotypes, the amino acids flagged as part of the definition
# formatted as "position:residue, position:residue, ...".

query_locus_a <- '
query {
  serotypeSequences(
    loci: ["A"]
  ) {
    locus
    serotype
    protein {
      aminoAcid {
        position
        residue
        isDefinition
      }
    }
  }
}
'

resp_locus_a <- POST(
  url,
  body = list(query = query_locus_a),
  encode = "json",
  add_headers(`x-api-key` = apiKey)
)

# Parse the response (explicit encoding avoids httr guessing it)
data_locus_a <- fromJSON(
  content(resp_locus_a, "text", encoding = "UTF-8"),
  flatten = TRUE
)

# An empty result can arrive as NULL, an empty list, or a zero-row data
# frame depending on how jsonlite simplifies it, so require a non-empty
# data frame before processing.
serotypes_df <- data_locus_a$data$serotypeSequences
if (is.data.frame(serotypes_df) && nrow(serotypes_df) > 0) {
  # Report the number of serotypes
  cat("Number of serotypes found for locus A:", nrow(serotypes_df), "\n\n")

  # Build one definition string per serotype. After flattening, the amino
  # acid records are read from the list column `protein.aminoAcid`.
  definitions <- vapply(
    serotypes_df$protein.aminoAcid,
    function(amino_acids) {
      if (!is.data.frame(amino_acids)) {
        return("No definition amino acids")
      }
      # `%in% TRUE` treats a missing isDefinition flag as "not a definition"
      # rather than producing all-NA rows.
      definition_aa <- amino_acids[
        amino_acids$isDefinition %in% TRUE, ,
        drop = FALSE
      ]
      if (nrow(definition_aa) == 0) {
        return("No definition amino acids")
      }
      paste(
        paste(definition_aa$position, definition_aa$residue, sep = ":"),
        collapse = ", "
      )
    },
    character(1)
  )

  # Assemble the table in one step instead of growing it row by row
  result_table <- data.frame(
    Locus = serotypes_df$locus,
    Serotype = serotypes_df$serotype,
    Definition = definitions,
    stringsAsFactors = FALSE
  )

  # Display the first 10 serotypes with their definitions
  kable(head(result_table, 10), caption = "Serotypes with their amino acid definitions") %>%
    kable_styling()
} else {
  cat("No serotype definitions returned for locus A.\n")
}
```

### 7.3.2 Definition for a Specific Serotype (A0201)

This query retrieves the amino acid definition specifically for the A0201 serotype:

```{r}
#| label: specific-serotype-definition-7
#| results: asis

# Purpose: fetch the amino acid records for serotype A0201 and display
# (1) all amino acids and (2) only those flagged as part of the definition.
#
# The tables are produced inside a for loop, where values are not
# auto-printed. They are therefore printed explicitly, and the chunk uses
# `results: asis` so the printed HTML is passed through to the document.

query_serotype <- '
query {
  serotypeSequences(
    serotypes: ["A0201"]
  ) {
    locus
    serotype
    protein {
      aminoAcid {
        position
        residue
        isDefinition
      }
    }
  }
}
'

resp_serotype <- POST(
  url,
  body = list(query = query_serotype),
  encode = "json",
  add_headers(`x-api-key` = apiKey)
)

# Parse the response (explicit encoding avoids httr guessing it)
data_serotype <- fromJSON(
  content(resp_serotype, "text", encoding = "UTF-8"),
  flatten = TRUE
)

# Process specific serotype data (defensive: an empty result from the API
# comes back as NULL, an empty list, or a zero-row data frame depending
# on how jsonlite simplifies it).
serotypes_df <- data_serotype$data$serotypeSequences
if (is.data.frame(serotypes_df) && nrow(serotypes_df) > 0) {
  # Process each serotype row
  for (i in seq_len(nrow(serotypes_df))) {
    serotype_info <- serotypes_df[i, ]
    cat("\nSerotype:", serotype_info$serotype, "\n")
    cat("\n")  # blank line so the text and the table are separate blocks

    # Access amino acid data
    amino_acids <- serotype_info$protein.aminoAcid[[1]]
    if (is.data.frame(amino_acids)) {
      # Show all amino acids
      all_aa_table <- kable(
        amino_acids,
        caption = paste("All amino acids for", serotype_info$serotype)
      ) %>%
        kable_styling() %>%
        scroll_box(height = "300px")
      print(all_aa_table)
      cat("\n\n")

      # Keep only the definition amino acids; `%in% TRUE` treats a missing
      # flag as "not a definition" instead of yielding all-NA rows.
      definition_aa <- amino_acids[
        amino_acids$isDefinition %in% TRUE, ,
        drop = FALSE
      ]

      if (nrow(definition_aa) > 0) {
        definition_table <- kable(
          definition_aa,
          caption = paste("Definition amino acids for", serotype_info$serotype)
        ) %>%
          kable_styling()
        print(definition_table)
        cat("\n\n")

        # Create definition string
        definition_string <- paste(
          paste(definition_aa$position, definition_aa$residue, sep = ":"),
          collapse = ", "
        )

        cat("\nSerotype", serotype_info$serotype, "definition:", definition_string, "\n")
      } else {
        cat("No definition amino acids found for this serotype\n")
      }
    } else {
      cat("No amino acid data found for this serotype\n")
    }
  }
} else {
  cat("No data returned for the requested serotype.\n")
}
```

### 7.3.3 Filtering Serotypes by Specific Amino Acid Properties

This query finds serotypes where position 43 has residue Q (glutamine) and is part of the definition. The API itself performs the filtering based on the criteria provided in `proteinFilters`:

```{r}
#| label: filter-by-amino-acid-7

# Purpose: ask the API for locus A serotypes whose definition includes Q at
# position 43, report how many were found, and list up to 10 of them.

query_aa_filter <- '
query {
  serotypeSequences(
    loci: ["A"], 
    proteinFilters: [{position: "43", residue: "Q", isDefinition: true}]
  ) {
    locus
    serotype
  }
}
'

resp_aa_filter <- POST(
  url,
  body = list(query = query_aa_filter),
  encode = "json",
  add_headers(`x-api-key` = apiKey)
)

# Parse the response (explicit encoding avoids httr guessing it)
data_aa_filter <- fromJSON(
  content(resp_aa_filter, "text", encoding = "UTF-8"),
  flatten = TRUE
)

# Extract and display the results
serotypes_df <- data_aa_filter$data$serotypeSequences
if (!is.null(serotypes_df)) {
  # Count the number of serotypes found. An empty JSON array is parsed as
  # an empty list (not a data frame), for which nrow() would be NULL, so
  # anything that is not a data frame counts as zero matches.
  serotype_count <- if (is.data.frame(serotypes_df)) nrow(serotypes_df) else 0L
  cat("Number of serotypes with Q at position 43 as part of definition:", serotype_count, "\n\n")

  # Display the results (up to 10 rows)
  if (serotype_count > 0) {
    # Create a simple table with just locus and serotype
    result_table <- serotypes_df[, c("locus", "serotype")]

    kable(head(result_table, 10),
          caption = paste("Serotypes with Q at position 43 (showing first",
                          min(10, nrow(result_table)), "of", nrow(result_table), ")")) %>%
      kable_styling()
  } else {
    cat("No serotypes found with Q at position 43 as part of the definition.\n")
  }
}
```

## 7.4 Using serotypeBestSequenceMatch

The `serotypeBestSequenceMatch` endpoint helps find serotypes that best match a specific amino acid sequence pattern. This is useful for identifying which serotype a sequence might represent.

### 7.4.1 Finding Best Matches with Multiple Amino Acid Criteria

This query finds serotypes that best match a sequence with specific amino acid requirements at three positions:

- Q at position 43 (must be part of definition)
- A at position 44 (must be part of definition)
- I at position 73 (must be part of definition)

```{r}
#| label: best-sequence-match-7

# Purpose: ask the API for the locus A serotypes that best match
# Q43 + A44 + I73 (all as definition residues), then show
# (1) a summary table sorted by sequence identity and
# (2) a per-position table marking which of the three criteria each
#     returned serotype satisfies.
#
# Each table is produced by its own top-level statement: only the value of
# a top-level expression is auto-printed, so a table created in the middle
# of a larger `if` block would never be displayed.

query_sequence_match <- '
query {
  serotypeBestSequenceMatch(
    loci: ["A"],
    proteinFilters: [
      {position: "43", residue: "Q", isDefinition: true},
      {position: "44", residue: "A", isDefinition: true}, 
      {position: "73", residue: "I", isDefinition: true}
    ]
  ) {
    locus
    serotype
    protein {
      aminoAcid {
        position
        residue
        isDefinition
      }
    }
    sequenceIdentity
  }
}
'

resp_sequence_match <- POST(
  url,
  body = list(query = query_sequence_match),
  encode = "json",
  add_headers(`x-api-key` = apiKey)
)

# Parse the response (explicit encoding avoids httr guessing it)
data_sequence_match <- fromJSON(
  content(resp_sequence_match, "text", encoding = "UTF-8"),
  flatten = TRUE
)

# Summarise one position of one serotype as "<mark> (<residue>)".
#   amino_acids      data frame with position, residue and isDefinition
#   position         position to look up (compared with `==`)
#   expected_residue residue required at that position
# Returns "✓ (R)" when the residue matches and is flagged as a definition
# residue, "✗ (R)" when the position exists but does not qualify, and
# "✗ (?)" when the position is absent. Only the first record for a
# position is used.
position_summary <- function(amino_acids, position, expected_residue) {
  if (!is.data.frame(amino_acids)) {
    return("✗ (?)")
  }
  # which() drops NA comparisons instead of producing all-NA rows
  hit <- amino_acids[which(amino_acids$position == position), , drop = FALSE]
  if (nrow(hit) == 0) {
    return("✗ (?)")
  }
  is_match <- isTRUE(hit$residue[1] == expected_residue) &&
    isTRUE(hit$isDefinition[1])
  paste0(if (is_match) "✓" else "✗", " (", hit$residue[1], ")")
}

# Extract the results. An empty JSON array is parsed as an empty list (not
# a data frame), so anything that is not a data frame counts as zero matches.
matches_df <- data_sequence_match$data$serotypeBestSequenceMatch
match_count <- if (is.data.frame(matches_df)) nrow(matches_df) else 0L
has_matches <- match_count > 0

if (is.null(matches_df)) {
  cat("No data returned from the sequence match query.\n")
} else {
  cat("Number of sequence matches found:", match_count, "\n\n")
  if (!has_matches) {
    cat("No matching sequences found.\n")
  }
}

# Table 1: serotype and sequence identity, best match first
if (has_matches) {
  basic_results <- data.frame(
    Locus = matches_df$locus,
    Serotype = matches_df$serotype,
    SequenceIdentity = matches_df$sequenceIdentity,
    stringsAsFactors = FALSE
  ) %>%
    arrange(desc(SequenceIdentity))

  kable(basic_results, caption = "Top sequence matches for Q43 + A44 + I73") %>%
    kable_styling()
}

# Table 2: per-position match details, best match first
if (has_matches) {
  matching_positions <- data.frame(
    Serotype = matches_df$serotype,
    SequenceIdentity = matches_df$sequenceIdentity,
    Pos43 = vapply(
      matches_df$protein.aminoAcid, position_summary, character(1),
      position = "43", expected_residue = "Q"
    ),
    Pos44 = vapply(
      matches_df$protein.aminoAcid, position_summary, character(1),
      position = "44", expected_residue = "A"
    ),
    Pos73 = vapply(
      matches_df$protein.aminoAcid, position_summary, character(1),
      position = "73", expected_residue = "I"
    ),
    stringsAsFactors = FALSE
  ) %>%
    arrange(desc(SequenceIdentity))

  kable(matching_positions,
        caption = "Match details by position (✓ match, ✗ mismatch)") %>%
    kable_styling()
}
```

## 7.5 Summary

In this document, we demonstrated how to use the `serotypeSequences` and `serotypeBestSequenceMatch` API endpoints to:

1. Retrieve amino acid definitions for serotypes at a specific locus.
2. Find the amino acid definition for a specific serotype.
3. Filter serotypes based on specific amino acid properties.
4. Find the best sequence matches based on multiple amino acid criteria.

These examples showcase how to interact with the Serotype Database API to gain insights into the amino acid compositions that define serotypes and find sequence matches for specific patterns.