4.1 Conversion with biomaRt

## Install missing packages
# requireNamespace() only checks whether a package can be loaded; unlike
# require(), it does not attach the package to the search path as a side
# effect. Attaching is done explicitly with library() below.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# These packages are installed through BiocManager rather than
# install.packages()
bio_pkgs <- c("biomaRt", "Biostrings")
for (pkg_i in bio_pkgs) {
  if (!requireNamespace(pkg_i, quietly = TRUE)) {
    BiocManager::install(pkg_i)
  }
}
# remotes is needed for install_github()
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
# MSnID is installed from GitHub at the "pnnl-master" ref
if (!requireNamespace("MSnID", quietly = TRUE)) {
  remotes::install_github("PNNL-Comp-Mass-Spec/MSnID@pnnl-master")
}
## ------------------------
library(biomaRt) # ID conversion
library(Biostrings) # read FASTA files
library(MSnID) # parse_FASTA_names

The first step is to determine which mart and dataset to use. listMarts() shows the available marts, and listDatasets(mart) lists the datasets available in the chosen mart; only the first 6 rows of that table are shown below. (Use View, rather than head, to search the full table for the desired dataset.)

# Create mart
# Step 1: list the available marts to find the name to pass as `biomart`.
listMarts() # determine biomart for useMart
##                biomart                version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 106
## 2   ENSEMBL_MART_MOUSE      Mouse strains 106
## 3     ENSEMBL_MART_SNP  Ensembl Variation 106
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 106
# Step 2: connect to the chosen mart without a dataset so that its datasets
# can be listed. Only the first 6 rows are printed here.
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL")
head(listDatasets(mart)) # determine dataset for useMart
##                        dataset                           description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2     acalliptera_gene_ensembl      Eastern happy genes (fAstCal1.2)
## 3   acarolinensis_gene_ensembl       Green anole genes (AnoCar2.0v2)
## 4    acchrysaetos_gene_ensembl       Golden eagle genes (bAquChr1.2)
## 5    acitrinellus_gene_ensembl        Midas cichlid genes (Midas_v5)
## 6    amelanoleuca_gene_ensembl       Giant panda genes (ASM200744v2)
##       version
## 1 ASM259213v1
## 2  fAstCal1.2
## 3 AnoCar2.0v2
## 4  bAquChr1.2
## 5    Midas_v5
## 6 ASM200744v2
# Step 3: overwrite `mart` with a connection to the same mart, this time
# with the "rnorvegicus_gene_ensembl" dataset selected.
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", 
                dataset = "rnorvegicus_gene_ensembl")

Next, we determine which attributes to select for the conversion table with listAttributes(mart). From that table, we select “refseq_peptide” and “external_gene_name”.

# Create conversion table
# List the attributes (columns) that can be requested from the selected
# dataset. Only the first 6 rows are printed here.
head(listAttributes(mart)) # determine attributes for getBM
##                            name                  description         page
## 1               ensembl_gene_id               Gene stable ID feature_page
## 2       ensembl_gene_id_version       Gene stable ID version feature_page
## 3         ensembl_transcript_id         Transcript stable ID feature_page
## 4 ensembl_transcript_id_version Transcript stable ID version feature_page
## 5            ensembl_peptide_id            Protein stable ID feature_page
## 6    ensembl_peptide_id_version    Protein stable ID version feature_page
# Request the two attributes needed for the conversion. No filters are
# supplied, so the query is not restricted to particular IDs.
conv_tbl1 <- getBM(attributes = c("refseq_peptide", "external_gene_name"),
                   mart = mart)
# Preview the first 10 rows; note the empty cells in the output below.
head(conv_tbl1, 10)
##    refseq_peptide external_gene_name
## 1                                   
## 2                         AC118165.1
## 3    NP_001000130              Olr56
## 4    NP_001000302             Olr473
## 5                         AC099294.1
## 6                     AABR07054368.1
## 7                             Olr760
## 8    NP_001014048              Clrn3
## 9                     AABR07000137.1
## 10   NP_001011937              Doc2g

This table has many rows with blank entries (refseq_peptide is often empty, and sometimes external_gene_name is as well); these rows need to be removed.