'Number of items to replace is not a multiple of replacement length when using a for loop

First, I parsed the AAChange.refGene column from the variant_calls dataframe and then extract the Refseq ID, cDNA level change, and Protein level change information.

library(tidyr)
variant_calls = read.delim("variant_calls.txt", sep="\t")
info = tidyr::separate(variant_calls["AAChange.refGene"], AAChange.refGene, c("Refseq ID", "cDNA level change", "Protein level change"), ":")
df = cbind(df["Gene.refGene"],info) 

Then, I extract the peptide sequence from hg19 (by refseq accession) and construct a 9mer peptide, where the mutation is in the middle (i.e., 4AAs on each side of the mutation).

library(biomaRt)
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl", host="https://grch37.ensembl.org", path="/biomart/martservice")
for(i in 1:length(df$`Refseq ID`)){
  seq <- getSequence(id=df$`Refseq ID`[i], type="refseq_mrna", seqType="peptide", mart=ensembl)
  seq <- sapply(seq$peptide, nchar)
  seq <- sort(seq, decreasing=TRUE)
  pep[i] <- seq
}

Warning:

1: In pep[i] <- seq:
  number of items to replace is not a multiple of replacement length

Dataframe

> dput(df)
structure(list(Gene.refGene = c("PRELP", "TRIM17", "CTNNB1", 
"C3orf30", "CLIP2", "TRIM21", "STARD10", "SYTL2", "SIRT4", "AGBL1", 
"PRSS8", "CDH11", "RAD51C", "ARHGEF1", "POU3F4", "PRELP", "TRIM17", 
"CTNNB1", "C3orf30", "CLIP2", "TRIM21", "STARD10", "SYTL2", "SIRT4", 
"AGBL1", "PRSS8", "CDH11", "RAD51C", "ARHGEF1", "POU3F4", "PRELP", 
"TRIM17", "CTNNB1", "C3orf30", "CLIP2", "TRIM21", "STARD10", 
"SYTL2", "SIRT4", "AGBL1", "PRSS8", "CDH11", "RAD51C", "ARHGEF1", 
"POU3F4"), `Refseq ID` = c("NM_002725", "NM_001024940", "NM_001098209", 
"NM_152539", "NM_032421", "NM_003141", "NM_006645", "NM_206927", 
"NM_012240", "NM_152336", "NM_002773", "NM_001797", "NM_058216", 
"NM_198977", "NM_000307", "NM_002725", "NM_001024940", "NM_001098209", 
"NM_152539", "NM_032421", "NM_003141", "NM_006645", "NM_206927", 
"NM_012240", "NM_152336", "NM_002773", "NM_001797", "NM_058216", 
"NM_198977", "NM_000307", "NM_002725", "NM_001024940", "NM_001098209", 
"NM_152539", "NM_032421", "NM_003141", "NM_006645", "NM_206927", 
"NM_012240", "NM_152336", "NM_002773", "NM_001797", "NM_058216", 
"NM_198977", "NM_000307"), `cDNA level change` = c("c.C301T", 
"c.T1054A", "c.T109C", "c.G955A", "c.A2422G", "c.G431A", "c.C749T", 
"c.C778A", "c.G209A", "c.A382C", "c.G750C", "c.C2125T", "c.C797A", 
"c.C1543T", "c.C356T", "c.C301T", "c.T1054A", "c.T109C", "c.G955A", 
"c.A2422G", "c.G431A", "c.C749T", "c.C778A", "c.G209A", "c.A382C", 
"c.G750C", "c.C2125T", "c.C797A", "c.C1543T", "c.C356T", "c.C301T", 
"c.T1054A", "c.T109C", "c.G955A", "c.A2422G", "c.G431A", "c.C749T", 
"c.C778A", "c.G209A", "c.A382C", "c.G750C", "c.C2125T", "c.C797A", 
"c.C1543T", "c.C356T"), `Protein level change` = c("p.P101S", 
"p.Y352N", "p.S37P", "p.E319K", "p.T808A", "p.G144E", "p.S250L", 
"p.P260T", "p.G70E", "p.K128Q", "p.W250C", "p.R709W", "p.A266D", 
"p.R515W", "p.A119V", "p.P101S", "p.Y352N", "p.S37P", "p.E319K", 
"p.T808A", "p.G144E", "p.S250L", "p.P260T", "p.G70E", "p.K128Q", 
"p.W250C", "p.R709W", "p.A266D", "p.R515W", "p.A119V", "p.P101S", 
"p.Y352N", "p.S37P", "p.E319K", "p.T808A", "p.G144E", "p.S250L", 
"p.P260T", "p.G70E", "p.K128Q", "p.W250C", "p.R709W", "p.A266D", 
"p.R515W", "p.A119V")), class = "data.frame", row.names = c(NA, 
-45L))


Solution 1:[1]

You need to declare pep prior to the for loop, and it is good practice to also declare it's length:

pep <- numeric(length(df$`Refseq ID`))
for(i in 1:length(df$`Refseq ID`)){
  seq <- getSequence(id=df$`Refseq ID`[i], type="refseq_mrna", seqType="peptide", mart=ensembl)
  seq <- sapply(seq$peptide, nchar)
  seq <- sort(seq, decreasing=TRUE)
  pep[i] <- seq
}

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Robert Long