ggseqlogo/ 0000755 0001762 0000144 00000000000 15122165032 012236 5 ustar ligges users ggseqlogo/MD5 0000644 0001762 0000144 00000003643 15122165032 012554 0 ustar ligges users 92e1ca2a64ea3bc1a14dc358e6a4f787 *DESCRIPTION e1876c5c8566167ffb84ca49b3ea9947 *NAMESPACE 5c257b11f7d81496b10282615c604b46 *R/align.r afc90f23fc14653df3ec75dc96289065 *R/col_schemes.r 556d261b07949c518c221e67a230f5e7 *R/ggseqlogo.r 1082182bf4333efeda6ce25e7c9dc94b *R/heights.r 3aa7c1106c297f202b1956c3b3175a28 *build/vignette.rds e59880f7c4372bb7673e5d6c914b4737 *data/ggseqlogo_sample.rda 6adbd54256b515e9991862c9ded21658 *inst/doc/ggseqlogo_introduction.Rmd ddf4fac801304a92649c5257d91ce03d *inst/doc/ggseqlogo_introduction.html cb142cc4e020d20d8436ddb65652b411 *inst/extdata/akrobat_bold.font 79db0ca99345902380bc0952a3054235 *inst/extdata/akrobat_regular.font ce5b35b35db1afa9e4619561f1879865 *inst/extdata/helvetica_bold.font 5b6bbd6635bfe178d0e7258387bcf815 *inst/extdata/helvetica_light.font f3349a4f2d585783642a1410d50c51cc *inst/extdata/helvetica_regular.font f0273808e002fe493d739012a9692e2c *inst/extdata/roboto_bold.font 47b0fe58a03526a9036834f6f71a11db *inst/extdata/roboto_medium.font d0f1bca492a4d30319a0e692dd9ee632 *inst/extdata/roboto_medium_orig.font 118514d7ef6d3bb592ed2dab0e6b5659 *inst/extdata/roboto_regular.font 7c75961b8420cc574790bc6db2348daf *inst/extdata/roboto_slab_bold.font 2482050fd5123611735b17534ad0136e *inst/extdata/roboto_slab_light.font 7c75961b8420cc574790bc6db2348daf *inst/extdata/roboto_slab_regular.font 902b0fe612e1d75598c1a982ce413312 *inst/extdata/xkcd_regular.font 549909e981c7782ff2027c0b2b59b263 *man/geom_logo.Rd 0a0d27e370a8ac24ef66e75779161e6e *man/ggseqlogo.Rd b0786bf889c1c0254caf36650bdcf35e *man/list_col_schemes.Rd f0f7f096819c9f4e8dc19931716108d9 *man/list_fonts.Rd 7a48916ec692e08a397c5312154da29d *man/make_col_scheme.Rd 8da484ec8701aec24954bc49500477b8 *man/pfms_dna.Rd 1fb69e3f042751001178feaaca5674b7 *man/seqs_aa.Rd ad28431426e1befc17ac76bf064fbbc7 *man/seqs_dna.Rd 1f287a60ec2fd29f54df046fb12047c6 *man/theme_logo.Rd 6adbd54256b515e9991862c9ded21658 *vignettes/ggseqlogo_introduction.Rmd ggseqlogo/R/ 0000755 0001762 0000144 00000000000 14561211641 012443 5 ustar ligges users ggseqlogo/R/align.r 0000644 0001762 0000144 00000002654 14561167632 013741 0 ustar ligges users # # pd = data.frame( # letters = strsplit("AGTGACCGACTATCATAGTGACCCAGAATCATAGTGACCGAGTATGAT", "")[[1]], # species = rep(c("Human", "Armadillo", "Porcupine"), each=16), # x = rep(1:16, 3), # change = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0, # 0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0), # score1 = c(0,0,0,0,0,0,1,1,2,2,2,3,3,3,4,3, # 0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), # score2 = c(0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0, # 0,0,0,0,2,2,2,2,0,0,0,0,0,0,0,0, # 0,0,0,0,3,3,3,3,0,0,0,0,0,0,0,0) # ) # # pd = subset(pd, x <= 15) # # # # p = ggplot(pd[pd$score1 != 0,], aes(x=x, y=species)) + # coord_fixed(ratio = 1.2, ylim=c(0.5, 3.5)) + # #geom_tile(aes(fill=score1)) + # #scale_fill_gradient2("Score 1", limits=c(0,4),low="#762A83", mid="white", high="#1B7837", guide=guide_colorbar(title.position="top")) + # geom_text(data=pd, aes(label=letters, color=factor(change)), size=rel(5), family="mono") + theme_logo()#+ # #scale_color_manual("Change", values=c("black", "#F2A11F"), labels=c("None", "Some"), guide=guide_legend(direction="vertical", title.position="top", override.aes=list(shape = "A"))) # # print(p) # # ll = z + guides(fill='none') # zz = p + scale_x_continuous(breaks=1:15) + theme(legend.position="none") # p3 = plot_grid(ll, zz, align = 'v', nrow = 2) # # print(p3) ggseqlogo/R/col_schemes.r 0000644 0001762 0000144 00000015205 14561167632 015127 0 ustar ligges users #' List color schemes available in ggseqlogo #' #' @param v If true, font names are printed to stderr. Otherwise, color scheme names are returned as a character vector #' @export list_col_schemes <- function(v=T){ col_schemes = c('auto', 'chemistry', 'chemistry2','hydrophobicity', 'nucleotide', 'nucleotide2', 'base_pairing', 'clustalx', 'taylor') if(!v) return(col_schemes) message('Available ggseqlogo color schemes:') for(f in col_schemes) message('\t', f) } # Get color scheme # @param col_scheme name of color scheme # @param seq_type sequence type of color scheme get_col_scheme = function(col_scheme, seq_type='auto'){ # Check if user-defined color scheme if(is.data.frame(col_scheme)){ if(!'ggseqlogo_cs' %in% class(col_scheme)) stop('Colour scheme must be generated using "make_col_scheme" function') return(col_scheme) } # Get ambigious colour scheme col_scheme = match.arg(col_scheme, list_col_schemes(F)) # Get default color scheme for sequence type if(col_scheme == 'auto'){ if(seq_type == 'auto') stop('"col_scheme" and "seq_type" cannot both be "auto"') col_scheme = switch(tolower(seq_type), aa = 'chemistry', dna = 'nucleotide', rna = 'nucleotide', other='nucleotide') } # Pick from default color schemes cs = switch(col_scheme, # Color scheme based on chemistry of amino acids chemistry2 = data.frame( letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), col = c(rep('#058644', 5), rep('#720091', 2), rep('#0046C5', 3), rep('#C5003E', 2), rep('#2E2E2E', 8)), stringsAsFactors = F ), # Color scheme based on chemistry of amino acids chemistry = data.frame( letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), stringsAsFactors = F ), # Hydrophobicity index (PMID: 7108955) from -4.5 to 4.5 hydrophobicity = data.frame( letter = c('I', 'V', 'L', 'F', 'C', 'M', 'A', 'G', 'T', 'W', 'S', 'Y', 'P', 'H', 'D', 'E', 'N', 'Q', 'K', 'R'), group = c(4.5, 4.2, 3.8, 2.8, 2.5, 1.9, 1.8, -0.4, -0.7, -0.9, -0.8, -1.3, -1.6, -3.2, -3.5, -3.5, -3.5, -3.5, -3.9, -4.5), stringsAsFactors=F ), # Colour based on nucleotide nucleotide2 = data.frame( letter = c('A', 'C', 'G', 'T', 'U'), col = c('darkgreen', 'blue', 'orange', 'red', 'red'), stringsAsFactors = F ), #alt red BA1200 nucleotide = data.frame( letter = c('A', 'C', 'G', 'T', 'U'), col = c('#109648', '#255C99', '#F7B32B', '#D62839', '#D62839'), stringsAsFactors = F ), base_pairing = data.frame( letter = c('A', 'T', 'U', 'G', 'C'), group = c(rep('Weak bonds', 3), rep('Strong bonds', 2)), col = c(rep('darkorange', 3), rep('blue', 2)), stringsAsFactors = F ), # ClustalX color scheme: # http://www.jalview.org/help/html/colourSchemes/clustal.html clustalx = data.frame( letter = c('W', 'L', 'V', 'I', 'M', 'F', 'A', 'R', 'K', 'T', 'S', 'N', 'Q', 'D', 'E', 'H', 'Y', 'C', 'G', 'P'), col = c(rep('#197FE5', 7), rep('#E53319', 2), rep('#19CC19', 4), rep('#CC4CCC', 2), rep('#19B2B2', 2), '#E57F7F', '#E5994C', '#B0B000'), stringsAsFactors = F ), # Taylor color scheme (PMID: 9342138) taylor = data.frame( letter = c('D','S','T','G','P','C','A','V','I','L','M','F','Y','W','H','R','K','N','Q','E'), col = c('#FF0000','#FF3300','#FF6600','#FF9900','#FFCC00','#FFFF00','#CCFF00','#99FF00', '#66FF00','#33FF00','#00FF00','#00FF66','#00FFCC','#00CCFF','#0066FF','#0000FF', '#6600FF','#CC00FF','#FF00CC','#FF0066'), stringsAsFactors = F ) ) if(!'group' %in% names(cs)) cs$group = cs$letter # Set attributes attr(cs, 'cs_label') = col_scheme class(cs) = c('data.frame','ggseqlogo_cs') return(cs) } #' Create new sequence logo color scheme #' #' @param chars Vector of one letter characters #' @param groups Vector of groups for letters with same length as chars (optional if cols parameter is provided) #' @param cols Vector of colors with same length as chars (optional if values parameter is provided) #' @param values Vector of numerical values with same length as chars #' @param name Name of color scheme #' #' @export #' #' @importFrom grDevices col2rgb #' @examples #' #' # Discrete color scheme examples #' cs1 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), groups=c('g1', 'g1', 'g2', 'g2'), #' cols=c('red', 'red', 'blue', 'blue'), name='custom1') #' #' cs2 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), cols=c('red', 'red', 'blue', 'blue'), #' name='custom2') #' #' # Quantitative color scheme #' cs3 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), values=1:4, name='custom3') make_col_scheme <- function(chars=NULL, groups=NULL, cols=NULL, values=NULL, name=''){ if(is.null(chars) | any(nchar(chars) != 1) | !is.character(chars)) stop('"chars" must be a character vector of one letter characters') if(is.null(values)){ # Discrete colour scheme # Error check lengths if(length(chars) != length(cols)) stop('"chars" and "cols" must have same length') # Error check types if(!is.character(cols)) stop('"cols" must be a character vector') # Check valid colours tmp = col2rgb(cols); rm(tmp) if(is.null(groups)) groups = chars cs = data.frame( letter=chars, group=groups, col=cols, stringsAsFactors = F ) }else{ # Quantitative color scheme if(length(chars) != length(values)) stop('"chars" and "values" must have same length') cs = data.frame( letter=chars, group=values, stringsAsFactors=F ) } # Remove duplicate letters cs = cs[!duplicated(cs$letter),] # Set attributes attr(cs, 'cs_label') = name class(cs) = c('data.frame','ggseqlogo_cs') return(cs) } ggseqlogo/R/ggseqlogo.r 0000644 0001762 0000144 00000026220 15122161634 014617 0 ustar ligges users # if(T){ # require(ggplot2) # setwd('~/Development/ggseqlogo/') # source('R/heights.r') # source('R/col_schemes.r') # GGSEQLOGO_FONT_BASE = '~/Development/ggseqlogo/inst/fonts/' # } # Change range of values newRange <- function(old_vals, new_min=0, new_max=1){ old_min = min(old_vals) old_max = max(old_vals) new_vals = (((old_vals - old_min) * (new_max - new_min)) / (old_max - old_min)) + new_min new_vals } #' List fonts available in ggseqlogo #' #' @param v If true, font names are printed to stderr. Otherwise, font names are returned as a character vector #' @export list_fonts <- function(v=T){ fonts = c('helvetica_regular','helvetica_bold', 'helvetica_light', 'roboto_medium','roboto_bold', 'roboto_regular', 'akrobat_bold', 'akrobat_regular', 'roboto_slab_bold', 'roboto_slab_regular', 'roboto_slab_light', 'xkcd_regular') if(!v) return(fonts) message('Available ggseqlogo fonts:') for(f in fonts) message('\t', f) } # Read font from file if not in global envir. get_font <- function(font){ GGSEQLOGO_FONT_BASE = getOption('GGSEQLOGO_FONT_BASE') if(is.null(GGSEQLOGO_FONT_BASE)){ GGSEQLOGO_FONT_BASE=system.file("extdata", "", package = "ggseqlogo") options(GGSEQLOGO_FONT_BASE=GGSEQLOGO_FONT_BASE) } #all_fonts = c('sf_bold', 'sf_regular', 'ms_bold', 'ms_regular', 'xkcd_regular') font = match.arg(tolower(font), list_fonts(F)) font_filename = paste0(font, '.font') font_obj_name = sprintf('.ggseqlogo_font_%s', font) font_obj = getOption(font_obj_name) if(is.null(font_obj)){ # Not loaded into global env yet - load it into options font_path = file.path(GGSEQLOGO_FONT_BASE, font_filename) font_obj_list = list( tmp=readRDS(font_path) ) names(font_obj_list) = font_obj_name options(font_obj_list) font_obj = font_obj_list[[1]] } # Return font data font_obj } # Generate height data for logo logo_data <- function( seqs, method='bits', stack_width=0.95, rev_stack_order=F, font, seq_group=1, seq_type = 'auto', namespace=NULL ){ # Get font font_df = get_font(font) # TODO # hh = twosamplelogo_method(seqs, seqs_bg, pval_thresh=0.05, seq_type = seq_type, namespace = namespace) # Generate heights based on method if(method == 'bits'){ hh = bits_method(seqs, decreasing = rev_stack_order, seq_type = seq_type, namespace = namespace) }else if(method == 'probability'){ hh = probability_method(seqs, decreasing = rev_stack_order, seq_type = seq_type, namespace = namespace) }else if(method == 'custom'){ if(seq_type == 'auto') seq_type = guessSeqType(rownames(seqs)) hh = matrix_to_heights(seqs, seq_type, decreasing = rev_stack_order) }else{ stop('Invalid method!') } # Merge font df and heights ff = merge(font_df, hh, by = 'letter') # Scale x and ys to new positions x_pad = stack_width/2 ff$x = newRange(ff$x, ff$position - x_pad, ff$position + x_pad) ff$y = newRange(ff$y, ff$y0, ff$y1) # Rename columns ff = as.data.frame(ff)[,c('x', 'y', 'letter', 'position', 'order')] ff$seq_group = seq_group # Set sequence type as attribute, to be used downstream attr(ff, 'seq_type') = attr(hh, 'seq_type') # Return data table ff } #' ggseqlogo custom theme #' #' @param base_size font base size #' @param base_family font base family #' #' @import ggplot2 #' @export theme_logo <- function(base_size=12, base_family=''){ ggplot2::theme_minimal(base_size = base_size, base_family = base_family) %+replace% theme(panel.grid = element_blank(), legend.position = 'bottom', axis.text.x=element_text(colour="black"), axis.text.y=element_text(colour="black")) } #' Plots sequence logo as a layer on ggplot #' #' @param data Character vector of sequences or named list of sequences. All sequences must have same width. #' @param method Height method, can be one of "bits" or "probability" (default: "bits") #' @param seq_type Sequence type, can be one of "auto", "aa", "dna", "rna" or "other" #' (default: "auto", sequence type is automatically guessed) #' @param namespace Character vector of single letters to be used for custom namespaces. Can be alphanumeric, including Greek characters. #' @param font Name of font. See \code{list_fonts} for available fonts. #' @param stack_width Width of letter stack between 0 and 1 (default: 0.95) #' @param rev_stack_order If \code{TRUE}, order of letter stack is reversed (default: FALSE) #' @param col_scheme Color scheme applied to the sequence logo. See \code{list_col_schemes} for available fonts. #' (default: "auto", color scheme is automatically picked based on \code{seq_type}). #' One can also pass custom color scheme objects created with the \code{make_col_scheme} function #' @param low_col,high_col Colors for low and high ends of the gradient if a quantitative color scheme is used (default: "black" and "yellow"). #' @param na_col Color for letters missing in color scheme (default: "grey20") #' @param plot If \code{FALSE}, plotting data is returned #' @param ... Additional arguments passed to layer params #' #' @export #' @import ggplot2 #' #' @examples #' # Load sample data #' data(ggseqlogo_sample) #' #' # Produce single sequence logo using geom_logo #' p1 = ggseqlogo( seqs_dna[[1]] ) #' geom_logo <- function(data = NULL, method='bits', seq_type='auto', namespace=NULL, font='roboto_medium', stack_width=0.95, rev_stack_order=F, col_scheme = 'auto', low_col='black', high_col='yellow', na_col='grey20', plot=T, ...) { if(stack_width > 1 | stack_width <= 0) stop('"stack_width" must be between 0 and 1') if(is.null(data)) stop('Missing "data" parameter!') if(!is.null(namespace)) seq_type = 'other' # Validate method all_methods = c('bits', 'probability','custom')#, 'tsl') pind = pmatch(method, all_methods) method = all_methods[pind] if(is.na(method)) stop("method must be one of 'bits' or 'probability', or 'custom'") # Convert character seqs to list if(is.character(data) | is.matrix(data)) data = list("1"=data) if(is.list(data)){ # Set names for list if they dont exist if(is.null(names(data))) names(data) = seq_along(data) lvls = names(data) # We have list of sequences - loop and rbind data_sp = lapply(names(data), function(n){ curr_seqs = data[[n]] logo_data(seqs = curr_seqs, method = method, stack_width = stack_width, rev_stack_order = rev_stack_order, seq_group = n, seq_type = seq_type, font = font, namespace=namespace) }) data = do.call(rbind, data_sp) # Set factor for order of facet data$seq_group = factor(data$seq_group, levels = lvls) } if(!plot) return(data) # Get sequence type seq_type = attr(data, 'seq_type') cs = get_col_scheme( col_scheme, seq_type ) legend_title = attr(cs, 'cs_label') data = merge(data, cs, by='letter', all.x=T) # Make sure you retain order after merge data = data[order(data$order),] # Do we have a gradient colscale colscale_gradient = is.numeric( cs$group ) colscale_opts = NULL if(colscale_gradient){ # Set gradient colours colscale_opts = scale_fill_gradient(name=legend_title, low = low_col, high = high_col, na.value = na_col) }else{ # Make group -> colour map tmp = cs[!duplicated(cs$group) & !is.na(cs$group),] col_map = unlist( split(tmp$col, tmp$group) ) # Set colour scale options colscale_opts = scale_fill_manual(values=col_map, name=legend_title, na.value=na_col) } # If letters and group are the same, don't draw legend guides_opts = NULL if(identical(cs$letter, cs$group)) guides_opts = guides(fill=F) y_lim = NULL extra_opts = NULL if(method == 'tsl'){ y_lab = 'Depleted Enriched' tmp = max(abs(data$y)) #y_lim = c(-tmp, tmp) row_a = row_b = data[1,] row_a$y = -tmp row_b$y = tmp data = rbind(data, row_a, row_b) data$facet = factor(data$y > 0, c(T, F), c('Enriched', 'Depleted')) extra_opts = NULL#facet_grid(facet~., scales='free') }else if(method == 'custom'){ y_lab = '' }else{ y_lab = method substr(y_lab, 1, 1) = toupper(substr(y_lab, 1, 1)) } # Group data data$group_by = with(data, interaction(seq_group, letter, position)) data$x = data$x # Create layer logo_layer = layer( stat = 'identity', data = data, mapping = aes_string(x = 'x', y = 'y', fill='group', group='group_by'), geom = 'polygon', position = 'identity', show.legend = NA, inherit.aes = F, params = list(na.rm = T, ...) ) breaks_fun = function(lim){ # account for multiplicatuce expansion factor of 0.05 1: floor( lim[2] / 1.05 ) } # Expand 0.05 addidtive list(logo_layer, scale_x_continuous(breaks = breaks_fun, labels = identity), ylab(y_lab), xlab(''), colscale_opts, guides_opts, coord_cartesian(ylim=y_lim), extra_opts) } #' Quick sequence logo plot #' #' @description \code{ggseqlogo} is a shortcut for generating sequence logos. #' It adds the ggseqlogo theme \code{\link{theme_logo}} by default, and facets when multiple input data are provided. #' It serves as a convenient wrapper, so to customise logos beyond the defaults here, please use \code{\link{geom_logo}}. #' #' @param data Character vector of sequences or named list of sequences. All sequences must have same width #' @param facet Facet type, can be 'wrap' or 'grid' #' @param scales Facet scales, see facet_wrap #' @param ncol Number of columns, works only when \code{facet='wrap'}, see facet_wrap #' @param nrow Number of rows, same as \code{ncol} #' @param ... Additional arguments passed to \code{\link{geom_logo}} #' #' @export #' @examples #' # Load sample data #' data(ggseqlogo_sample) #' #' # Plot a single DNA sequence logo #' p1 = ggseqlogo( seqs_dna[[1]] ) #' print(p1) #' #' # Plot multiple sequence logos at once #' p2 = ggseqlogo( seqs_dna ) #' print(p2) ggseqlogo <- function(data, facet='wrap', scales='free_x', ncol=NULL, nrow=NULL, ...){ # Generate the plot with default theme p = ggplot() + geom_logo(data = data, ...) + theme_logo() # If it's an inidivdual sequence logo, return plot if(!'list' %in% class(data)) return(p) # If we have more than one plot, facet facet_opts = c('grid', 'wrap') pind = pmatch(facet, facet_opts) facet = facet_opts[pind] if(is.na(facet)) stop("facet option must be set to 'wrap' or 'grid'") if(facet == 'grid'){ p = p + facet_grid(~seq_group, scales = scales) }else if(facet == 'wrap'){ p = p + facet_wrap(~seq_group, scales = scales, nrow = nrow, ncol = ncol) } # Return plot return(p) } #' List of aligned transcription factor binding sequences #' #' @name seqs_dna #' @docType data #' @keywords data NULL #' List of aligned kinase-substrate binding sequences #' #' @name seqs_aa #' @docType data #' @keywords data NULL #' List of position frequency matrices for transcription factors #' #' @name pfms_dna #' @docType data #' @keywords data NULL # message('-- running example') # load('data/ggseqlogo_sample.rda') # p = ggseqlogo(sample_data$seqs_dna, nrow=3) # d = p$layers[[1]]$data # print(p) ggseqlogo/R/heights.r 0000644 0001762 0000144 00000026315 14561167632 014302 0 ustar ligges users # Namespaces .AA_NAMESPACE = function() c('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V') .DNA_NAMESPACE = function() c('A', 'T', 'G', 'C') .RNA_NAMESPACE = function() c('A', 'U', 'G', 'C') # Generate letter matrix from vector of sequences # # @param input vector of sequences letterMatrix <- function(input){ # Ensure kmers are the same length characters seq.len = sapply(input, nchar) num_pos = seq.len[1] if(! all(seq.len == num_pos)) stop('Sequences in alignment must have identical lengths') # Construct matrix of letters split = unlist( sapply(input, function(seq){strsplit(seq, '')}) ) t( matrix(split, seq.len, length(split)/num_pos) ) } # Guess sequence type based on letter matrix # # @param sp letters guessSeqType <- function(sp){ # Ensure we have something if(length( intersect(sp, c(.AA_NAMESPACE(), .DNA_NAMESPACE(),.RNA_NAMESPACE())) ) == 0) stop('Could not get guess seq_type. Please explicitly define sequence type or use "other" with custom namespaces.') dat = setdiff(intersect(sp, .AA_NAMESPACE()), c(.DNA_NAMESPACE(),.RNA_NAMESPACE())) if(length(dat) > 0){ return('AA') }else if('U' %in% sp){ return('RNA') } return('DNA') } # Find namespace # # @param letter_mat Matrix of latters # @param seq_type Sequence type # @param namespace Alphabet findNamespace <- function(letter_mat, seq_type, namespace){ # Get all letters in our alignment sp = as.character(letter_mat) # Other namespace if(seq_type == "other"){ if(is.null(namespace)) stop('seq_type of "other" must have a defined namespace') namespace = as.character(namespace) # Get unique namespace = unique( unlist(strsplit(namespace, '')) ) # Validate non_alphanumeric = grepl('[^a-zA-Z0-9\u03bb\u03b1\u03b2\u0393\u03b3\u0394\u03b4\u03b5\u03b6\u03b7\u03b8\u0398\u03b9\u03ba\u039b\u039b\u03bc\u039e\u03be\u03a0\u03c0\u03c1\u03c3\u03c4\u03c5\u03a6\u03c6\u03c7\u03c8\u03a8\u03a9\u03c9]', namespace) if( any( non_alphanumeric ) ) stop('All letters in the namespace must be alphanumeric') # Ensure there is something in each column # apply(letter_mat, 2, function(column_letters){ # int = intersect(namespace, column_letters) # if(length(int) == 0) # stop('The alignment has no letters in namespace match aligned sequences in at least one column') # }) }else{ if(!is.null(namespace)) stop('For custom namespaces please set seq_type to "other"') # Guess sequence type if(seq_type == "auto") seq_type = guessSeqType(sp) # Get predefined namespace namespace = get( sprintf('.%s_NAMESPACE', toupper(seq_type)) )() } return(list(seq_type = toupper(seq_type), namespace = namespace)) } # Calcualte bits # # @param pwm Position weight matrix # @param N Number of letters in namespace # @param Nseqs Number of sequences in PWM computeBits <- function(pwm, N=4, Nseqs=NULL){ Nseqs = attr(pwm, 'nongapped') H_i = - apply(pwm, 2, function(col) sum(col * log2(col), na.rm=T)) e_n = 0 if(!is.null(Nseqs)) e_n = (1/logb(2)) * (N-1)/(2*Nseqs) R_i = log2(N) - (H_i + e_n) # Set any negatives to 0 R_i = pmax(R_i, 0) return(R_i) } # Construct relative frequency matrix # @param seqs aligned sequences as vector # @param seq_type sequence type # @param namespace letters used for matrix construction # @param keep_letter_mat Keep letter matrix for some height methods makePFM <- function(seqs, seq_type='auto', namespace=NULL, keep_letter_mat=F){ letter_mat = NA if(is.matrix(seqs)){ # Process matrix if(is.null(rownames(seqs))) stop('Matrix must have letters for row names') num_pos = ncol(seqs) # Get namespace ns = findNamespace(rownames(seqs), seq_type, namespace) namespace = ns$namespace seq_type = ns$seq_type nseqs = NULL bg_prob = NA pfm_mat = seqs pfm_mat = apply(pfm_mat, 2, function(x) x / sum(x, na.rm=T)) missing_rows = setdiff(namespace, rownames(pfm_mat)) if(length(missing_rows) > 0){ miss = matrix(rep(0, length(missing_rows) * ncol(pfm_mat)), nrow=length(missing_rows), dimnames = list(missing_rows)) pfm_mat = rbind(pfm_mat, miss) } pfm_mat = pfm_mat[namespace,] }else{ # Process sequences # Number of positions in alignment num_pos = nchar(seqs[1]) # Number of sequences nseqs = length(seqs) # Letter matrix letter_mat = letterMatrix(seqs) # Get namespace ns = findNamespace(letter_mat, seq_type, namespace=namespace) namespace = ns$namespace seq_type = ns$seq_type # Construct PWM pfm_mat = apply(letter_mat, 2, function(pos.data){ # Get frequencies t = table(pos.data) # Match to aa ind = match(namespace, names(t)) # Create column col = t[ind] col[is.na(col)] = 0 names(col) = namespace # Do relative frequencies col = col / sum(col) col }) mat = matrix((letter_mat %in% namespace), nrow=nrow(letter_mat)) attr(pfm_mat, 'nongapped') = apply(mat, 2, sum) attr(pfm_mat, 'nseqs') = nseqs } # Number of letters in ns N = length(namespace) # Assign seq type and namespace as attributes attr(pfm_mat, 'seq_type') = seq_type attr(pfm_mat, 'namespace') = namespace # Non-gapped columns if(seq_type == 'aa') namespace = c(namespace, 'X', 'B', 'Z') # Information content attr(pfm_mat, 'bits') = computeBits(pfm_mat, N, nseqs) # Assign AA names to rows/pos col rownames(pfm_mat) = namespace colnames(pfm_mat) = 1:num_pos if(keep_letter_mat) return(list(letter_mat = letter_mat, pfm=pfm_mat)) return(pfm_mat) } ###################### # Matrix to heights ###################### # General function to convert matrix of heights to polygon data frame # @param mat matrix of heghts # @param seq_type sequence type # @decreasing Sets order of letters, high to low or low to high matrix_to_heights <- function(mat, seq_type, decreasing=T){ mat[is.infinite(mat)] = 0 if(any(duplicated(rownames(mat)))) stop('Matrix input must have unique row names') dat = lapply(1:ncol(mat), function(i){ vals = mat[,i] pos = sort( vals[vals >= 0], decreasing = decreasing) neg = sort(vals[vals < 0], decreasing = !decreasing) #vals = sort(vals, decreasing = T) cs_pos = cumsum( pos ) cs_neg = cumsum( neg ) df_pos = df_neg = NULL if(length(pos) > 0) df_pos = data.frame(letter=names(pos), position=i, y0=c(0, cs_pos[-length(cs_pos)]), y1=cs_pos, stringsAsFactors = F) if(length(neg) > 0) df_neg = data.frame(letter=names(neg), position=i, y0=cs_neg, y1=c(0, cs_neg[-length(cs_neg)]), stringsAsFactors = F) rbind(df_pos, df_neg) }) dat = do.call(rbind, dat) # Adjust y spacing space_factor = 0.004 y_pad = max(dat$y1) * space_factor dat$y0 = dat$y0 + y_pad dat = subset(dat, dat$y1 > dat$y0) # Dummy points to make sure full plot is drawn # Make sure position 1 and n have a dummy empty letter missing dummy = data.frame(letter=dat$letter[1], position=NA, y0=0, y1=0) # Missing first position if(dat$position[1] != 1){ dummy$position = 1 dat = rbind( dummy, dat ) } # Missing last position if(dat$position[nrow(dat)] != ncol(mat)){ dummy$position = ncol(mat) dat = rbind( dat, dummy ) } rownames(dat) = NULL attr(dat, 'seq_type') = seq_type dat } # Shannon entropy method bits_method <- function(seqs, decreasing, ...){ # Make PFM pfm = makePFM(seqs, ...) # Get ic ic = attr(pfm, 'bits') if(all(ic == 0)){ warning('All positions have zero information content perhaps due to too few input sequences. Setting all information content to 2.') ic = (ic * 0)+2 } heights = t(t(pfm) * ic) seq_type = attr(pfm, 'seq_type') matrix_to_heights(heights, seq_type, decreasing) } # Probability method probability_method <- function(seqs, decreasing, ...){ # Make PFM pfm = makePFM(seqs, ...) seq_type = attr(pfm, 'seq_type') matrix_to_heights(pfm, seq_type, decreasing) } ####################### # Two sample logo functions - method not used currently ####################### # t_test = function(a, b){ # x = tryCatch({ # return( t.test(a, b, var.equal = T)$p.value ) # } , error=function(e) return(1) ) # x # } # # binom_test = function(a, b){ # binom.test(sum(a), length(a), sum(b)/length(b))$p.value # } # # # ttest pvalue calculation reimplemented from TSL code # ttest_p_value <- function(k1, n1, k2, n2) { # mean1 = k1 / n1; # mean2 = k2 / n2; # # var1_mult = (k1*(1-mean1)*(1-mean1)) + ((n1-k1)*mean1*mean1); # var2_mult = (k2*(1-mean2)*(1-mean2)) + ((n2-k2)*mean2*mean2); # # df = n1 + n2 - 2; # svar = (var1_mult + var2_mult) / df; # t = (mean1-mean2) / sqrt(svar*(1.0/n1 + 1.0/n2)); # return( 2*pt(t, df, lower=FALSE) ) # } # # # Convert to matrix of 1s and 0s # to_matrix = function (x, seq){ # X <- matrix(0, length(x), length(seq), dimnames = list(names(x), seq)) # for (i in 1:length(seq)) X[x == seq[i], i] <- 1 # return(X) # } # # twosamplelogo_method <- function(fg, bg, fix_pos=NULL, test='t.test', pval_thresh=0.05, ...){ # if(!is.character(fg) | !is.character(bg)) # stop('Foreground and background sequences must be character vectors') # # if(!identical(unique(nchar(fg)), unique(nchar(bg)))) # stop('Foreground sequences must have same width as background') # # fg_obj = makePFM(fg, keep_letter_mat=T, ...) # # namespace = attr(fg_obj$pfm, 'namespace') # seq_type = attr(fg_obj$pfm, 'seq_type') # # # Pass sequence type and namespace - avoid double guessing # bg_obj = makePFM(bg, keep_letter_mat=T, seq_type = 'other', namespace = namespace) # # # Difference in relative frequencies # pfm_diff = fg_obj$pfm - bg_obj$pfm # # # Get letter matrices # fg_lm = fg_obj$letter_mat # bg_lm = bg_obj$letter_mat # # pv_mat = sapply(1:ncol(fg_lm), function(i){ # p = to_matrix(fg_lm[,i], namespace) # n = to_matrix(bg_lm[,i], namespace) # # np = nrow(p) # nn = nrow(n) # # #pv = sapply(1:ncol(p), function(j) binom_test(p[,j], n[,j]) ) # pv = sapply(1:ncol(p), function(j) ttest_p_value(sum(p[,j]), np, sum( n[,j] ), nn) ) # names(pv) = names(p) # pv # }) # # # Set things below threshold to zero # pfm_diff[ pv_mat >= pval_thresh ] = 0 # pfm_diff = pfm_diff * 100 # # #fix_pos = 1 # if(!is.null(fix_pos)){ # i = apply(fg_obj$pfm[,fix_pos,drop=F], 2, which.max) # ind = matrix(c(i, fix_pos), ncol=2) # x = pfm_diff # x[x < 0] = 0 # pfm_diff[ind] = max( apply(x, 2, sum) ) # } # # # Make heights # hh = matrix_to_heights(pfm_diff, seq_type) # hh # } # plogo <- function(fg, bg, pval_thresh=0.05){ # fg_obj = makePFM(fg, keep_letter_mat=T, NO REL FREQ, ...) # namespace = attr(fg_obj$pfm, 'namespace') # seq_type = attr(fg_obj$pfm, 'seq_type') # # Pass sequence type and namespace - avoid double guessing # bg_obj = makePFM(bg, keep_letter_mat=T, seq_type = seq_type, namespace = namespace) # # -log( binom.test(1, 100, 0.01, alternative = 'g')$estimate / # # binom.test(1, 100, 0.01, alternative = 'l')$estimate ) # } ggseqlogo/vignettes/ 0000755 0001762 0000144 00000000000 15122162114 014244 5 ustar ligges users ggseqlogo/vignettes/ggseqlogo_introduction.Rmd 0000644 0001762 0000144 00000000577 14561211252 021515 0 ustar ligges users --- title: "ggseqlogo introduction" author: "Omar Wagih" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{ggseqlogo introduction} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Click here for the ggseqlogo introduction
ggseqlogo/data/ 0000755 0001762 0000144 00000000000 14561167632 013165 5 ustar ligges users ggseqlogo/data/ggseqlogo_sample.rda 0000644 0001762 0000144 00000022363 14561167632 017213 0 ustar ligges users |,˕cڍE3HZ>61ΈLEd*nIZ2Lx 0`ʄ F0a30a# !a/*VdK B*WEEXo}ݻ>>~e~???bȷp5~?iR?~mOQ/