R script for computation of 2D-SFS from frequency tables

R script for computation of 2D-SFS from frequency tables

This script allows one to take population SNP variant frequency data and compute 2D-SFS for each pair of populations. All scripts are provided in their current state of development with no warranty that they will work in all cases, and only for reference purposes. They can be used and modified by citing the Author (see script header). Example files can be obtained from the Author upon request.

#this script was written by Ivan Scotti, INRA, URFM
#Site Agroparc, Domaine Saint-Paul
#84914 AVIGNON Cedex 9
#you are free to use and modify it, provided you quote the author
#for questions, please contact Ivan Scotti at
#ivan.scotti[at]paca.inra.fr
#15 July 2015
#
fromFreqTableToSFS<-function(tableName,nContig,sampleSize,nPop)
    {
  #produces 2D-SFS for each pair of populations from a data set containing nPop populations
  #tableName is an allele frequency table
  #(including INDELS)
  #with all populations plus the global counts.
  #First column: contig name
  #second column: SNP position in contig
  #following columns: five per pop (A, C, G, T, indel), plus five for the global counts
  #last column: nb of alleles per locus
  #nContig is the number of contigs in the data set
  #sampleSize is number of hamploid genotypes per population
  #(assuming all pops have same numer of samples)
  #values:
  #minAllFreq.df: data frame with frequencies and contig id
  #MAF2DSFS: data frame containing the 2D-SFS
  #example arguments
  # tableName=SNPtableRS.df
  # nChrom=2883
  # sampleSize=40
  # nPop=4
  #replacing 0 values with NA in global count
  #(necessary to seek the global MAF, otherwise it will be zero for all loci
  temp<-tableName[,(nPop*5+1+2):(nPop*5+5+2)]
  temp[temp==0]<-NA
  tableName[,(nPop*5+1+2):(nPop*5+5+2)]<-temp
  #from here on, production of pairwise 2D-SFS's
  #the layout is generalised from the work done on waSNP
  #
  #starting the extraction of MAF
  #identifying the minor allele for each locus
  #i=2
  #y=tableName[1,]
  #creating a function to use with apply:
  #y is the data frame containing single-pop
  #and global frequencies
  minAllFreqCalcEmpirical<-function(y)
  {
    #identifying the minor allele in the global count
    #notice the index [1] that takes (arbitrarily)
    #the first allele in case of tie. This is needed
    #because for even allele frequencies, two values
    #will be returned by which()
    #also notice: as.integer() needed because otherwise
    #R will interpret erratically the min() command
    minAllPos<-which(as.integer(y[(5*nPop+1+2):(5*nPop+5+2)])==min(as.integer(y[(5*nPop+1+2):(5*nPop+5+2)]),na.rm=T))[1]
    #extracting the frequencies in each pop for that allele:
    #will take in each row the column corresponding
    #to the minor allele in the first pop, then move
    #four columns right and take the same allele in the
    #second pop
   as.integer(y[seq(minAllPos+2,5*nPop+2,5)])
  }
  #embedding into apply
  #notice the rather convolute way to obtain a data frame with characters
  #applying as.data.frame directly
  minAllFreq.df<-as.data.frame(t(apply(X=tableName,FUN=minAllFreqCalcEmpirical,MARGIN=1)))
  names(minAllFreq.df)<-paste("MAFpop",1:nPop,sep="")
  #importing contig identity
  minAllFreq.df<-cbind(tableName$contig,minAllFreq.df);names(minAllFreq.df)[1]<-"contig"
  #before proceeding with the construction and modification of the 2D-SFS,
  #loci with even frequencies must be identified
  #and their counts will be "split" later
  #first, get the global count of Minor Alleles:
  minAllGlobalCounts<-apply(X=minAllFreq.df[,2:(nPop+1)],FUN=sum,MARGIN=1)
  #paste it to its df
  minAllFreq.df<<-cbind(minAllFreq.df,minAllGlobalCounts)
  #looking for loci with even freqs, i.e. MAF=nPop*sampleSize*0.5
  #length(which(minAllGlobalCounts==nPop*sampleSize*0.5))
  #there are N of those.
  #computing the "raw" 2D-SFS's
  #generating  a list to store the matrices
  multiSFS.list<-list()
  #resetting the list index
  k<-0
  for (i in 1:(nPop-1))
  {
    for (j in (i+1):nPop)
    {
      #checking indexes (not used in the final script)
      #print(paste(i,j))
      #generating pairwise SFS for populations i and j
      #remember: first column is contig identity
      MAF2DSFS<-as.matrix(table(minAllFreq.df[,c(i+1,j+1)]))
      #completing 2DSFS with missing "marginal" columns and rows
      #NB this should not be necessary for multi-SFS, as at least
      #one cell should be (statistically) filled for all rows and all columns
      while(dim(MAF2DSFS)[1]<(sampleSize+1)) MAF2DSFS<-rbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[2]))
      while(dim(MAF2DSFS)[2]<(sampleSize+1)) MAF2DSFS<-cbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[1]))
      colnames(MAF2DSFS)<-paste("d0_",0:sampleSize,sep="")
      row.names(MAF2DSFS)<-paste("d1_",0:sampleSize,sep="")
      #curious to see the "fixed difference" loci? here they are
#       subset(minAllFreq.df,MAFpop0==5)
#       subset(minAllFreq.df,MAFpop1==5)
      #splitting in half the counts for alleles with global freq = 0.5
      for (t in 1:(dim(MAF2DSFS)[1]-1))
      {
        splitSum<-(MAF2DSFS[(t+1),(dim(MAF2DSFS)[1]-t)]+MAF2DSFS[(dim(MAF2DSFS)[1]-t),(t+1)])/2
        MAF2DSFS[(t+1),(dim(MAF2DSFS)[1]-t)]<-splitSum
        MAF2DSFS[(dim(MAF2DSFS)[1]-t),(t+1)]<-splitSum
      }
      k<-k+1
multiSFS.list[[k]]<-MAF2DSFS
names(multiSFS.list)[k]<-paste("pop",i,"vsPop",j,sep="")
    }
    }
multiPopSFS.list<<-multiSFS.list
  }
 
#SFS and allele freq tables ready for use with G2Dcalc ---> multiG2Dcalc (which embeds
#this function AND G2Dcalc)
# fromFreqTableToSFS(SNPtableRS.df,2883,40,4)

Date de modification : 22 juin 2023 | Date de création : 16 juillet 2015 | Rédaction : Ivan Scotti