# hsj/code/dissimilarity_functions.R at master · stjohn/hsj · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
###################Hopkins and St. John 2018#####################
##function for applying proposed family of metrics. Input: character matrix in Claddis object format; table showing hierarchical relationships between characters; alpha value.
##Input required:
##matr = character matrix in Claddis object format (see "ReadMorphNexus")
##type = table with three columns with headings "Char", "Type", and "Sub"
##      "Char" is the character number
##      "Type" is the type of character, either "S" for secondary or "T" for tertiary.  In our experience, quaternary characters are rarely to never used
##      "Sub" is the character number that the secondary or tertiary character is contingent on
##alpha = desired alpha value.  Should vary between 0 and 1.
##      If alpha = 0, only primary characters will contribute to dissimilarity estimate
##      If alpha = 1, then shared primary characters will be weighted by the secondary characters shared; same for secondary characters with tertiary characters
##      See text for further explanation.

##Note that delta{ijk} = 1 when characters are comparable, consistent with Gower's initial formulation. It may be preferable to replace the total number of
##comparable characters with the maximum character total possible (Lloyd's MORD metric), especially if there are ordered characters. This is not implemented here,
##but could be done by modifying the "calculate total dissimilarity" step of alpha.coefficient (the new metric) and of gower (Gower's coefficient).
#################

alpha.coefficient<-function(matr,type,alpha){
  ##Pairwise dissimilarity between all taxa (rows of matr$Matrix), with secondary and
  ##tertiary characters folded into the character they are contingent on.
  ##matr  = list with $Matrix (taxa x characters), $Ordering ('ord'/'unord' per character),
  ##        $MaxVals and $Weights (one value per character)
  ##type  = table with columns "Type" ('S' or 'T') and "Sub" (character the row is contingent on)
  ##alpha = weight applied to the mean dissimilarity of dependent characters
  ##Returns a symmetric taxa x taxa matrix with a zero diagonal; an entry is NaN when the
  ##two taxa share no comparable characters. Matrices with fewer than two taxa are
  ##returned with only the diagonal filled in (combn() would otherwise stop with an error).
  n.taxa<-nrow(matr$Matrix)
  HSJ<-matrix(NA,nrow=n.taxa,ncol=n.taxa)

  #unordered multistate characters; which() drops NA, so a missing MaxVals entry
  #cannot produce an NA condition in an if()
  m.u<-which(matr$Ordering=='unord'&matr$MaxVals>1)

  #per-character dissimilarity between two rows of the character matrix
  char.diff<-function(row1,row2){
    #non-numeric codings (missing, polymorphic) become NA here; warnings are expected
    d<-abs(suppressWarnings(as.numeric(row1))-suppressWarnings(as.numeric(row2)))
    #ordered characters keep the raw state difference (no rescaling by MaxVals)
    #unordered multistate characters differ by at most 1
    if (length(m.u)>0){
      over<-m.u[which(d[m.u]>1)]
      d[over]<-1
    }
    #correct polymorphic characters (states separated by "&")
    polym<-unique(c(grep("&",row1),grep("&",row2)))
    for (v in polym){
      char.state1<-strsplit(row1[v],"&")[[1]]
      char.state2<-strsplit(row2[v],"&")[[1]]
      if (length(intersect(char.state1,char.state2))>0){
        #any shared state counts as identical
        d[v]<-0
      } else if (!anyNA(c(char.state1,char.state2))){
        if (matr$Ordering[v]=='unord'){
          d[v]<-1
        } else if (matr$Ordering[v]=='ord'){
          #smallest difference over all combinations of the observed states
          d[v]<-min(abs(outer(as.numeric(char.state1),as.numeric(char.state2),"-")))
        }
      }
      #otherwise one taxon is missing: the character stays NA (not comparable)
    }
    d
  }

  #fold dependent characters of one level ('T' or 'S') into their parent character
  fold.dependents<-function(d,level){
    rows<-which(type$Type==level)
    #a single parent list drives both the child lookup and the parent index,
    #so an NA in type$Sub cannot misalign them
    parents<-unique(as.character(na.omit(type$Sub[rows])))
    for (p in parents){
      #NOTE(review): which() returns row positions of 'type', which are used directly as
      #character indices; this assumes 'type' has one row per character in matrix column
      #order - confirm against the input tables
      dep<-which(type$Sub==p)
      dep.d<-d[dep]
      dep.d<-dep.d[!is.na(dep.d)]
      if (length(dep.d)>0){
        #alpha is applied as if to a similarity: the mean distance is converted to a
        #similarity, weighted, then converted back (algebraically alpha * mean distance)
        d[as.numeric(p)]<-1-(alpha*(1-(sum(dep.d)/length(dep.d)))+(1-alpha))
        #dependents no longer contribute on their own
        d[dep]<-NA
      }
    }
    d
  }

  if (n.taxa>=2){
    pairs<-combn(n.taxa,2)
    for (i in seq_len(ncol(pairs))){
      a<-pairs[1,i]
      b<-pairs[2,i]
      d<-char.diff(matr$Matrix[a,],matr$Matrix[b,])
      #tertiary characters are folded first so that their parents (secondary characters)
      #carry the weighted value when the secondary level is folded
      d<-fold.dependents(d,'T')
      d<-fold.dependents(d,'S')
      #calculate total dissimilarity: weighted sum over comparable characters divided
      #by the summed weights of those same characters
      wd<-d*matr$Weights
      ok<-!is.na(wd)
      HSJ[a,b]<-HSJ[b,a]<-sum(wd[ok])/sum(matr$Weights[ok])
    }
  }
  diag(HSJ)<-0
  return(HSJ)
}


###############
##Implementation of Wills' GED
##matr = character matrix in Claddis object format (see "ReadMorphNexus")

GED<-function(matr){
  ##Pairwise GED between all taxa (rows of matr$Matrix).
  ##matr = list with $Matrix (taxa x characters), $Ordering ('ord'/'unord' per character),
  ##       $MaxVals and $Weights (one value per character)
  ##For each pair, characters that cannot be compared are given the mean dissimilarity of
  ##the comparable characters; the distance is the square root of the summed squared
  ##weighted differences.
  ##Returns a symmetric taxa x taxa matrix with a zero diagonal. Matrices with fewer than
  ##two taxa are returned with only the diagonal filled in (combn() would otherwise stop
  ##with an error).
  n.taxa<-nrow(matr$Matrix)
  dist.mat<-matrix(NA,nrow=n.taxa,ncol=n.taxa)

  #unordered multistate characters; which() drops NA, so a missing MaxVals entry
  #cannot produce an NA condition in an if()
  m.u<-which(matr$Ordering=='unord'&matr$MaxVals>1)

  #per-character dissimilarity between two rows of the character matrix
  char.diff<-function(row1,row2){
    #non-numeric codings (missing, polymorphic) become NA here; warnings are expected
    d<-abs(suppressWarnings(as.numeric(row1))-suppressWarnings(as.numeric(row2)))
    #ordered characters keep the raw state difference
    #unordered multistate characters differ by at most 1
    if (length(m.u)>0){
      over<-m.u[which(d[m.u]>1)]
      d[over]<-1
    }
    #correct polymorphic characters (states separated by "&")
    polym<-unique(c(grep("&",row1),grep("&",row2)))
    for (v in polym){
      char.state1<-strsplit(row1[v],"&")[[1]]
      char.state2<-strsplit(row2[v],"&")[[1]]
      if (length(intersect(char.state1,char.state2))>0){
        #any shared state counts as identical
        d[v]<-0
      } else if (!anyNA(c(char.state1,char.state2))){
        if (matr$Ordering[v]=='unord'){
          d[v]<-1
        } else if (matr$Ordering[v]=='ord'){
          #smallest difference over all combinations of the observed states
          d[v]<-min(abs(outer(as.numeric(char.state1),as.numeric(char.state2),"-")))
        }
      }
      #otherwise one taxon is missing: the character stays NA (not comparable)
    }
    d
  }

  if (n.taxa>=2){
    pairs<-combn(n.taxa,2)
    for (i in seq_len(ncol(pairs))){
      a<-pairs[1,i]
      b<-pairs[2,i]
      d<-char.diff(matr$Matrix[a,],matr$Matrix[b,])
      #replace missing values with the mean dissimilarity of the comparable characters
      d[is.na(d)]<-mean(d,na.rm=TRUE)
      #calculate total dissimilarity
      dist.mat[a,b]<-dist.mat[b,a]<-sqrt(sum((d*matr$Weights)^2))
    }
  }
  diag(dist.mat)<-0
  return(dist.mat)
}


###############
##Implementation of Gower's coefficient of dissimilarity
##matr = character matrix in Claddis object format (see "ReadMorphNexus")
#Note: this script and the functions in Claddis do not standardize ordered multistate characters by range (as if they were quantitative); this follows Wills 1998
#If weights are according to P&K (part of the Claddis object format), this returns P&K weighting scheme

gower<-function(matr){
  ##Pairwise Gower dissimilarity between all taxa (rows of matr$Matrix).
  ##matr = list with $Matrix (taxa x characters), $Ordering ('ord'/'unord' per character),
  ##       $MaxVals and $Weights (one value per character)
  ##For each pair, the weighted sum of per-character differences over the comparable
  ##characters is divided by the summed weights of those same characters.
  ##Returns a symmetric taxa x taxa matrix with a zero diagonal; an entry is NaN when the
  ##two taxa share no comparable characters. Matrices with fewer than two taxa are
  ##returned with only the diagonal filled in (combn() would otherwise stop with an error).
  n.taxa<-nrow(matr$Matrix)
  dist.mat<-matrix(NA,nrow=n.taxa,ncol=n.taxa)

  #unordered multistate characters; which() drops NA, so a missing MaxVals entry
  #cannot produce an NA condition in an if()
  m.u<-which(matr$Ordering=='unord'&matr$MaxVals>1)

  #per-character dissimilarity between two rows of the character matrix
  char.diff<-function(row1,row2){
    #non-numeric codings (missing, polymorphic) become NA here; warnings are expected
    d<-abs(suppressWarnings(as.numeric(row1))-suppressWarnings(as.numeric(row2)))
    #ordered characters keep the raw state difference (not standardized by range)
    #unordered multistate characters differ by at most 1
    if (length(m.u)>0){
      over<-m.u[which(d[m.u]>1)]
      d[over]<-1
    }
    #correct polymorphic characters (states separated by "&")
    polym<-unique(c(grep("&",row1),grep("&",row2)))
    for (v in polym){
      char.state1<-strsplit(row1[v],"&")[[1]]
      char.state2<-strsplit(row2[v],"&")[[1]]
      if (length(intersect(char.state1,char.state2))>0){
        #any shared state counts as identical
        d[v]<-0
      } else if (!anyNA(c(char.state1,char.state2))){
        if (matr$Ordering[v]=='unord'){
          d[v]<-1
        } else if (matr$Ordering[v]=='ord'){
          #smallest difference over all combinations of the observed states
          d[v]<-min(abs(outer(as.numeric(char.state1),as.numeric(char.state2),"-")))
        }
      }
      #otherwise one taxon is missing: the character stays NA (not comparable)
    }
    d
  }

  if (n.taxa>=2){
    pairs<-combn(n.taxa,2)
    for (i in seq_len(ncol(pairs))){
      a<-pairs[1,i]
      b<-pairs[2,i]
      d<-char.diff(matr$Matrix[a,],matr$Matrix[b,])
      #calculate total dissimilarity: only characters with a non-missing weighted
      #difference enter the numerator and the denominator
      wd<-d*matr$Weights
      ok<-!is.na(wd)
      dist.mat[a,b]<-dist.mat[b,a]<-sum(wd[ok])/sum(matr$Weights[ok])
    }
  }
  diag(dist.mat)<-0
  return(dist.mat)
}