-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpps_exist_HAY_CommonAndPseudo_bf.R
368 lines (310 loc) · 11 KB
/
pps_exist_HAY_CommonAndPseudo_bf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#This script looks for Pearson-preserving surrogates for a dataset
#
#The script is currently written to work on the HAY dataset, with
#non-common species lumped into one pseudo-species. To change the
#dataset on which the script operates, change the first section,
#then the rest should run, perhaps with a bit of hand-holding.
#
#Dan Reuman
#2019 04 12
rm(list=ls())
#***Change this section for different data and other settings
#**Data
#Importing the dataset matrix of years by species
d<-readRDS("./Results/hays_results/skewness_results/ts_mat_all_sp_146_hays.RDS")# for all species except in "REMOVE" category
#Importing data with "C", "I", "R" values for common, intermediate, and rare species
category_hays<-readRDS("./Results/hays_results/skewness_results/all_sp_146_hays_category.RDS")
all(colnames(d)==category_hays$sp)==T
src<- category_hays$category #a vector with one entry for each species, corresponding to the columns of dHAY
#src stands for species rarity category
#Keep only the C species, combine the others into a pseudo-species
d<-cbind(d[,src=="C"],apply(FUN=sum,X=d[,src!="C"],MARGIN=1))
#**Settings
p<-seq(from=-1,to=1,by=0.1)
numreps<-250
resloc<-"./Results/Results_pps_exist_HAY_CommonAndPseudo/"
theseed1<-103
theseed2<-1001
numpd<-100000
numsimsforcheck<-1000
#***Code imports, packages, and other setup
library(matrixcalc)
library(mvtnorm)
source("copmap.R")
source("getinv.R")
source("getinvrg.R")
source("alignranks.R")
if (!dir.exists(resloc))
{
dir.create(resloc)
}
#***For each pair of species, find the preimage of the corelation
#under the map defined by copmap.R
set.seed(theseed1)
numsp<-dim(d)[2]
#receptacle for results, the preimage in allparms[,,2], the range
#specified by the other values
allparms<-array(NA,c(numsp,numsp,3))
#loop through each pair of species
for (c1 in 1:(numsp-1))
{
print(paste0("c1: ",c1))
for (c2 in (c1+1):numsp)
{
#get the time series for these two species
x<-d[,c1]
y<-d[,c2]
#get the (stochastic) map
thisres<-copmap(x,y,p,numreps)
#find the iverse range
allparms[c1,c2,c(1,3)]<-getinvrg(p,thisres,cor(x,y))
#find the actual preimage
allparms[c1,c2,2]<-getinv(p,thisres,cor(x,y))
#make and save a plot
mn<-apply(FUN=mean,MARGIN=2,X=thisres)
quants<-apply(FUN=quantile,MARGIN=2,X=thisres,probs=c(.25,.5,.75))
pdf(paste0(resloc,"mapplot_",c1,"_",c2,".pdf"))
plot(p,mn,type='l',ylim=range(mn,quants))
lines(p,quants[2,],type='l',lty="dashed")
lines(p,quants[1,],type='l',lty="dotted")
lines(p,quants[3,],type='l',lty="dotted")
lines(range(p),rep(cor(x,y),2))
lines(rep(allparms[c1,c2,1],2),c(-1,1))
lines(rep(allparms[c1,c2,2],2),c(-1,1))
lines(rep(allparms[c1,c2,3],2),c(-1,1))
dev.off()
}
}
#make sure the preimage is within the preimage range in all cases,
#and you got a sensible range and a single-point preimage in all cases
sum(allparms[,,1]<allparms[,,2],na.rm=TRUE)
sum(allparms[,,2]<allparms[,,3],na.rm=TRUE)
(numsp^2-numsp)/2
#***Now look for positive definite
#First check to see if we got lucky and the preimage itself is pos def
m1<-allparms[,,2]
m1[is.na(m1)]<-0
m1<-m1+t(m1)
diag(m1)<-1
m2<-p2P(P2p(m1))
sum(m1==m2)
prod(dim(m1))
is.positive.semi.definite(m1)
eigen(m1)$values
#If m1 is pos semi def, we can skip the next two steps
#now use nearPD and see if you get a result between hlo and hhi
hlo<-P2p(t(allparms[,,1]))
hpre<-P2p(t(allparms[,,2]))
hhi<-P2p(t(allparms[,,3]))
hmid<-(hlo+hhi)/2
bestmatNPD<-matrix(as.numeric(Matrix::nearPD(m2,corr=TRUE)$mat),numsp,numsp)
bestmatNPDvect<-P2p(bestmatNPD)
sum(bestmatNPDvect<=hhi & bestmatNPDvect>=hlo)
length(bestmatNPDvect)
inds<-which(bestmatNPDvect>hhi | bestmatNPDvect<hlo)
cbind(hlo[inds],bestmatNPDvect[inds],hhi[inds])
#so the nearPD matrix is well outside the bounds given by hlo and hhi for
#some of the coords, so I won't bother with it
#prepare to search the space of parameters desribed by allparms for
#pos def
set.seed(theseed2)
hlo<-P2p(t(allparms[,,1]))
hpre<-P2p(t(allparms[,,2]))
hhi<-P2p(t(allparms[,,3]))
hmid<-(hlo+hhi)/2
allposdef<-matrix(NA,numpd,length(hlo)+1)
rowcount<-1
fn<-function(h)
{
res<-min(eigen(p2P(h))$values)
if (res>0)
{
allposdef[rowcount,]<<-c(h,res)
rowcount<<-rowcount+1 #note the use of global variables, watch out!
print("Found one!")
}
return(res)
}
#The below optimizations will crash out once they have found
#numpd positive definite results. This is intentional.
#do a search starting with the preimage itself
optim(hpre,fn,method="L-BFGS-B",lower=hlo,upper=hhi,
control=list(fnscale=-1))
#do a search starting from the midpoint of the preimage range
optim((hlo+hhi)/2,fn,method="L-BFGS-B",lower=hlo,upper=hhi,
control=list(fnscale=-1))
#now do a bunch of searches from random start points
while (all(is.na(allposdef[numpd,])))
{
startvec<-(hhi-hlo)*runif(length(hlo))+hlo
optim(startvec,fn,method="L-BFGS-B",lower=hlo,upper=hhi,
control=list(fnscale=-1))
}
#I stopped this after an overnight run
save(allposdef,file=paste0(resloc,"PosDefMats.RData"))
#***New code
ldloc<-"./Results/FromShya_Results_pps_exist_HAY_CommonAndPseudo/haysresults/"
allposdef<-readRDS(paste0(ldloc,"PosDefMats.RDS"))
#Now find the one that is closest to hpre
allposdef<-allposdef[1:sum(!is.na(allposdef[,1])),1:(dim(allposdef)[2]-1)]
hpremat<-matrix(hpre,dim(allposdef)[1],length(hpre),byrow = TRUE)
sizemat<-matrix((hhi-hlo)/2,dim(allposdef)[1],length(hlo),byrow = TRUE)
dist<-apply(X=((allposdef-hpremat)/sizemat)^2,FUN=sum,MARGIN=1)
besth<-allposdef[which(dist==min(dist)),]
sum(besth>=hlo)
sum(besth<=hhi)
length(hhi)
bestmat<-p2P(besth) #this is supposed to be my parameter matrix for a normal copula!
isSymmetric(bestmat)
is.positive.semi.definite(bestmat)
#***Now make a plot showing where in the ranges given by hlo to hhi
#the selected values sit
inds<-order(besth)
plot(1:length(besth),besth[inds],type='l',ylim=range(besth,hlo,hhi))
lines(1:length(besth),hlo[inds],type='l',lty="dashed")
lines(1:length(besth),hhi[inds],type='l',lty="dashed")
#many of these come very close to the edges of the hlo-hhi range
#***Now assess whether you get reasonable pearson-preserving surrogates
#using bestmat
sims<-rmvnorm((dim(d)[1])*numsimsforcheck,sigma=bestmat)
sims<-aperm(array(sims,c(dim(d)[1],numsimsforcheck,dim(d)[2])),c(1,3,2))
dim(sims)
sd<-d
for (counter in 1:numsp)
{
sd[,counter]<-sort(d[,counter])
}#need to sort the columns of d to use aligncall
remapd<-alignranks(sd,sims)
holder<-apply(FUN=cor,MARGIN=3,X=remapd)
allcors<-array(holder,c(numsp,numsp,numsimsforcheck))
gtres<-apply(FUN=sum,MARGIN=c(1,2),X=(allcors>array(cor(d),c(numsp,numsp,numsimsforcheck))))
ltres<-apply(FUN=sum,MARGIN=c(1,2),X=(allcors<array(cor(d),c(numsp,numsp,numsimsforcheck))))
eqres<-apply(FUN=sum,MARGIN=c(1,2),X=(allcors==array(cor(d),c(numsp,numsp,numsimsforcheck))))
sum(diag(eqres)==numsimsforcheck)
length(diag(eqres))
diag(eqres)<-NA
sum(eqres==0,na.rm=TRUE)
(numsp^2-numsp)
inds<-which(eqres!=0,arr.ind = TRUE)
#so eq res is all 0s except the diagonal which equals numsimforcheck
sum(diag(gtres)==0)
sum(diag(ltres)==0)
diag(gtres)<-NA
diag(ltres)<-NA
hist(gtres)#very few of these should be outside the range 250-750,
#and if they are not outside that range you have pretty
#good surrogates
hist(ltres)#very few of these should be outside the range 250-750,
#and if they are not outside that range you have pretty
#good surrogates
#more of tese are not great compared to the JRG surrogates - we are not getting
#as good surrogates here. Look at the upper edge of the distr for gtres and the
#lower for ltres.
save(bestmat,file=paste0(resloc,"FinalParameterMatrix.RData"))
#This is the final result. If you generate data from a normal copula
#with this parameter matrix and then use alignranks, that will give
#ok Person-preserving surrogates.
#***Now produce 100000 surrogates and save for later use
sims<-rmvnorm((dim(d)[1])*100000,sigma=bestmat)
sims<-aperm(array(sims,c(dim(d)[1],100000,dim(d)[2])),c(1,3,2))
dim(sims)
sd<-d
for (counter in 1:numsp)
{
sd[,counter]<-sort(d[,counter])
}#need to sort the columns of d to use aligncall
surrogs<-alignranks(sd,sims)
save(surrogs,file=paste0(resloc,"SomeSurrogates.RData"))
#***New code
surrogs<-readRDS(paste0(ldloc,"SomeSurrogates.RDS"))
totpop<-apply(FUN=sum,X=surrogs,MARGIN=c(1,3))
allvars<-apply(FUN=var,X=totpop,MARGIN=2)
allmnsq<-apply(FUN=function(x){(mean(x))^2},X=totpop,MARGIN=2)
allCVcomsq<-allvars/allmnsq
hist(allCVcomsq,50)
totpop<-apply(FUN=sum,X=d,MARGIN=1)
CVd<-var(totpop)/((mean(totpop))^2)
lines(rep(CVd,2),c(1,1000),col="red")
sum(allCVcomsq<CVd)/length(allCVcomsq)
#***new code - let's try to find the one closest to the preimage of the median
#loop through each pair of species
#Attempts to find the unique preimage of a point under a map
#obtained by copmap (in expectation)
#
#Args
#p An increasing vector of points in [-1,1] - this would have been
# the p argument of copmap
#copmapout The output of the call to copmap
#imval The image value to take the preimage of - would be the correlation
# or covariance of the x and y inputs to copmap, depending on the
# value of cflag that was used in the call to copmap
#center Either "mean" or "median" depending on what you want the preimage
# of
#
#Output
#Taking the means of each column of copmapout gives a vector of the same length
#as p. These together determine a function, via linear interpolation between
#values. The code computes the pre-image under this map, if it exists, of imval.
#If there is no point in the pre-image, the function returns -Inf. If more than
#one point, it returns Inf. If exactly one point, it returns that value.
getinv<-function(p,copmapout,imval,center="mean")
{
#x and y of the function to be inverted
x<-p
if (center=="mean")
{
y<-apply(FUN=mean,X=copmapout,MARGIN=2)
}
if (center=="median")
{
y<-apply(FUN=median,X=copmapout,MARGIN=2)
}
if (!(center %in% c("mean","median")))
{
stop("Error in getinv: bad value for center")
}
res<-numeric(0)
for (counter in 1:(length(x)-1))
{
if (y[counter]==imval && y[counter+1]==imval)
{
return(Inf)
}
if ((y[counter]<=imval && y[counter+1]>=imval) ||
(y[counter]>=imval && y[counter+1]<=imval))
{
res<-c(res,x[counter]+
(x[counter+1]-x[counter])*(imval-y[counter])/(y[counter+1]-y[counter]))
}
}
if (length(res)==0)
{
return(-Inf)
}
if (length(res)>1)
{
return(Inf)
}
return(res)
}
medinvs<-matrix(NA,numsp,numsp)
for (c1 in 1:(numsp-1))
{
print(paste0("c1: ",c1))
for (c2 in (c1+1):numsp)
{
#get the time series for these two species
x<-d[,c1]
y<-d[,c2]
#get the (stochastic) map
thisres<-copmap(x,y,p,numreps)
#find the actual preimage
medinvs[c1,c2]<-getinv(p,thisres,cor(x,y),"median")
}
}
oldmedinvs<-medinvs
medinvs[is.na(medinvs)]<-0
medinvs<-medinvs+t(medinvs)
diag(medinvs)<-1
eigen(medinvs)$values
save.image(file=paste0(resloc,"WholeWorkspace.RData"))