SHOGUN  6.1.3
Statistics.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2014 Wu Lin
8  * Written (W) 2011-2016 Heiko Strathmann
9  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
10  *
11  * Most cdf routines are wrappers for CDFLIB, part of the public domain NETLIB.
12  * https://people.sc.fsu.edu/~jburkardt/f_src/cdflib/cdflib.html
13  */
14 
#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/SGSparseMatrix.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/external/cdflib.hpp>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/Statistics.h>
#include <shogun/mathematics/eigen3.h>
23 
24 using namespace Eigen;
25 
26 using namespace shogun;
27 
28 #ifndef M_SQRT1_2
29 #define M_SQRT1_2 0.707106781186547524401
30 #endif
31 
32 float64_t CStatistics::variance(SGVector<float64_t> values)
33 {
34  REQUIRE(values.vlen>1, "Number of observations (%d) needs to be at least 1.\n",
35  values.vlen);
36 
38 
39  float64_t sum_squared_diff=0;
40  for (index_t i=0; i<values.vlen; ++i)
41  sum_squared_diff+=CMath::pow(values.vector[i]-mean, 2);
42 
43  return sum_squared_diff/(values.vlen-1);
44 }
45 
46 SGVector<float64_t> CStatistics::matrix_mean(SGMatrix<float64_t> values,
47  bool col_wise)
48 {
49  ASSERT(values.num_rows>0)
50  ASSERT(values.num_cols>0)
51  ASSERT(values.matrix)
52 
53  SGVector<float64_t> result;
54 
55  if (col_wise)
56  {
57  result=SGVector<float64_t>(values.num_cols);
58  for (index_t j=0; j<values.num_cols; ++j)
59  {
60  result[j]=0;
61  for (index_t i=0; i<values.num_rows; ++i)
62  result[j]+=values(i,j);
63 
64  result[j]/=values.num_rows;
65  }
66  }
67  else
68  {
69  result=SGVector<float64_t>(values.num_rows);
70  for (index_t j=0; j<values.num_rows; ++j)
71  {
72  result[j]=0;
73  for (index_t i=0; i<values.num_cols; ++i)
74  result[j]+=values(j,i);
75 
76  result[j]/=values.num_cols;
77  }
78  }
79 
80  return result;
81 }
82 
83 SGVector<float64_t> CStatistics::matrix_variance(SGMatrix<float64_t> values,
84  bool col_wise)
85 {
86  ASSERT(values.num_rows>0)
87  ASSERT(values.num_cols>0)
88  ASSERT(values.matrix)
89 
90  /* first compute mean */
91  SGVector<float64_t> mean=CStatistics::matrix_mean(values, col_wise);
92 
93  SGVector<float64_t> result;
94 
95  if (col_wise)
96  {
97  result=SGVector<float64_t>(values.num_cols);
98  for (index_t j=0; j<values.num_cols; ++j)
99  {
100  result[j]=0;
101  for (index_t i=0; i<values.num_rows; ++i)
102  result[j]+=CMath::pow(values(i,j)-mean[j], 2);
103 
104  result[j]/=(values.num_rows-1);
105  }
106  }
107  else
108  {
109  result=SGVector<float64_t>(values.num_rows);
110  for (index_t j=0; j<values.num_rows; ++j)
111  {
112  result[j]=0;
113  for (index_t i=0; i<values.num_cols; ++i)
114  result[j]+=CMath::pow(values(j,i)-mean[j], 2);
115 
116  result[j]/=(values.num_cols-1);
117  }
118  }
119 
120  return result;
121 }
122 
123 float64_t CStatistics::std_deviation(SGVector<float64_t> values)
124 {
125  return CMath::sqrt(variance(values));
126 }
127 
128 SGVector<float64_t> CStatistics::matrix_std_deviation(
129  SGMatrix<float64_t> values, bool col_wise)
130 {
131  SGVector<float64_t> var=CStatistics::matrix_variance(values, col_wise);
132  for (index_t i=0; i<var.vlen; ++i)
133  var[i]=CMath::sqrt(var[i]);
134 
135  return var;
136 }
137 
/** Computes the sample covariance matrix 1/(N-1) * X_c * X_c^T of a set of
 * column observations, where X_c is the mean-centered data.
 *
 * @param observations DxN matrix, one observation per column
 * @param in_place if true, the caller's matrix is centered in place
 *        (destructive, but avoids copying the data)
 * @return DxD covariance matrix
 */
SGMatrix<float64_t> CStatistics::covariance_matrix(
	SGMatrix<float64_t> observations, bool in_place)
{
	int32_t D = observations.num_rows;
	int32_t N = observations.num_cols;
	SG_SDEBUG("%d observations in %d dimensions\n", N, D)

	REQUIRE(N>1, "Number of observations (%d) must be at least 2.\n", N);
	REQUIRE(D>0, "Number of dimensions (%d) must be at least 1.\n", D);

	/* either alias the input (in_place) or work on a fresh buffer */
	SGMatrix<float64_t> centered=in_place ? observations :
			SGMatrix<float64_t>(D, N);

	/* center observations, potentially in-place */
	if (!in_place)
	{
		int64_t num_elements = N*D;
		sg_memcpy(centered.matrix, observations.matrix,
				sizeof(float64_t)*num_elements);
	}
	SG_SDEBUG("Centering observations\n");
	/* subtract the per-dimension (row-wise) mean from every column */
	Map<MatrixXd> eigen_centered(centered.matrix, D, N);
	eigen_centered.colwise() -= eigen_centered.rowwise().mean();

	/* compute and store 1/(N-1) * X * X.T */
	SG_SDEBUG("Computing squared differences\n");
	SGMatrix<float64_t> cov(D, D);
	Map<MatrixXd> eigen_cov(cov.matrix, D, D);
	eigen_cov = (eigen_centered * eigen_centered.adjoint()) / double(N - 1);

	return cov;
}
170 
171 SGVector<float64_t> CStatistics::fishers_exact_test_for_multiple_2x3_tables(
172  SGMatrix<float64_t> tables)
173 {
174  SGMatrix<float64_t> table(NULL, 2, 3, false);
175  int32_t len=tables.num_cols/3;
176 
177  SGVector<float64_t> v(len);
178  for (int32_t i=0; i<len; i++)
179  {
180  table.matrix=&tables.matrix[2*3*i];
181  v.vector[i]=fishers_exact_test_for_2x3_table(table);
182  }
183  return v;
184 }
185 
/** Fisher's exact test p-value for a single 2x3 contingency table:
 * enumerates every table with the same marginals and sums (in log space)
 * the probabilities of those no more likely than the observed one.
 *
 * @param table 2x3 contingency table (column-major SGMatrix storage)
 * @return p-value
 */
float64_t CStatistics::fishers_exact_test_for_2x3_table(
		SGMatrix<float64_t> table)
{
	ASSERT(table.num_rows==2)
	ASSERT(table.num_cols==3)

	/* marginals: m[0],m[1] are the two row sums, m[2..4] the column sums */
	int32_t m_len=3+2;
	float64_t* m=SG_MALLOC(float64_t, 3+2);
	m[0]=table.matrix[0]+table.matrix[2]+table.matrix[4];
	m[1]=table.matrix[1]+table.matrix[3]+table.matrix[5];
	m[2]=table.matrix[0]+table.matrix[1];
	m[3]=table.matrix[2]+table.matrix[3];
	m[4]=table.matrix[4]+table.matrix[5];

	/* n = grand total (the five marginals sum to 2n); x is scratch space
	 * for all candidate tables with the same marginals */
	float64_t n=SGVector<float64_t>::sum(m, m_len)/2.0;
	int32_t x_len=2*3*CMath::sq(CMath::max(m, m_len));
	float64_t* x=SG_MALLOC(float64_t, x_len);
	SGVector<float64_t>::fill_vector(x, x_len, 0.0);

	/* log numerator of the hypergeometric probability: log(prod m_i!/n!) */
	float64_t log_nom=0.0;
	for (int32_t i=0; i<3+2; i++)
		log_nom+=lgamma(m[i]+1);
	log_nom-=lgamma(n+1.0);

	float64_t log_denomf=0;
	floatmax_t log_denom=0;

	/* log denominator for the observed table: log(prod over cells of cell!)
	 * (log_denomf is a float64_t shadow used only for debug output) */
	for (int32_t i=0; i<3*2; i++)
	{
		log_denom+=lgammal((floatmax_t)table.matrix[i]+1);
		log_denomf+=lgammal((floatmax_t)table.matrix[i]+1);
	}

	/* log-probability of the observed table given the marginals */
	floatmax_t prob_table_log=log_nom-log_denom;

	int32_t dim1=CMath::min(m[0], m[2]);

	//traverse all possible tables with given m
	int32_t counter=0;
	for (int32_t k=0; k<=dim1; k++)
	{
		for (int32_t l=CMath::max(0.0, m[0]-m[4]-k);
				l<=CMath::min(m[0]-k, m[3]); l++)
		{
			/* first row is determined by (k, l); the remaining cells
			 * follow from the marginals */
			x[0+0*2+counter*2*3]=k;
			x[0+1*2+counter*2*3]=l;
			x[0+2*2+counter*2*3]=m[0]-x[0+0*2+counter*2*3]-x[0+1*2+counter*2*3];
			x[1+0*2+counter*2*3]=m[2]-x[0+0*2+counter*2*3];
			x[1+1*2+counter*2*3]=m[3]-x[0+1*2+counter*2*3];
			x[1+2*2+counter*2*3]=m[4]-x[0+2*2+counter*2*3];

			counter++;
		}
	}

//#define DEBUG_FISHER_TABLE
#ifdef DEBUG_FISHER_TABLE
	SG_SPRINT("counter=%d\n", counter)
	SG_SPRINT("dim1=%d\n", dim1)
	SG_SPRINT("l=%g...%g\n", CMath::max(0.0,m[0]-m[4]-0), CMath::min(m[0]-0, m[3]))
	SG_SPRINT("n=%g\n", n)
	SG_SPRINT("prob_table_log=%.18Lg\n", prob_table_log)
	SG_SPRINT("log_denomf=%.18g\n", log_denomf)
	SG_SPRINT("log_denom=%.18Lg\n", log_denom)
	SG_SPRINT("log_nom=%.18g\n", log_nom)
	display_vector(m, m_len, "marginals");
	display_vector(x, 2*3*counter, "x");
#endif // DEBUG_FISHER_TABLE

	/* log-probability of each enumerated candidate table */
	floatmax_t* log_denom_vec=SG_MALLOC(floatmax_t, counter);
	SGVector<floatmax_t>::fill_vector(log_denom_vec, counter, (floatmax_t)0.0);

	for (int32_t k=0; k<counter; k++)
	{
		for (int32_t j=0; j<3; j++)
		{
			for (int32_t i=0; i<2; i++)
				log_denom_vec[k]+=lgammal(x[i+j*2+k*2*3]+1.0);
		}
	}

	for (int32_t i=0; i<counter; i++)
		log_denom_vec[i]=log_nom-log_denom_vec[i];

#ifdef DEBUG_FISHER_TABLE
	display_vector(log_denom_vec, counter, "log_denom_vec");
#endif // DEBUG_FISHER_TABLE

	/* p-value: log-sum-exp over all tables that are at most as probable as
	 * the observed one */
	float64_t nonrand_p=-CMath::INFTY;
	for (int32_t i=0; i<counter; i++)
	{
		if (log_denom_vec[i]<=prob_table_log)
			nonrand_p=CMath::logarithmic_sum(nonrand_p, log_denom_vec[i]);
	}

#ifdef DEBUG_FISHER_TABLE
	SG_SPRINT("nonrand_p=%.18g\n", nonrand_p)
	SG_SPRINT("exp_nonrand_p=%.18g\n", CMath::exp(nonrand_p))
#endif // DEBUG_FISHER_TABLE
	nonrand_p=CMath::exp(nonrand_p);

	SG_FREE(log_denom_vec);
	SG_FREE(x);
	SG_FREE(m);

	return nonrand_p;
}
293 
294 float64_t CStatistics::mutual_info(float64_t* p1, float64_t* p2, int32_t len)
295 {
296  double e=0;
297 
298  for (int32_t i=0; i<len; i++)
299  for (int32_t j=0; j<len; j++)
300  e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]);
301 
302  return (float64_t)e;
303 }
304 
305 float64_t CStatistics::relative_entropy(float64_t* p, float64_t* q, int32_t len)
306 {
307  double e=0;
308 
309  for (int32_t i=0; i<len; i++)
310  e+=exp(p[i])*(p[i]-q[i]);
311 
312  return (float64_t)e;
313 }
314 
315 float64_t CStatistics::entropy(float64_t* p, int32_t len)
316 {
317  double e=0;
318 
319  for (int32_t i=0; i<len; i++)
320  e-=exp(p[i])*p[i];
321 
322  return (float64_t)e;
323 }
324 
325 SGVector<int32_t> CStatistics::sample_indices(int32_t sample_size, int32_t N)
326 {
327  REQUIRE(sample_size<N,
328  "sample size should be less than number of indices\n");
329  int32_t* idxs=SG_MALLOC(int32_t,N);
330  int32_t i, rnd;
331  int32_t* permuted_idxs=SG_MALLOC(int32_t,sample_size);
332 
333  // reservoir sampling
334  for (i=0; i<N; i++)
335  idxs[i]=i;
336  for (i=0; i<sample_size; i++)
337  permuted_idxs[i]=idxs[i];
338  for (i=sample_size; i<N; i++)
339  {
340  rnd=CMath::random(1, i);
341  if (rnd<sample_size)
342  permuted_idxs[rnd]=idxs[i];
343  }
344  SG_FREE(idxs);
345 
346  SGVector<int32_t> result=SGVector<int32_t>(permuted_idxs, sample_size);
347  CMath::qsort(result);
348  return result;
349 }
350 
351 float64_t CStatistics::inverse_normal_cdf(float64_t p, float64_t mean,
352  float64_t std_deviation)
353 {
354  REQUIRE(p>=0, "p (%f); must be greater or equal to 0.\n", p);
355  REQUIRE(p<=1, "p (%f); must be greater or equal to 1.\n", p);
356  REQUIRE(std_deviation>0, "Standard deviation (%f); must be positive\n",
357  std_deviation);
358 
359  // invserse normal cdf case, see cdflib.cpp for details
360  int which=2;
361  float64_t output_x;
362  float64_t q=1-p;
363  float64_t output_bound;
364  int output_status;
365 
366  cdfnor(&which, &p, &q, &output_x, &mean, &std_deviation, &output_status, &output_bound);
367 
368  if (output_status!=0)
369  SG_SERROR("Error %d while calling cdflib::cdfnor\n", output_status);
370 
371  return output_x;
372 
373  //void cdfnor ( int *which, double *p, double *q, double *x, double *mean,
374  // double *sd, int *status, double *bound )
375 }
376 
377 float64_t CStatistics::chi2_cdf(float64_t x, float64_t k)
378 {
379  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
380  REQUIRE(k>0, "Degrees of freedom (%f) has to be positive.\n", k);
381 
382  // chi2 cdf case, see cdflib.cpp for details
383  int which=1;
384  float64_t df=k;
385  float64_t output_q;
386  float64_t output_p;
387  float64_t output_bound;
388  int output_status;
389 
390  cdfchi(&which, &output_p, &output_q, &x, &df, &output_status, &output_bound);
391 
392  if (output_status!=0)
393  SG_SERROR("Error %d while calling cdflib::cdfchi\n", output_status);
394 
395  return output_p;
396 }
397 
398 float64_t CStatistics::gamma_cdf(float64_t x, float64_t a, float64_t b)
399 {
400  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
401  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
402  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
403 
404  // inverse gamma cdf case, see cdflib.cpp for details
405  float64_t shape=a;
406  float64_t scale=b;
407  int which=1;
408  float64_t output_p;
409  float64_t output_q;
410  float64_t output_bound;
411  int output_error_code;
412 
413  cdfgam(&which, &output_p, &output_q, &x, &shape, &scale, &output_error_code, &output_bound);
414 
415  if (output_error_code!=0)
416  SG_SERROR("Error %d while calling cdflib::cdfgam\n", output_error_code);
417 
418  return output_p;
419 }
420 
/** Numerically stable log of the standard normal CDF, log(Phi(x)),
 * using three regimes selected by ERFC_CASE1/ERFC_CASE2.
 *
 * Loosely based on logphi.m in
 * Gaussian Process Machine Learning Toolbox file logphi.m
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * Under FreeBSD license
 *
 * @param x evaluation point
 * @return log(Phi(x))
 */
float64_t CStatistics::lnormal_cdf(float64_t x)
{
	const float64_t sqrt_of_2=1.41421356237309514547;
	const float64_t log_of_2=0.69314718055994528623;
	const float64_t sqrt_of_pi=1.77245385090551588192;

	/* polynomial coefficients for the small-|x| series, consumed
	 * highest-order-first by the Horner loop below */
	const index_t c_len=14;
	static float64_t c_array[c_len]=
	{
	0.00048204,
	-0.00142906,
	0.0013200243174,
	0.0009461589032,
	-0.0045563339802,
	0.00556964649138,
	0.00125993961762116,
	-0.01621575378835404,
	0.02629651521057465,
	-0.001829764677455021,
	2.0*(1.0-CMath::PI/3.0),
	(4.0-CMath::PI)/3.0,
	1.0,
	1.0
	};

	if (x*x<ERFC_CASE1)
	{
		/* regime 1: |x| close to zero — polynomial series in lp0 */
		float64_t f = 0.0;
		float64_t lp0 = -x/(sqrt_of_2*sqrt_of_pi);
		for (index_t i=0; i<c_len; i++)
			f=lp0*(c_array[i]+f);
		return -2.0*f-log_of_2;
	}
	else if (x<ERFC_CASE2)
		/* regime 2: far negative tail — rational erfc approximation plus
		 * the dominant -x^2/2 term, avoids underflow of Phi(x) itself */
		return CMath::log(erfc8_weighted_sum(x))-log_of_2-x*x*0.5;

	//id3 = ~id2 & ~id1; lp(id3) = log(erfc(-z(id3)/sqrt(2))/2);
	/* regime 3: everything else — direct evaluation is safe here */
	return CMath::log(normal_cdf(x));
}
466 
467 float64_t CStatistics::erfc8_weighted_sum(float64_t x)
468 {
469  /* This is based on index 5725 in Hart et al */
470 
471  const float64_t sqrt_of_2=1.41421356237309514547;
472 
473  static float64_t P[]=
474  {
475  0.5641895835477550741253201704,
476  1.275366644729965952479585264,
477  5.019049726784267463450058,
478  6.1602098531096305440906,
479  7.409740605964741794425,
480  2.97886562639399288862
481  };
482 
483  static float64_t Q[]=
484  {
485  1.0,
486  2.260528520767326969591866945,
487  9.396034016235054150430579648,
488  12.0489519278551290360340491,
489  17.08144074746600431571095,
490  9.608965327192787870698,
491  3.3690752069827527677
492  };
493 
494  float64_t num=0.0, den=0.0;
495 
496  num = P[0];
497  for (index_t i=1; i<6; i++)
498  {
499  num=-x*num/sqrt_of_2+P[i];
500  }
501 
502  den = Q[0];
503  for (index_t i=1; i<7; i++)
504  {
505  den=-x*den/sqrt_of_2+Q[i];
506  }
507 
508  return num/den;
509 }
510 
511 float64_t CStatistics::normal_cdf(float64_t x, float64_t std_dev)
512 {
513  return 0.5*(erfc(-x*M_SQRT1_2/std_dev));
514 }
515 
516 float64_t CStatistics::gamma_inverse_cdf(float64_t p, float64_t a,
517  float64_t b)
518 {
519  REQUIRE(p>=0, "p (%f) has to be greater or equal to 0.\n", p);
520  REQUIRE(a>=0, "a (%f) has to be greater or equal to 0.\n", a);
521  REQUIRE(b>=0, "b (%f) has to be greater or equal to 0.\n", b);
522 
523  float64_t shape=a;
524  float64_t scale=b;
525  float64_t q = 1-p;
526  int which=2;
527  float64_t output_x=0;
528  float64_t output_bound;
529  int output_error_code=0;
530 
531  // inverse gamma cdf case, see cdflib.cpp for details
532  cdfgam(&which, &p, &q, &output_x, &shape, &scale, &output_error_code, &output_bound);
533 
534  if (output_error_code!=0)
535  SG_SERROR("Error %d while calling cdflib::beta_inc\n", output_error_code);
536 
537  return output_x;
538 }
539 
540 float64_t CStatistics::fdistribution_cdf(float64_t x, float64_t d1, float64_t d2)
541 {
542  REQUIRE(x>=0, "x (%f) has to be greater or equal to 0.\n", x);
543  REQUIRE(d1>0, "d1 (%f) has to be positive.\n", d1);
544  REQUIRE(d2>0, "d2 (%f) has to be positive.\n", d2);
545 
546  // fcdf case, see cdflib.cpp for details
547  int which=1;
548  float64_t output_p;
549  float64_t output_q;
550  float64_t output_bound;
551  int output_error_code;
552 
553  cdff(&which, &output_p, &output_q, &x, &d1, &d2, &output_error_code, &output_bound);
554 
555  if (output_error_code!=0)
556  SG_SERROR("Error %d while calling cdflib::cdff\n", output_error_code);
557 
558  return output_p;
559 }
560 
/** Derivative of the log-gamma function (digamma/psi function) at x,
 * via reflection, upward recurrence and an asymptotic series.
 *
 * @param x evaluation point
 * @return psi(x) = d/dx log(Gamma(x))
 */
float64_t CStatistics::dlgamma(float64_t x)
{
	float64_t result=0.0;

	if (x<0.0)
	{
		// use reflection formula
		x=1.0-x;
		result=CMath::PI/CMath::tan(CMath::PI*x);
	}

	// make x>7 for approximation
	// (use reccurent formula: psi(x+1) = psi(x) + 1/x)
	while (x<=7.0)
	{
		result-=1.0/x;
		x++;
	}

	// perform approximation
	x-=0.5;
	result+=log(x);

	/* coefficients of the asymptotic series in 1/x^2 */
	float64_t coeff[10]={
		0.04166666666666666667,
		-0.00729166666666666667,
		0.00384424603174603175,
		-0.00413411458333333333,
		0.00756096117424242424,
		-0.02108249687595390720,
		0.08332316080729166666,
		-0.44324627670587277880,
		3.05393103044765369366,
		-26.45616165999210241989};

	float64_t power=1.0;
	float64_t ix2=1.0/CMath::sq(x);

	// perform approximation: accumulate coeff[i] * x^(-2(i+1))
	for (index_t i=0; i<10; i++)
	{
		power*=ix2;
		result+=coeff[i]*power;
	}

	return result;
}
608 
/** Log-determinant of a general square matrix via a partial-pivoting LU
 * decomposition. Returns log(det(A)) when the determinant is positive,
 * and INFTY otherwise (the sign of the permutation must match the sign of
 * the product of U's diagonal).
 *
 * @param A square input matrix
 * @return log(det(A)) if det(A)>0, CMath::INFTY otherwise
 */
float64_t CStatistics::log_det_general(const SGMatrix<float64_t> A)
{
	Map<MatrixXd> eigen_A(A.matrix, A.num_rows, A.num_cols);
	REQUIRE(eigen_A.rows()==eigen_A.cols(),
		"Input matrix should be a sqaure matrix row(%d) col(%d)\n",
		eigen_A.rows(), eigen_A.cols());

	PartialPivLU<MatrixXd> lu(eigen_A);
	VectorXd tmp(eigen_A.rows());

	/* seed with 1..n so the permuted vector exposes the pivoting */
	for (index_t idx=0; idx<tmp.rows(); idx++)
		tmp[idx]=idx+1;

	/* determine the sign of the permutation by counting transpositions
	 * while restoring p to identity */
	VectorXd p=lu.permutationP()*tmp;
	int32_t detP=1;

	for (index_t idx=0; idx<p.rows(); idx++)
	{
		if (p[idx]!=idx+1)
		{
			detP*=-1;
			index_t j=idx+1;
			while(j<p.rows())
			{
				if (p[j]==idx+1)
					break;
				j++;
			}
			p[j]=p[idx];
		}
	}

	/* sign of prod(diag(U)); zero diagonal means det(A)=0 */
	VectorXd u=lu.matrixLU().diagonal();
	int32_t check_u=1;

	for (index_t idx=0; idx<u.rows(); idx++)
	{
		if (u[idx]<0)
			check_u*=-1;
		else if (u[idx]==0)
		{
			check_u=0;
			break;
		}
	}

	float64_t result=CMath::INFTY;

	/* det(A)>0 exactly when both signs agree; then
	 * log det(A) = sum log|U_ii| */
	if (check_u==detP)
		result=u.array().abs().log().sum();

	return result;
}
662 
663 float64_t CStatistics::log_det(SGMatrix<float64_t> m)
664 {
665  /* map the matrix to eigen3 to perform cholesky */
667 
668  /* computing the cholesky decomposition */
669  LLT<MatrixXd> llt;
670  llt.compute(M);
671 
672  /* the lower triangular matrix */
673  MatrixXd l = llt.matrixL();
674 
675  /* calculate the log-determinant */
676  VectorXd diag = l.diagonal();
677  float64_t retval = 0.0;
678  for( int32_t i = 0; i < diag.rows(); ++i ) {
679  retval += log(diag(i));
680  }
681  retval *= 2;
682 
683  return retval;
684 }
685 
686 float64_t CStatistics::log_det(const SGSparseMatrix<float64_t> m)
687 {
688  typedef SparseMatrix<float64_t> MatrixType;
689  const MatrixType &M=EigenSparseUtil<float64_t>::toEigenSparse(m);
690 
691  SimplicialLLT<MatrixType> llt;
692 
693  // factorize using cholesky with amd permutation
694  llt.compute(M);
695  MatrixType L=llt.matrixL();
696 
697  // calculate the log-determinant
698  float64_t retval=0.0;
699  for( index_t i=0; i<M.rows(); ++i )
700  retval+=log(L.coeff(i,i));
701  retval*=2;
702 
703  return retval;
704 }
705 
706 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
707  SGMatrix<float64_t> cov, int32_t N, bool precision_matrix)
708 {
709  REQUIRE(cov.num_rows>0, "Number of covariance rows must be positive!\n");
710  REQUIRE(cov.num_cols>0,"Number of covariance cols must be positive!\n");
711  REQUIRE(cov.matrix, "Covariance is not initialized!\n");
712  REQUIRE(cov.num_rows==cov.num_cols, "Covariance should be square matrix!\n");
713  REQUIRE(mean.vlen==cov.num_rows, "Mean and covariance dimension mismatch!\n");
714 
715  int32_t dim=mean.vlen;
716  Map<VectorXd> mu(mean.vector, mean.vlen);
717  Map<MatrixXd> c(cov.matrix, cov.num_rows, cov.num_cols);
718 
719  // generate samples, z, from N(0, I), DxN
720  SGMatrix<float64_t> S(dim, N);
721  for( int32_t j=0; j<N; ++j )
722  for( int32_t i=0; i<dim; ++i )
723  S(i,j)=CMath::randn_double();
724 
725  // the cholesky factorization c=L*U
726  MatrixXd U=c.llt().matrixU();
727 
728  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
729  // return samples of dimension NxD
730  if( precision_matrix )
731  {
732  // here we have U*x=z, to solve this, we use cholesky again
734  LDLT<MatrixXd> ldlt;
735  ldlt.compute(U);
736  s=ldlt.solve(s);
737  }
738 
740 
741  if( !precision_matrix )
742  {
743  // here we need to find x=L*z, so, x'=z'*L' i.e. x'=z'*U
745  s=s*U;
746  }
747 
748  // add the mean
750  for( int32_t i=0; i<N; ++i )
751  x.row(i)+=mu;
752 
753  return S;
754 }
755 
756 SGMatrix<float64_t> CStatistics::sample_from_gaussian(SGVector<float64_t> mean,
757  SGSparseMatrix<float64_t> cov, int32_t N, bool precision_matrix)
758 {
759  REQUIRE(cov.num_vectors>0,
760  "CStatistics::sample_from_gaussian(): \
761  Number of covariance rows must be positive!\n");
762  REQUIRE(cov.num_features>0,
763  "CStatistics::sample_from_gaussian(): \
764  Number of covariance cols must be positive!\n");
766  "CStatistics::sample_from_gaussian(): \
767  Covariance is not initialized!\n");
769  "CStatistics::sample_from_gaussian(): \
770  Covariance should be square matrix!\n");
771  REQUIRE(mean.vlen==cov.num_vectors,
772  "CStatistics::sample_from_gaussian(): \
773  Mean and covariance dimension mismatch!\n");
774 
775  int32_t dim=mean.vlen;
776  Map<VectorXd> mu(mean.vector, mean.vlen);
777 
778  typedef SparseMatrix<float64_t> MatrixType;
779  const MatrixType &c=EigenSparseUtil<float64_t>::toEigenSparse(cov);
780 
781  SimplicialLLT<MatrixType> llt;
782 
783  // generate samples, z, from N(0, I), DxN
784  SGMatrix<float64_t> S(dim, N);
785  for( int32_t j=0; j<N; ++j )
786  for( int32_t i=0; i<dim; ++i )
787  S(i,j)=CMath::randn_double();
788 
790 
791  // the cholesky factorization P*c*P^-1 = LP*UP, with LP=P*L, UP=U*P^-1
792  llt.compute(c);
793  MatrixType LP=llt.matrixL();
794  MatrixType UP=llt.matrixU();
795 
796  // generate samples, x, from N(mean, cov) or N(mean, cov^-1)
797  // return samples of dimension NxD
798  if( precision_matrix )
799  {
800  // here we have UP*xP=z, to solve this, we use cholesky again
801  SimplicialLLT<MatrixType> lltUP;
802  lltUP.compute(UP);
803  s=lltUP.solve(s);
804  }
805  else
806  {
807  // here we need to find xP=LP*z
808  s=LP*s;
809  }
810 
811  // permute the samples back with x=P^-1*xP
812  s=llt.permutationPinv()*s;
813 
815  // add the mean
817  for( int32_t i=0; i<N; ++i )
818  x.row(i)+=mu;
819 
820  return S;
821 }
822 
823 
824 CStatistics::SigmoidParamters CStatistics::fit_sigmoid(SGVector<float64_t> scores)
825 {
826  SG_SDEBUG("entering CStatistics::fit_sigmoid()\n")
827 
828  REQUIRE(scores.vector, "CStatistics::fit_sigmoid() requires "
829  "scores vector!\n");
830 
831  /* count prior0 and prior1 if needed */
832  int32_t prior0=0;
833  int32_t prior1=0;
834  SG_SDEBUG("counting number of positive and negative labels\n")
835  {
836  for (index_t i=0; i<scores.vlen; ++i)
837  {
838  if (scores[i]>0)
839  prior1++;
840  else
841  prior0++;
842  }
843  }
844  SG_SDEBUG("%d pos; %d neg\n", prior1, prior0)
845 
846  /* parameter setting */
847  /* maximum number of iterations */
848  index_t maxiter=100;
849 
850  /* minimum step taken in line search */
851  float64_t minstep=1E-10;
852 
853  /* for numerically strict pd of hessian */
854  float64_t sigma=1E-12;
855  float64_t eps=1E-5;
856 
857  /* construct target support */
858  float64_t hiTarget=(prior1+1.0)/(prior1+2.0);
859  float64_t loTarget=1/(prior0+2.0);
860  index_t length=prior1+prior0;
861 
862  SGVector<float64_t> t(length);
863  for (index_t i=0; i<length; ++i)
864  {
865  if (scores[i]>0)
866  t[i]=hiTarget;
867  else
868  t[i]=loTarget;
869  }
870 
871  /* initial Point and Initial Fun Value */
872  /* result parameters of sigmoid */
873  float64_t a=0;
874  float64_t b=CMath::log((prior0+1.0)/(prior1+1.0));
875  float64_t fval=0.0;
876 
877  for (index_t i=0; i<length; ++i)
878  {
879  float64_t fApB=scores[i]*a+b;
880  if (fApB>=0)
881  fval+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
882  else
883  fval+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
884  }
885 
886  index_t it;
887  float64_t g1;
888  float64_t g2;
889  for (it=0; it<maxiter; ++it)
890  {
891  SG_SDEBUG("Iteration %d, a=%f, b=%f, fval=%f\n", it, a, b, fval)
892 
893  /* Update Gradient and Hessian (use H' = H + sigma I) */
894  float64_t h11=sigma; //Numerically ensures strict PD
895  float64_t h22=h11;
896  float64_t h21=0;
897  g1=0;
898  g2=0;
899 
900  for (index_t i=0; i<length; ++i)
901  {
902  float64_t fApB=scores[i]*a+b;
903  float64_t p;
904  float64_t q;
905  if (fApB>=0)
906  {
907  p=CMath::exp(-fApB)/(1.0+CMath::exp(-fApB));
908  q=1.0/(1.0+CMath::exp(-fApB));
909  }
910  else
911  {
912  p=1.0/(1.0+CMath::exp(fApB));
913  q=CMath::exp(fApB)/(1.0+CMath::exp(fApB));
914  }
915 
916  float64_t d2=p*q;
917  h11+=scores[i]*scores[i]*d2;
918  h22+=d2;
919  h21+=scores[i]*d2;
920  float64_t d1=t[i]-p;
921  g1+=scores[i]*d1;
922  g2+=d1;
923  }
924 
925  /* Stopping Criteria */
926  if (CMath::abs(g1)<eps && CMath::abs(g2)<eps)
927  break;
928 
929  /* Finding Newton direction: -inv(H') * g */
930  float64_t det=h11*h22-h21*h21;
931  float64_t dA=-(h22*g1-h21*g2)/det;
932  float64_t dB=-(-h21*g1+h11*g2)/det;
933  float64_t gd=g1*dA+g2*dB;
934 
935  /* Line Search */
936  float64_t stepsize=1;
937 
938  while (stepsize>=minstep)
939  {
940  float64_t newA=a+stepsize*dA;
941  float64_t newB=b+stepsize*dB;
942 
943  /* New function value */
944  float64_t newf=0.0;
945  for (index_t i=0; i<length; ++i)
946  {
947  float64_t fApB=scores[i]*newA+newB;
948  if (fApB>=0)
949  newf+=t[i]*fApB+CMath::log(1+CMath::exp(-fApB));
950  else
951  newf+=(t[i]-1)*fApB+CMath::log(1+CMath::exp(fApB));
952  }
953 
954  /* Check sufficient decrease */
955  if (newf<fval+0.0001*stepsize*gd)
956  {
957  a=newA;
958  b=newB;
959  fval=newf;
960  break;
961  }
962  else
963  stepsize=stepsize/2.0;
964  }
965 
966  if (stepsize<minstep)
967  {
968  SG_SWARNING("CStatistics::fit_sigmoid(): line search fails, A=%f, "
969  "B=%f, g1=%f, g2=%f, dA=%f, dB=%f, gd=%f\n",
970  a, b, g1, g2, dA, dB, gd);
971  }
972  }
973 
974  if (it>=maxiter-1)
975  {
976  SG_SWARNING("CStatistics::fit_sigmoid(): reaching maximal iterations,"
977  " g1=%f, g2=%f\n", g1, g2);
978  }
979 
980  SG_SDEBUG("fitted sigmoid: a=%f, b=%f\n", a, b)
981 
983  result.a=a;
984  result.b=b;
985 
986  SG_SDEBUG("leaving CStatistics::fit_sigmoid()\n")
987  return result;
988 }
989 
/* Regime thresholds used by lnormal_cdf(): the series branch is taken when
 * x*x < ERFC_CASE1, the tail branch when x < ERFC_CASE2. */
const float64_t CStatistics::ERFC_CASE1=0.0492;

const float64_t CStatistics::ERFC_CASE2=-11.3137;
SGMatrix< T > transpose_matrix(const SGMatrix< T > &A)
int32_t index_t
Definition: common.h:72
#define SG_SWARNING(...)
Definition: SGIO.h:163
void scale(SGVector< T > &a, SGVector< T > &result, T alpha=1)
Definition: SGMatrix.h:25
SGSparseVector< T > * sparse_matrix
array of sparse vectors of size num_vectors
#define REQUIRE(x,...)
Definition: SGIO.h:181
index_t num_vectors
total number of vectors
std::enable_if<!std::is_same< T, complex128_t >::value, float64_t >::type mean(const Container< T > &a)
#define SG_SPRINT(...)
Definition: SGIO.h:165
#define ASSERT(x)
Definition: SGIO.h:176
double float64_t
Definition: common.h:60
long double floatmax_t
Definition: common.h:61
index_t num_features
total number of features
index_t num_rows
Definition: SGMatrix.h:495
index_t num_cols
Definition: SGMatrix.h:497
This class contains some utilities for Eigen3 Sparse Matrix integration with shogun. Currently it provides a method for converting SGSparseMatrix to Eigen3 SparseMatrix.
Definition: eigen3.h:48
all of classes and functions are contained in the shogun namespace
Definition: class_list.h:18
T sum(const Container< T > &a, bool no_diag=false)
#define SG_SDEBUG(...)
Definition: SGIO.h:153
#define M_SQRT1_2
Definition: Statistics.cpp:29
#define SG_SERROR(...)
Definition: SGIO.h:164
T max(const Container< T > &a)
index_t vlen
Definition: SGVector.h:571

SHOGUN Machine Learning Toolbox - Documentation