Source code for Stats

#
#   Stats.PY
#   functions to draw probability distribution function of fractianal polarization of sources.
#
#   date: 2017-12-21
#   author: GIUSEPPE PUGLISI
#
#   Copyright (C) 2017   Giuseppe Puglisi    giuspugl@sissa.it
#

import pylab as pl
import healpy as hp
import numpy as np
import h5py as h5
import sys
#from polarization_pdf import *
import scipy as sp
from scipy import interpolate
from scipy import stats
from pointsource_utilities import *
import glob
from scipy import special as spec

[docs]def lognormal_distribution(X,A,Xm, sigma ): """ Compute log-normal distribution on `X`: .. math:: \mathcal{P} (\Pi ) =A/ (\sqrt{2\pi \sigma^2} \Pi) \exp [ - ( \ln(\Pi / \mu))^2 /{2\sigma^2} ] **Parameters** - `X`:{array or float} polarization fraction array - `A`:{float} constant - `Xm`:{float} - `sigma`:{float} """ pi=np.pi return A* 1./(X*sigma *np.sqrt(2*pi)) * np.exp(- (np.log(X/Xm ))**2 /2./sigma**2)
[docs]def compute_poissonian_uncertainties(bc,Ntot,S,perr): """ Compute Poissonian upper and lower limit uncertainties from Gehrels 1986. **Parameters** - `bc`:{array} counts within each bin - `Ntot`: {int} total number of data - `S`: {int} the confidence level (CL) for uncertainties, we consider S=1, meaning 68% CL, S=2 for the 95%, etc... - `perr`:{array} array where upper and lower uncertainties are stored, `shape= 2 x size(bc)`. """ ppl=[lambda N:N * (1. - 1. / (9. * N) - S / (3. * np.sqrt(N))) ** 3, lambda N: N + S * np.sqrt(N + 1) + (S**2 + 2) / 3.] # Approximations Gehrels 1986 for i in range(len(bc)) : n=bc[i] if n<=100: perr[1][i]= (ppl[1] (n) - n)/Ntot if n==0: perr[0][i]= perr[1][i] else: perr[0][i]=(n- ppl[0](n))/Ntot else: perr[0][i]= S*np.sqrt(n)/Ntot perr[1][i]= S*np.sqrt(n)/Ntot return perr
[docs]def bootstrap_resampling(I, sigmaI, P, sigmaP,nsamples,upper_limit=False): """ It performs the bootstrap resampling from observations. For each group of values (I,P, sigmaI, sigmaP) it resamples `nsamples` values with Gaussian random values `Isamples` and `Psamples` with center around I, or P and width sigmaI and sigmaP. If data are upper limit it resamples by assuming a uniform distribution of random number from 0 to :math:`\sqrt{P^2 +\sigma_P^2}`. **Parameters** - `I, sigmaI, P, sigmaP`={floats} values for resampling - `nsamples`:{int} how many resampling to do - `upper_limit`:{bool} **Returns** - `f`:{array} resampled fractional polarization """ Isamples=abs(np.random.normal(loc=I, scale=sigmaI, size=nsamples)) if upper_limit: Psamples=np.random.uniform(low=0.0, high=np.sqrt(P**2+sigmaP**2), size=nsamples)# else: Psamples=abs(np.random.normal(loc=P, scale=sigmaP, size=nsamples)) f= Psamples/Isamples m=np.ma.masked_less_equal(Psamples/Isamples,1.) # mask the sources whose P>I return f[m.mask]
[docs]def pi2(mu, sigma, apply_correction=False ) : """ Compute :math:`< \Pi^2 >` from :math:`\mu, \sigma` lognormal best fit parameters, following Battye et al. 2011. .. math:: <\Pi^2 > = \mu^2 e^{2\sigma^2} """ if apply_correction: t= np.log(1./mu)/ np.sqrt(2)/sigma factor=( (1+ spec.erf(t+ (np.sqrt(2) *sigma ) ) ) / (1 +spec.erf(t) ) ) else: factor=1 pi2= mu**2 *np.exp(2.*sigma**2) * factor return pi2
[docs]def pi(mu, sigma, apply_correction=False) : """ Compute :math:`< \Pi >` from :math:`\mu, \sigma` lognormal best fit parameters, following Battye et al. 2011. .. math:: <\Pi> = \mu e^{\sigma^2/2} """ if apply_correction: t= np.log(1./mu)/ np.sqrt(2)/sigma factor=( (1+ spec.erf(t+ (sigma/np.sqrt(2) ) ) ) / (1 +spec.erf(t) ) ) else: factor=1 pi= mu *np.exp(.5*sigma**2) * factor return pi
[docs]def pimed(mu, sigma, apply_correction=False ): """ Compute :math:`\Pi_{med}` from :math:`\mu, \sigma` lognormal best fit parameters, following Battye et al. 2011. """ if apply_correction: t= np.log(1./mu)/ np.sqrt(2)/sigma factor=np.exp( np.sqrt(2)*sigma * spec.erfinv( .5 * (spec.erf(t) -1 ) ) ) else: factor=1. pi= mu *factor return pi
[docs]def err_p2(mu, sigma, cov): """ Compute errors on :math:`< \Pi^2 >` propagating uncertainties on :math:`\mu, \sigma` parameters. """ dfdmu=2.* mu*np.exp(2.*sigma**2) dfdsig=4.* mu**2 *np.exp(2.*sigma**2) * sigma deltap2= np.sqrt(dfdmu**2 * cov[1,1] + 2. * dfdmu* dfdsig* cov[2,1] + dfdsig**2* cov[2,2] ) return deltap2
[docs]def err_pi (mu, sigma,cov ): """ Compute errors on :math:`< \Pi >` propagating uncertainties on :math:`\mu, \sigma` parameters. """ dfdmu= np.exp(.5*sigma**2) dfdsig=mu*sigma *np.exp(.5*sigma**2) deltap= np.sqrt(dfdmu**2 * cov[1,1] + 2. * dfdmu* dfdsig* cov[2,1] + dfdsig**2* cov[2,2] ) return deltap
[docs]def chisquare(obs,exp,sigma): return np.sum(((obs-exp)/sigma)**2)
[docs]def fitting_lognormal_from_fluxes(flux, polflux,eI, eP, idstring, fig=None, nbins=15, resampling =True,workdir='./', **kwargs): """ Fit logrnormal distribution of fractional polarization from catalog encoding total intensity and polarized fluxes. **Parameters** - `flux, eI`:{array} total intensity fluxes and errors - `polflux, eP`:{array} polarization flux and errors - `idstring`: {string} string used for plotting - `fig`:{int} id of figure - `nbins`:{int} fractional polarization histogram bins - `resampling`:{bool} if `True` resample the catalog with 1000 resampling, otherwise it does not. - `workdir`:{string} path to store plots and files. - `saveplot`:{string} name and format to store the plot of the lognormal fit in `workdir` - `bestfitparams2file`:{bool} save the computed best fit parameters, the average fractional polarizations to binary file `*.npy` in `workdir`. **Returns** - `Pim, Pi2, Pi`:{floats} :math:`\Pi_{med},< \Pi^2 >,< \Pi >` - `deltapim, deltap2 deltap` errors on the computed quantities. """ if not 'saveplot' in kwargs: kwargs['saveplot']=None if not 'verbose' in kwargs: kwargs['verbose']=True if not 'bestfitparams2file' in kwargs: kwargs['bestfitparams2file']=True fpol=np.array(polflux/flux) if resampling : ziplist=zip(flux, polflux,eI, eP) resampled=[] nresampl=1000 for I,P,sigmaI,sigmaP in ziplist : resampled.append(p_resampling(I,sigmaI,P,sigmaP,nresampl)) resampled=np.concatenate(resampled) else : resampled=fpol if min(fpol)/max(fpol)< 1e-3: if kwargs['verbose']:print "using a not uniform binning" edges= np.concatenate([np.linspace(min(fpol),min(fpol)*5., 3), np.linspace(min(fpol)*5., max(fpol), nbins-2)]) else: edges=np.linspace(min(fpol),max(fpol),nbins+1) pdf,edges=np.histogram(resampled,bins= edges,normed=False ) bins=np.array([.5*(edges[i+1] +edges[i] ) for i in range(nbins-1 )]) maskbins=np.digitize(resampled,bins) mbins=np.digitize(fpol,bins) Ntot_r=1.*len(resampled) Ntot= 1.*len(fpol) pdf= 1.*pdf /Ntot_r pmean=np.zeros(nbins) deltab=np.array([.5*(edges[i+1] -edges[i] ) *100. for i in xrange(nbins)]) bc= np.bincount(mbins) S=1 # 1 sigma 68% CL perr=np.zeros(2*(nbins)).reshape(2,nbins ) perr=compute_poissonian_uncertainties(bc,Ntot,S,perr) for i in range(max(maskbins)+1) : idx=np.where(maskbins == i)[0] pmean[i]= np.mean(resampled[idx]) try: pl.figure(fig) pl.errorbar(pmean*100.,pdf,yerr=[perr[0],perr[1]],fmt='go') popt,pcov=so.curve_fit(lognormal_distribution, (pmean)*100., pdf,sigma= np.sqrt(perr[1]**2 + perr[0]**2)) chi2=chisquare(pdf ,lognormal_distribution(pmean*100. ,*popt) , np.sqrt(perr[1]**2 + perr[0]**2)) DOF=(nbins-len(popt)) redchi2=chi2/DOF pte = 1- stats.chi2.cdf(chi2,DOF) if kwargs['verbose']: print "----"*20,"\n","Fitting lognormal distribution on "+idstring+" with %d data"%(len(fpol)) print "A= %g\nmu =%g +- %g \nsigma=%g per cent +- %g"%(popt[0],(popt[1]),np.sqrt(pcov[1,1]),popt[2],np.sqrt(pcov[2,2])) print "Reduced Chi^2 : %g\nPTE :%g"%(redchi2, pte ) print "----"*20 x=np.linspace(0.1,max(fpol)*100.,1024) pl.plot(x,lognormal_distribution(x,*popt),label=idstring + r' $A= %.2f ,\, \mu= %.2f\, \sigma=%.2g $'%(popt[0],(popt[1]),popt[2])) Pi_m=popt[1] deltapim=np.sqrt(pcov[1,1]) sigma=popt[2] Pi2= Pi_m**2*np.exp(2.*sigma**2) deltap2=err_p2(Pi_m, sigma, pcov) Pi= Pi_m*np.exp(sigma**2/2.) deltap =err_pi(Pi_m, sigma, pcov) except RuntimeError: print "Unable to fit Lognormal too few data points" Pi_m=np.median(resampled) Pi2= np.mean(resampled**2) deltap2=0. Pi= np.mean(resampled) deltap=0. deltapim=0. pass finally: pl.xticks(fontsize=14) pl.yticks(fontsize=15) pl.xlabel(r'$\Pi$'+' [ % ] ' ,fontsize=14) pl.ylabel(r'$P\,(\Pi)$ ',fontsize=15) pl.legend(loc='best', fontsize=14) pl.tight_layout() #pl.title(idstring) if max(fpol)>.5: pl.xlim([0.,30.]) else: pl.xlim([0,max(fpol)*100.]) if kwargs['saveplot'] is not None : pl.savefig(workdir+idstring+kwargs['saveplot']) pl.close() if kwargs['bestfitparams2file']: np.save(workdir+'bestfit_params_'+idstring, [popt, pcov, redchi2, pte, Pi_m, deltapim, Pi, deltap, Pi2, deltap2]) return Pi_m, Pi2, deltap2,deltapim, Pi, deltap
[docs]def resampling_fpol(fpol, sigma,flag, nresampl=100): """ Resampling a catalogue starting from fractional polarization values, errors and flags into the data. Flags identify which data have to be considered as upper limits. **Parameters** - `fpol`:{float} - `sigma`:{float} - `flag`:{bool} - `nresampl`:{int} how many resampling to do **Returns** - `fpolres`:{array} array of resampled fractional polarizations. """ if flag: fpolres=np.random.uniform(low=0.,high=fpol, size=nresampl) else: fpolres=abs(np.random.normal(loc=fpol,scale=sigma, size=nresampl)) return fpolres
[docs]def fitting_lognormal_from_fpol(fpol,errfpol, flags,idstring, fig=None, nbins=15, resampling=True,workdir='./', **kwargs): """ Fit logrnormal distribution of fractional polarization from catalog encoding fractional polarization and errors on it. **Parameters** - `pol`:{array} fractional polarization - `errfpol`:{array} errors of fractional polariz. - `flag`:{array} if `flag=1 ` or `flag=True` it means that data are flagged and they are considered as upper limits. The rest of the parameters are the very same as in :func:`fitting_lognormal_from_fluxes` """ if not 'saveplot' in kwargs: kwargs['saveplot']=None if not 'verbose' in kwargs: kwargs['verbose']=True if not 'bestfitparams2file' in kwargs: kwargs['bestfitparams2file']=True if resampling : resampled=[] nresampl=1000 for P,eP, F in zip(fpol,errfpol, flags): resampled.append(resampling_fpol(P,eP,F,nresampl=nresampl )) resampled=np.concatenate(resampled) else : resampled=(fpol) if min(fpol)/max(fpol)< 1e-3: if kwargs['verbose']:print "using a not uniform binning" edges= np.concatenate([np.linspace(min(fpol),min(fpol)*5., 3), np.linspace(min(fpol)*5., max(fpol), nbins-2)]) else: edges=np.linspace(min(fpol),max(fpol),nbins+1) pdf,edges=np.histogram(resampled,bins= edges,normed=False ) bins=np.array([.5*(edges[i+1] +edges[i] ) for i in range(nbins-1 )]) maskbins=np.digitize(resampled,bins) mbins=np.digitize(fpol,bins) Ntot_r=1.*len(resampled) Ntot= 1.*len(fpol) pdf= 1.*pdf /Ntot_r pmean=np.zeros(nbins) deltab=np.array([.5*(edges[i+1] -edges[i] ) *100. for i in xrange(nbins)]) bc= np.bincount(mbins) S=1 # 1 sigma 68% CL perr=np.zeros(2*(nbins)).reshape(2,nbins ) perr=compute_poissonian_uncertainties(bc,Ntot,S,perr) for i in range(max(maskbins)+1) : idx=np.where(maskbins == i)[0] pmean[i]= np.mean(resampled[idx]) pl.figure(fig) pl.errorbar(pmean*100.,pdf,yerr=[perr[0],perr[1]],fmt='go') try: popt,pcov=so.curve_fit(lognormal_distribution, (pmean)*100., pdf,sigma= np.sqrt(perr[1]**2 + perr[0]**2)) mu=popt[1] sigma=popt[2] Pi_m = pimed(mu, sigma) deltapim=np.sqrt(pcov[1,1]) Pi2= pi2(mu, sigma) Pi=pi (mu, sigma ) chi2=chisquare(pdf ,lognormal_distribution(pmean*100. ,*popt) , np.sqrt(perr[1]**2 + perr[0]**2)) DOF=(nbins-len(popt)) redchi2=chi2/DOF pte = 1- stats.chi2.cdf(chi2,DOF) if kwargs['verbose']: print "----"*20,"\n","Fitting lognormal distribution on "+idstring+" with %d data"%(len(fpol)) print "A= %g\nmu =%g +- %g \nsigma=%g per cent +- %g"%(popt[0],(popt[1]),np.sqrt(pcov[1,1]),popt[2],np.sqrt(pcov[2,2])) print "Reduced Chi^2 : %g\nPTE :%g"%(redchi2, pte ) print "----"*20 x=np.linspace(0.1,max(fpol)*100.,1024) pl.plot(x,lognormal_distribution(x,*popt),label=idstring+r' $A= %.2f ,\, \mu= %.2f, \, \sigma=%.2g $'%(popt[0],(popt[1]),popt[2])) deltap2=err_p2(Pi_m, sigma, pcov) deltap =err_pi(Pi_m, sigma, pcov) except RuntimeError: print "Unable to fit Lognormal too few data points" Pi_m=np.median(resampled) Pi2= np.mean(resampled**2) Pi= np.mean(resampled) deltap2=0. deltap=0. deltapim=0. pass finally: pl.xticks(fontsize=14) pl.yticks(fontsize=15) pl.xlabel(r'$\Pi$'+' [ % ] ' ,fontsize=14) pl.ylabel(r'$P\,(\Pi)$ ',fontsize=15) pl.legend(loc='best', fontsize=14) #pl.title(idstring) pl.tight_layout() if max(fpol)>.5: pl.xlim([0.,10.]) else: pl.xlim([0,max(fpol)*100.]) if kwargs['saveplot'] is not None : pl.savefig(workdir+idstring+kwargs['saveplot']) pl.close() if kwargs['bestfitparams2file']: np.save(workdir+'bestfit_params_'+idstring, [popt, pcov, redchi2, pte, Pi_m, deltapim, Pi, deltap, Pi2, deltap2]) return Pi_m, Pi2, deltap2,deltapim, Pi, deltap