#! /usr/bin/env python
"""\
%prog [=ipol.dat] [opts]

Interpolate histo bin values as a function of the parameter space by loading
run data and parameter lists from run directories in $runsdir (often "mc")

TODO:
 * Use weight file position matches to exclude some bins, as well as path matching
 * Handle run combination file/string (write a hash of the run list into the ipol filename?)
 * Support asymm error parameterisation
"""

# NOTE: Python 2 script (print statements, xrange, dict.has_key, iteritems,
# "except E, e", "exec ... in ...").  It drives an iterative
# parameterise -> tune cycle with the professor2 toolkit:
#   1. load MC run histograms + parameter points and reference data,
#   2. build polynomial interpolations ("ipols") of each bin vs. the params,
#   3. minimise a chi2-like GoF with iminuit,
#   4. re-centre the ipol on the fitted point and repeat.

import optparse, os, sys

## Command-line interface.  NB. the module docstring above doubles as the
## usage text via usage=__doc__, so its wording is user-visible output.
op = optparse.OptionParser(usage=__doc__)
op.add_option("--pname", "--pfile", dest="PNAME", default="params.dat", help="Name of the params file to be found in each run directory (default: %default)")
op.add_option("--limits", dest="LIMITS", default=None, help="Simple text file with parameter limits and fixed parameters")
op.add_option("--wfile", dest="WFILE", default=None, help="Path to a weight file, used to restrict ipol building to a subset of bins (default: %default)")
op.add_option("--order", dest="ORDER", default=3, type=int, help="Global order of polynomials for interpolation")
op.add_option("--ierr", dest="IERR", default="symm", help="Whether to interpolate MC errors: none, mean, median, symm (default: %default)") #< add rel, asymm
op.add_option("--eorder", dest="ERR_ORDER", default=None, type=int, help="Global order of polynomials for uncertainty interpolation (default: same as from --order)")
op.add_option("--rc", dest="RUNCOMBS", default=None, help="Ru combination file")
op.add_option("--filter", dest="FILTER", action="store_true", default=False, help="Filter out data bins that have 0 error")
# TODO: Add a no-scale option
# TODO: Change to instead (optionally) specify the max number of parallel threads/procs. Use by default if ncores > 1?
op.add_option("-j", dest="MULTI", type=int, default=1, help="Number of threads to use")
op.add_option("-s", "--strategy", dest="STRATEGY", default=1, type=int, help="Set Minuit strategy [0 fast, 1 default, 2 slow]")
op.add_option("--summ", dest="SUMMARY", default=None, help="Summary description to be written to the ipol output file")
op.add_option("-v", "--debug", dest="DEBUG", action="store_true", default=False, help="Turn on some debug messages")
op.add_option("-q", "--quiet", dest="QUIET", action="store_true", default=False, help="Turn off messages")
op.add_option("--split", dest="SPLIT", default=None, help="Can incorporate a linear split in parameter space. Provide \"ParamName1 ParamName2 Value1 Value2 Gradient (these describe the line down which to split the plot. Value1 and Value2 form the coords of a point on the line.) ABOVE/BELOW (ABOVE => use runs above the line etc)\"")
opts, args = op.parse_args()

## Get mandatory arguments
# args[0] = directory of MC run subdirs; args[1] (read below) = reference
# data dir.  NOTE(review): no arg-count check -- missing args give a bare
# IndexError rather than a usage message.
RUNSDIR = args[0]

## By default, use the same ipol order for errors as for values
if opts.ERR_ORDER is None:
    opts.ERR_ORDER = opts.ORDER

## Load the Professor machinery
import professor2 as prof
if not opts.QUIET:
    print prof.logo

# Placeholder for a per-bin maximum-ipol-error cap; never populated in this
# script, so the MAXERRS machinery below is effectively disabled.
MAXERRDICT = None

## Load MC run histos and params
# PARAMS: {run -> {paramname -> value}}; HISTOS: {histoname -> {run -> histo}}
# (structure as consumed by prof.mk_ipolinputs / prof.mk_ipolhisto below)
PARAMS, HISTOS = prof.read_all_rundata(RUNSDIR, opts.PNAME)
RUNS, PARAMNAMES, PARAMSLIST = prof.mk_ipolinputs(PARAMS)
print "Found RUNS", len(RUNS)
print HISTOS.keys()

## Reference (data) histograms to tune against
REFDIR = args[1]
DHISTOS = prof.read_all_histos(REFDIR)

## Weight file parsing to select a histos subset
if opts.WFILE:
    matchers = prof.read_pointmatchers(opts.WFILE)
    # .keys() is a list copy in Python 2, so deleting during iteration is safe
    for hn in HISTOS.keys():
        if not any(m.match_path(hn) for m in matchers.keys()):
            del HISTOS[hn]
        elif opts.DEBUG:
            print "Observable %s passed weight file path filter" % hn
    print "Filtered observables by path, %d remaining" % len(HISTOS)
HNAMES = HISTOS.keys()

## If there's nothing left to interpolate, exit!
if not HNAMES:
    print "No observables remaining... exiting"
    sys.exit(1)

def trivGOF(run):
    """Trivial raw-MC goodness of fit for a single run.

    Sums (data - MC)^2 / (data_err^2 + MC_err^2) over every bin of every
    observable in HNAMES, comparing run's MC histos directly to the
    reference data (no interpolation involved).  Returns (gof, nd) where
    nd counts the bins that contributed.
    """
    gof=0
    nd=0
    for hn in HNAMES:
        data = DHISTOS[hn]
        # NOTE(review): raises KeyError if this run lacks histo hn -- callers
        # below invoke trivGOF even for runs just flagged as incomplete.
        mc = HISTOS[hn][run]
        for i in xrange(data.nbins):
            y_d = data.bins[i].val
            dy_d = data.bins[i].err
            y_m = mc.bins[i].val
            dy_m = mc.bins[i].err
            try:
                gof += (y_d - y_m)**2 /(dy_d**2 + dy_m**2)
                nd+=1
            # NOTE(review): bare except -- intended to skip zero-error bins
            # (ZeroDivisionError) but silently swallows everything else too.
            except:
                pass
    return gof, nd

## Robustness pass: flag runs that miss any histogram, or whose trivial GoF
## is not positive (identical-to-data or degenerate runs), for removal.
bad, badnum = [], []
for irun, run in enumerate(RUNS):
    for hn in HNAMES:
        if not HISTOS[hn].has_key(run):
            bad.append(run)
            badnum.append(irun)
            break
    if not trivGOF(run)[0] >0 :
        print "Removing run beacuse of gof", run
        if not run in bad:
            bad.append(run)
            badnum.append(irun)

if bad:
    print "Found %d bad runs in %d total... removing" % (len(bad), len(RUNS))
    goodr, goodp = [], []
    for irun, run in enumerate(RUNS):
        if not irun in badnum:
            goodr.append(run)
            goodp.append(PARAMSLIST[irun])
    RUNS = goodr
    PARAMSLIST = goodp

## Global parameter-space extent, used as a per-axis metric so that
## distances in filterInputs() are scale-invariant across parameters.
from numpy import array
PARRAY=array(PARAMSLIST)
GLOBLIM = []
for i in xrange(len(PARAMNAMES)):
    GLOBLIM.append([min(PARRAY[:,i]), max(PARRAY[:,i])])
GLOBMET = array([x[1]-x[0] for x in GLOBLIM])

## Pick the parameter point of the run with the smallest trivial GoF as an
## alternative starting centre for the second ipol iteration below.
points = zip(RUNS, PARRAY)
points=sorted(points, key=lambda point: trivGOF(point[0])[0] )
# print points[0], trivGOF(points[0][0])
startpoint=points[0][1]

def distance(a, b):
    # NOTE(review): unused stub -- always returns 0.
    return 0

def filterInputs(center):
    """Return the runs to use for ipol building around *center*.

    With center=None all runs are used; otherwise the runs are ranked by
    GLOBMET-normalised Euclidean distance of their parameter point from
    *center*, and the closest 2x numCoeffs(dim, order) runs are kept
    (factor 2 oversampling of the minimal polynomial fit).
    """
    if center is None:
        return RUNS
    import numpy as np
    minnruns = int(prof.numCoeffs(len(PARAMNAMES), opts.ORDER)*2)
    points = zip(RUNS, PARRAY)
    points=sorted(points, key=lambda point: np.sqrt(sum(np.square((center-point[1])/GLOBMET))))
    return [x[0] for x in points[0:minnruns]]

def mkIpol(center=None, asString=False):
    """Build ipol histos from the runs selected around *center*.

    Returns (IHISTOS, LOCLIM): a dict of interpolated histograms keyed by
    histo name, and the local per-parameter [min, max] box spanned by the
    selected runs (used later as Minuit limits).
    NOTE(review): *asString* is accepted but never used.
    """
    ## Robustness tests and cleaning: only retain runs that contain every histo
    # TODO: combine with weights histo vetoing -- should we explicitly unload unused run data, or keep it for the next combination to use? Or do we now leave runcombs to the user?
    thisRUNS = filterInputs(center)
    thisPARAMSLIST = [PARAMS[x].values() for x in thisRUNS]
    from numpy import array
    tpar = array(thisPARAMSLIST)
    LOCLIM = [[min(tpar[:,i]), max(tpar[:,i])] for i in xrange(len(PARAMNAMES))]
    IHISTOS = {}
    for hn in HNAMES:
        print "Parametrising", hn
        ih = prof.mk_ipolhisto(HISTOS[hn], thisRUNS, thisPARAMSLIST, opts.ORDER, opts.IERR, opts.ERR_ORDER)
        IHISTOS[hn] =ih
    return IHISTOS, LOCLIM

def prepareTune(thisIHISTOS):
    """Match ipol histos to reference histos and flatten them to bin lists.

    Attaches a fit weight .w to every ipol bin (1.0 by default, or taken
    from the weight file), drops histos absent from the weight file or with
    inconsistent bin counts, and returns (IBINS, DBINS, BINDICES): parallel
    lists of ipol/data bins plus a name->indices map for error reporting.
    Side effects: mutates *thisIHISTOS* and the global DHISTOS (deletions),
    and exits the process if no bins survive.
    """
    ## Weight file parsing
    matchers = prof.read_pointmatchers(opts.WFILE) if opts.WFILE else None

    ## Find things available in both ipol and ref data, and in the weight file if there is one
    available = []
    for ihn in sorted(thisIHISTOS.keys()):
        ## Set default bin weights
        for ib in thisIHISTOS[ihn].bins:
            ib.w = 1.0
        ## Find user-specified bin weights if there was a weight file
        if matchers is not None:
            ## Find matches
            pathmatch_matchers = [(m,wstr) for m,wstr in matchers.iteritems() if m.match_path(ihn)]
            ## Ditch histos not listed in the weight file
            if not pathmatch_matchers:
                del thisIHISTOS[ihn]
                continue
            ## Attach fit weights to the ibins, setting to zero if there's no position match
            for ib in thisIHISTOS[ihn].bins:
                posmatch_matchers = [(m,wstr) for (m,wstr) in pathmatch_matchers if m.match_pos(ib)]
                ib.w = float(posmatch_matchers[-1][1]) if posmatch_matchers else 0 #< NB. using last match
        ## Pair the ipol histo name with the (usually "/REF/"-prefixed) data name
        for rhn in DHISTOS.keys():
            if ihn in rhn: #< TODO: short for rhn = "/REF/"+ihn ?
                # TODO: we should eliminate this potential mismatch of ref and MC hnames
                available.append([ihn,rhn])
                break #< TODO: ok?
        # else:
        #     print "Could not find %s"%ihn

    ## Prepare lists of ibins and dbins
    IBINS, DBINS, MAXERRS, FILTERED = [], [], [], []
    BINDICES={} # Allows for more helpful error messages in case of prof.StatError
    for a in available:
        # TODO: print out the available observables
        if len(thisIHISTOS[a[0]].bins) != len(DHISTOS[a[1]].bins):
            print "Inconsistency discovered between data bins and parametrised bins:"
            print "Removing histogram", a[0]
            del thisIHISTOS[a[0]]
            del DHISTOS[a[1]]
        else:
            BINDICES[a[0]] = []#range(len(IBINS), len(IBINS) + len(thisIHISTOS[a[0]])) # This is for debugging
            for nb in xrange(len(thisIHISTOS[a[0]].bins)):
                ## Optionally drop zero-error data bins (cf. --filter)
                if opts.FILTER and DHISTOS[a[1]].bins[nb].err ==0:
                    FILTERED.append(1)
                    continue
                ## Only bins with positive weight enter the fit
                if thisIHISTOS[a[0]].bins[nb].w >0:
                    IBINS.append(thisIHISTOS[a[0]].bins[nb])
                    DBINS.append(DHISTOS[a[1]].bins[nb])
                    # NOTE(review): len(IBINS) is taken AFTER the append, so
                    # this stores index+1 while simpleGoF's num is 0-based --
                    # the culprit lookup there looks shifted by one; confirm.
                    BINDICES[a[0]].append(len(IBINS))
            # NOTE(review): dead code -- MAXERRDICT is always None here, and
            # MAXERRS[a[0]] indexes a list with a string (MAXERRDICT[a[0]]?).
            if MAXERRDICT:
                MAXERRS.extend(MAXERRS[a[0]])
    if not MAXERRS:
        MAXERRS = None

    if opts.DEBUG:
        print "DEBUG: filtered %i bins due to zero data error" % len(FILTERED)

    ## Sanity checks
    assert len(IBINS) == len(DBINS)
    if not IBINS:
        print "No bins... exiting"
        sys.exit(1)
    assert MAXERRS is None or len(IBINS) == len(MAXERRS)

    return IBINS, DBINS, BINDICES

def mkTune(thisIBINS, thisDBINS, thisBINDICES, thisLOCLIM):
    """Run a Minuit minimisation of the weighted chi2 over the ipol bins.

    Starts at the centre of the thisLOCLIM hypercube with step = range/10,
    limits each parameter to its thisLOCLIM box, and applies fixes from the
    --limits file.  Returns (result, chi2, ndof) with result the best-fit
    parameter values in PARAMNAMES order.
    NOTE(review): chi2 and ndof are only assigned inside "if not
    opts.QUIET", so running with --quiet makes the return raise NameError.
    """
    def simpleGoF(params):
        """
        Very straightforward goodness-of-fit measure: sum over bins of
        w * (data - ipol)^2 / (data_err^2 + ipol_err^2), skipping
        zero-weight bins and raising prof.StatError on zero total error.
        """
        chi2 = 0.0
        for num, ibin in enumerate(thisIBINS):
            ## Weight is attached to the ipol bin (default set to 1.0 above)
            w = ibin.w
            if w == 0:
                continue
            ## Get ipol & ref bin values and compute their difference
            ival = ibin.val(params)
            dval = thisDBINS[num].val
            diff = dval - ival
            ## Data error
            err2 = thisDBINS[num].err**2
            ## Plus interpolation error added in quadrature
            # maxierr = MAXERRS[ibin] if MAXERRS else None
            err2 += ibin.err(params)**2#, emax=maxierr)**2
            # TODO: compute asymm error for appropriate deviation direction cf. sum([e**2 for e in ibin.ierrs])
            if not err2:
                ## Work out which observable/bin caused the zero error, for the message
                culprit=""
                i_culprit=-1
                for k, v in thisBINDICES.iteritems():
                    if num in v:
                        culprit=k
                        i_culprit = v.index(num)
                raise prof.StatError("Zero uncertainty on a bin being used in the fit -- cannot compute a reasonable GoF!\n\tObservable: %s\n\t%s %f+=%f\n\t%s \nSee weight-syntax in documentation or user --filter CL arg to remove bins with zero data error automatically"%(culprit, ibin, ival, ibin.err(params, emax=42), thisDBINS[num]))
            # TODO: should we square w too, so it penalised deviations _linearly_?
            chi2 += w * diff**2 / err2
        return chi2

    ## Function definition wrapper
    # iminuit needs a function with one named argument per parameter;
    # prof.mk_fitfunc generates source for such a wrapper (profGoF) that
    # forwards to simpleGoF, and exec materialises it in this local scope.
    funcdef = prof.mk_fitfunc("simpleGoF", PARAMNAMES, "profGoF")
    exec funcdef in locals()
    if opts.DEBUG:
        print "Built GoF wrapper from:\n '%s'" % funcdef

    try:
        from iminuit import Minuit
    except ImportError, e:
        print "Unable to import iminuit, exiting", e
        print "Try installing iminuit with pip: pip install iminuit --user"
        import sys  # NOTE(review): redundant -- sys is already imported at module level
        sys.exit(1)

    if not opts.QUIET:
        print "\n"
        print 66*"*"
        print "* Using iminuit, please visit https://github.com/iminuit/iminuit *"
        print 66*"*"
        print "\n"

    ## Ignition
    ## Dictionary fitarg for iminuit
    FARG=dict()

    ## Initial conditions --- use pos = center of hypercube, and step = range/10
    # TODO: Optionally make an initial brute force scan to choose the Minuit starting point, using prof.scangrid
    pmins =[x[0] for x in thisLOCLIM]
    pmaxs =[x[1] for x in thisLOCLIM]
    pmids = [(pmins[i] + pmaxs[i])/2. for i in xrange(len(pmins))]
    pranges = [(pmaxs[i] - pmins[i]) for i in xrange(len(pmins))]
    for i, aname in enumerate(PARAMNAMES):
        FARG[aname] = pmids[i]
        FARG['error_%s'%aname] = pranges[i] / 10.

    ## Fix parameters, set limits (with pname translation)
    limits, fixed = prof.read_limitsandfixed(opts.LIMITS)

    for i, pname in enumerate(PARAMNAMES):
        # if pname in limits.keys():
        # NOTE(review): limits from the --limits file are read but not used;
        # the local run box is imposed instead -- confirm this is intended.
        FARG['limit_%s'%pname] = thisLOCLIM[i]
        if pname in fixed.keys():
            if not opts.QUIET:
                print "Fixing", pname, "= %f"%fixed[PARAMNAMES[i]]
            FARG[pname] = fixed[PARAMNAMES[i]]
            FARG['fix_%s'%pname] = True

    # TODO: errordef as CL params?
    PRINTLEVEL = 0 if opts.QUIET else 1

    # errordef=1 is the chi2 convention for 1-sigma parameter errors
    minuit = Minuit(profGoF, errordef=1, print_level=PRINTLEVEL, forced_parameters=PARAMNAMES, **FARG)
    minuit.strategy = opts.STRATEGY

    import time
    start_time = time.time()
    ## Lift off
    minuit.migrad()
    print("Minimisation finished after %s seconds" % (time.time() - start_time))

    ## Now process the result:
    ## Goodness of fit
    if not opts.QUIET:
        chi2 = minuit.fval
        ndof = len(thisDBINS) - (len(PARAMNAMES) - len(fixed.keys()))
        if ndof>0:
            print "'chi2': %.2f --- Ndf : %i --- ratio : %.2f" % (chi2, ndof, chi2/ndof)
        else:
            print "HAHA"

    ## Check if result is in validity range
    result = [minuit.values[p] for p in PARAMNAMES]
    return result, chi2, ndof

def mkHistoPred(ihs, params):
    """Evaluate the ipol histos *ihs* at *params*.

    Returns a list of Scatter2D objects (one per histo), suitable for
    yoda.writeYODA.
    """
    H=[]
    for k, v in ihs.iteritems():
        bins = [b.toDataBin(params) for b in v.bins]
        D=prof.DataHisto(bins, path=v.path)
        H.append(D.toScatter2D())
    return H

# from IPython import embed
# embed()

## --- Driver section ---

## Iteration 0: global ipol over all runs
I, loclim=mkIpol()
ib,db, bindi = prepareTune(I)
r,c,n=mkTune(ib,db,bindi,loclim)
scatters=mkHistoPred(I, r)
import yoda
yoda.writeYODA(scatters, "Iteration_mitalle1_ipolhistos.yoda" )

## Iteration 1: ipol centred on the best raw-GoF run's parameter point
I, loclim=mkIpol(startpoint)
ib,db, bindi = prepareTune(I)
r,c,n=mkTune(ib,db,bindi,loclim)
scatters=mkHistoPred(I, r)
import yoda  # NOTE(review): redundant re-import (harmless)
yoda.writeYODA(scatters, "Iteration_start_ipolhistos.yoda" )

## Iterations 2..11: re-centre the ipol on the previous fit result each time,
## recording iteration index, chi2 and fit point for inspection in the
## IPython session below.
X, Y,R=[],[],[]
import numpy
for i in xrange(10):
    I, loclim=mkIpol(r)
    ib,db, bindi = prepareTune(I)
    r,c,n=mkTune(ib,db,bindi,loclim)
    # print i, r
    # diff = truth - array(r)
    # Y.append(numpy.sqrt(sum([x*x for x in diff])))
    X.append(i)
    Y.append(c)
    R.append(r)
    import yoda
    scatters=mkHistoPred(I, r)
    yoda.writeYODA(scatters, "Iteration_%i_ipolhistos.yoda"%i )

## Drop into an interactive shell for post-hoc inspection of X, Y, R etc.
from IPython import embed
embed()