\documentclass[man]{apa} %\usepackage{amsmath,epsfig,amsfonts,amssymb,amsthm} \usepackage{amssymb} \usepackage{amsmath} \usepackage{verbatim} \usepackage{graphicx} %\usepackage{subfigure} %\usepackage{rotating} \usepackage{overpic} \newcommand{\reals}{\mathbb{R}} \newcommand{\positiveIntegers}{\mathbb{Z}_{+}} \newcommand{\lipschitzconstraint}{C_{j_1,j_2}} \newcommand{\lipschitzconstantij}{C_{i,j_2}} \newcommand{\lipschitzconstantji}{C_{j_1,i}} \newcommand{\half}{\frac{1}{2}} \newcommand{\onevec}{\underline{1}} \newcommand{\muaji} {\mu_{a,j_1,i}} \newcommand{\mubji} {\mu_{b,j_1,i}} \newcommand{\muaij} {\mu_{a,i,j_2}} \newcommand{\mubij} {\mu_{b,i,j_2}} \newcommand{\muajj} {\mu_{a,j_1,j_2}} \newcommand{\mubjj} {\mu_{b,j_1,j_2}} \newcommand{\gaji} {g_{a,i,(j_1,i)}(z_i)} \newcommand{\gbji} {g_{b,i,(j_1,i)}(z_i)} \newcommand{\gaij} {g_{a,i,(i,j_2)}(z_i)} \newcommand{\gbij} {g_{b,i,(i,j_2)}(z_i)} \newcommand{\KSstat}[1] {KS_{\textrm{stat}}(#1)} \newcommand{\partialderiv}[2] {\frac{\partial #1}{\partial #2}} \newcommand{\hessian}[3] {\frac{\partial^2 #1}{{\partial #2}{\partial #3}}} \newcommand{\parenth}[1] {\left(#1\right)} \newcommand{\braces}[1] {\left\{#1\right\}} \newcommand{\brackets}[1] {\left[#1\right]} \newcommand{\norm}[1] {\left\|#1\right\|} \newcommand{\infnorm}[1] {\left\|#1\right\|_{\infty}} \newcommand{\abs}[1] {\left|#1\right|} \newcommand{\cdf}[2] {F_{#1}\parenth{#2}} \newcommand{\prob}[1] {P\parenth{#1}} %\newcommand{\cndintfn}[1] {\lambda\parenth{#1|X_{#1}}} %\newcommand{\hatcndintfn}[1] {\hat{\lambda}\parenth{#1|X_{#1}}} \newcommand{\numspikes} {S} \newcommand{\simplex} {\mathcal{S}} \newcommand{\dualNetFloConstraintSet}[1] {\mathcal{P}_{#1}} \newcommand{\sumdy} {\numspikes} \newcommand{\tW} {\tilde{W}} \newcommand{\tn} {\tilde{n}} \newtheorem{theorem}{Theorem}[section] \newtheorem{lemma}[theorem]{Lemma} \newtheorem{proposition}[theorem]{Proposition} \newtheorem{corollary}[theorem]{Corollary} \newtheorem{example}[theorem]{Example} \newtheorem{conjecture}[theorem]{Conjecture} \newtheorem{definition}[theorem]{Definition} \newcommand{\Fempirical} {F_{\text{empirical}}} \newcommand{\Funiform} {F_{\text{uniform}}} \newcommand{\bfz} {\mathbf{z}} \newcommand{\ulmu} {\underline{\mu}} \newcommand{\ulz} {\underline{z}} \newcommand{\ulx} {\underline{x}} \newcommand{\ullambda} {\underline{\lambda}} \newcommand{\ulzero} {\underline{0}} \newcommand{\ulW} {\underline{W}} \newcommand{\ulp} {\underline{p}} \newcommand{\uld} {\underline{d}} \newcommand{\uldy} {\underline{dy}} \newcommand{\qmu} {q(\ulmu)} \newcommand{\LagrangianAll} {L(\ulz,\ulmu)} \newcommand{\Lagrangiani} {L_i(z_i,\ulmu)} \newcommand{\mL} {\mathcal{L}} \newcommand{\calYinterval}[1] {\mathcal{Y}_{#1}} \newcommand{\calYT}{\calYinterval{T}} \newcommand{\epochtime}[2] {\vec{#1}_{#2}} \newcommand{\integers}{\mathbb{Z}} \newcommand{\cndintfn}[1] {\lambda\parenth{#1|\History_{#1}}} \newcommand{\cndintfnsuccinct}[1] {\lambda(x_{#1})} \newcommand{\cndintfnSuccinct}[1] {\lambda(X_{#1})} \newcommand{\History} {\mathcal{H}} \newcommand{\kldist}[2] {D \parenth{#1\|#2}} \newcommand{\hq}[1]{\hat{q}_{#1}} \newcommand{\hqK}[1]{\hq{K_{#1}}} \newcommand{\hqKj} {\hqK{j}} \newcommand{\densityISI}[1] {f_{\text{ISI}}\parenth{#1|\History_{#1}}} \newcommand{\hatcndintfnsuccinct}[1] {\hat{\lambda}\parenth{x_{#1}}} \newcommand{\hatcndintfn}[1] {\hat{\lambda}\parenth{#1|\History_{#1}}} %\renewcommand{\baselinestretch}{0.97} \onecolumn \title{A Computationally Efficient Method for Nonparametric Modeling of Neural Spiking Activity with Point Processes} \twoauthors{Todd P. Coleman}{Sridevi V. Sarma} \twoaffiliations{University of Illinois, Urbana-Champaign}{Massachusetts Institute of Technology} \acknowledgements{Support of this research is through AFOSR Complex Networks Program, award no. FA9550-08-1-0079 to T.P.C. and Burroughs Wellcome Fund and L'Oreal USA FWIS to S.V.S.} \rightheader{Nonparametric Modeling of Point Processes} % defaults to title \shorttitle{Nonparametric Modeling of Point Processes} % defaults to title \leftheader{T.P.C. and S.V.S.} % defaults to author %\begin{small} %\author{Todd P. Coleman \thanks{T. P. Coleman (colemant@uiuc.edu) is affiliated with the ECE Department and Neuroscience Program at the University of Illinois. He would like to acknowledge the support of this research through AFOSR Complex Networks Program, award no. FA9550-08-1-0079.}%~\IEEEmembership{Member,~IEEE,} % % and Sridevi V. Sarma \thanks{S. Sarma (sree@mit.edu) is affiliated with the Department of Brain and Cognitive Sciences at MIT. She would like to acknowledge Burroughs Wellcome Fund and L'Oreal USA FWIS.}%~\IEEEmembership{Member,~IEEE} % } %\address {University of Illinois Urbana Champaign, Massachusetts Institute of Technology\\ %\ttfamily{colemant@uiuc.edu, sree@mit.edu} } %\end{small} \begin{document} \maketitle \begin{abstract} Point process models have been shown to be useful in characterizing neural spiking activity as a function of extrinsic and intrinsic factors. Most point process models of neural activity are parametric as they are often efficiently computable. However, if the actual point process does not lie in the assumed parametric class of functions, misleading inferences can arise. Nonparametric methods are attractive due to fewer assumptions, but computation in general, grows with the size of the data. We propose a computationally efficient method for nonparametric maximum likelihood estimation when the conditional intensity function, which characterizes the point process in its entirety, is assumed to be a Lipschitz continuous function but otherwise arbitrary. We show that by exploiting much structure, the problem becomes efficiently solvable. We next demonstrate a model selection procedure to estimate the Lipshitz parameter from data, akin to the minimum description length principle and demonstrate consistency of our estimator under appropriate assumptions. Lastly, we illustrate the effectiveness of our method with simulated neural spiking data, goldfish retinal ganglion neural data, and activity recorded in CA1 hippocampal neurons from an awake behaving rat. For the simulated dataset, our method uncovers a more compact representation of the conditional intensity function when it exists. For the goldfish and rat neural data sets, we show that our nonparametric method gives a superior absolute goodness-of-fit measure used for point processes than the most common parametric and semi-parametric approaches. \end{abstract} %\begin{keywords} %Point Processes, Maximum Likelihood Estimation, Nonparametric Models, Optimization %\end{keywords} % Title. \section{Introduction} Characterizing neural spiking activity as a function of environmental stimuli, and intrinsic effects such as a neuron's own spiking history and concurrent ensemble activity is important in neuroscience. Such a characterization is complex and there is increasing need for a broad class of models to capture such details. Point process models have been shown to be useful in characterizing neural spiking activity. For example, they have successfully characterized stimulus-response relationships computed from data recorded in hippocampal cells of a rat~\cite{BarbieriQuirkFrankWilsonBrown01}, retinal ganglion cells in salamander, rabbit, and cat, as well as from lateral geniculate nucleus neurons in cat~\cite{KeatReinagelReidMeister01}. These models also captured the dynamics of supplementary eye field cells of a macaque monkey~\cite{KassVentura01}, the influence of one neuron's spiking activity on another's in sea hare (Aplysia californica)~\cite{Brillinger88}, and ensemble activity in hippocampal cells of rats~\cite{HarrisCsicsvariHiraseDragoiBuzsaki03}. Most point process models based on likelihood analysis are parametric~\cite{Brillinger88,ChornoboySchrammKarr88,BarbieriQuirkFrankWilsonBrown01,BrownBarbieriEdenFrank03,Paninsky04,TruccoloEdenFellowsDonoghueBrown05}. That is, the conditional intensity function is assumed to belong to a class of parametric functions. Parametric models have several advantages. First, they are often efficiently computable. Second, the parameters may be related back to physiological and environmental factors. Finally, they have nice asymptotic properties when the actual distribution lies in the assumed parametric class. However, if the true conditional intensity function does not lie in the assumed class, large errors may occur resulting in misleading inferences. Previous models that relax the apriori assumption of the CIF belonging to functional classes of either polynomials or piece-wise polynomials (splines) including cardinal cubic splines~\cite{FrankEdenSoloWilsonBrown02}, Zernicke polynomials~\cite{BarbieriQuirkFrankWilsonBrown02}, smoothing splines~\cite{KassVentura01}, and Bayesian free-knot spline models~\cite{DiMatteoGenoveseKass01}. These semi-parametric models may provide better alternatives to purely parametric models in that they are broader classes of functions, but they are often computationally expensive and/or may have low goodness-of-fit performance. In contrast, there are very few computable nonparametric methods for estimating the conditional intensity function of a point process model. Furthermore, even if a method is computable, it is very difficult to show nice asymptotic properties of the estimate. Yet, nonparametric estimation methods have the advantage of not assuming that the conditional intensity function lies in some known class of functions. Recently, Truccolo and Donoghue introduced a computationally efficient nonparametric approach to model neural point processes via stochastic gradient boosting regression~\cite{TruccoloDonoghue07}. This approach estimates a discretized CIF to maximize the likelihood function in a computationally efficient manner which is based on a statistical learning approach. Given finite data samples, they exploit an iterative estimation procedure which introduces a parametric model in each iteration. We propose an alternative computationally efficient method to build nonparametric point process models, which only involves computing a convex optimization problem subject to linear constraints. We show that point process estimation is a simple application of a nonparametric nonlinear regression problem~\cite{BertsimasTsitsiklis97}. The only restriction on the conditional intensity function is that its logarithm must be Lipschitz continuous. We demonstrate a model selection procedure to estimate the Lipshitz parameter from data, akin to the minimum description length principle by performing a penalized likelihood minimization procedure using the $\epsilon$-entropy of Lipschitz functions of parameter $K$ \cite{kolmogorov1959e} and the index of resolvability \cite{barron1991minimum}. We illustrate the effectiveness of our method with simulated neural spiking data, goldfish retinal ganglion neural data, and activity recorded in CA1 hippocampal neurons from an awake behaving rat. For the simulated dat set, we showed that our method can uncover a more compact representation of the conditional intensity function when it exists. For the goldfish and rat neural data sets, we showed that our nonparametric method gives a superior absolute goodness-of-fit measure than all parametric approaches analyzed. \section{Point Process Models} Consider the time interval $(0,T]$ as %the continuum, the time window for which our neural spike train is observed. Define $\calYT$ to be the set of functions $y: (0,T] \to \integers_+$ that are non-decreasing, right-continuous, and $y_0=0$. In other words, $\calYT$ is the set of point processes on $(0,T]$. Denote the $i$th epoch of a realization of sample path of point process $y \in \calYT$ as \begin{eqnarray} \epochtime{y}{i} \triangleq \min \braces{t \in [0,T]: y_t \geq i }\;\;\; i \in \{1, \ldots, y_T\}. \label{eqn:epochtime} \end{eqnarray} Succinctly, we can represent a point process as a sample path $y \in \calYT$ where the jumps in $y$, given by $\epochtime{y}{1}, \epochtime{y}{2}, \ldots$, correspond to the times at which spikes occur. Throughout this paper, upper case letters correspond to random variables/processes, where as lower case variables correspond to constants/sample paths realization. \begin{itemize} \item It is well known that the conditional intensity function (CIF) completely characterizes the statistical structure of most point processes used in statistical inference of neural data \cite{BrownBarbieriEdenFrank03}. The CIF is defined as follows \cite{DaleyVereJones03}: \begin{eqnarray} \cndintfn{t} &\triangleq& \lim_{\Delta \to 0} \frac{ \prob{Y_{t+\Delta}-Y_t=1 | \History_t}}{\Delta} \label{eqn:cndintensityfn} %\Leftrightarrow \prob{N(t,t+\Delta)=1 | X_t} &\simeq& \cndintfn{t} \Delta, \end{eqnarray} where $\History_t$ is the $\sigma$-algebra generated by $(Y_\tau: \tau \in [0,t])$ as well as other possible exogenous processes that impact the propensity for a neuron to spike. Succinctly, the conditional intensity specifies the instantaneous probability of spiking per unit time, given previous neural spiking and exogenous inputs. \item The history-dependent inter-spike interval (ISI) density at time $t$, $\densityISI{t}$, and CIF are in one-to-one correspondence: \begin{eqnarray} \densityISI{t} &=& \cndintfn{t} \exp \braces{ -\int_{\epochtime{y}{y_t}}^{t} \cndintfn{u} du} \label{eqn:densityISIfromCIF} \\ \cndintfn{t} &=& \frac{\densityISI{t}}{1 -\int_{\epochtime{y}{y_t}}^t \densityISI{u} du}. \label{eqn:p2lam} \end{eqnarray} where from \eqref{eqn:epochtime}, note that $\epochtime{y}{y_t}$ is time backward from $t$ of the most recent spike. \item Throughout this paper, we will assume that $\History_t$ is generated by a random variable $X_t$ on $\mathbb{R}^m$, and we will use the notation $\cndintfnSuccinct{t}$ to explicitly represent \eqref{eqn:cndintensityfn} in this manner. Many neuroscience point process models (for example, renewal process models and generalized linear models \cite{BrownBarbieriVenturaKassFrank02}) implicitly use this assumption, where $X_t$ can pertain to recent spiking history as well as exogenous processes. Examples of how $X_t$ is interpreted will appear in the experimental results section. \item For a sample path $y \in \calYT$ of a point process $Y$ with conditional intensity function $\cndintfn{t}=\cndintfnSuccinct{t}$ and spike times $\braces{\epochtime{y}{1},\epochtime{y}{2}, \ldots, \epochtime{y}{Y_T}}$, the likelihood or density of $y$ given $x$ is given by \cite{BrownBarbieriEdenFrank03} \begin{align} f_{Y|X}(y|x;\lambda) &= \exp\braces{ \int_{0}^{T} \log \cndintfnsuccinct{t} dy_t -\cndintfnsuccinct{t} dt} \label{eqn:ppLikelihood}. \end{align} By discretizing $(0,T]$ into $n=T/\Delta$ intervals of length $\Delta \ll 1$ so that $dy = (dy_1, \ldots, dy_n)$ with \begin{eqnarray*} dy_i &\triangleq& y_{(i+1)\Delta}-y_{i\Delta} \in \{0,1\}, \end{eqnarray*} we can approximate \eqref{eqn:ppLikelihood} by \begin{eqnarray} - \log f_{Y|X}(y|x;\lambda) &\simeq& \sum_{i=1}^{n} -\log \cndintfnsuccinct{i}dy_i + \cndintfnsuccinct{i} \Delta. \label{eqn:dtppLikelihood} \end{eqnarray} where the discrete time index $i$ corresponds to the continuous interval $(0,T]$ at time $i\Delta$. \item Common renewal process (point processes in which ISIs are independent and identically distributed) models in neuroscience are given by the {\it exponential, gamma,} and {\it inverse Gaussian} \cite{Brown05}. The {\it generalized linear model} (GLM), has recently been used to model conditional intensity functions of point processes. In its simplest form, the GLM represents the conditional intensity function parametrically as follows: \[\log \cndintfn{t}=\log \cndintfnSuccinct{t} = \sum_{i} \alpha_i g^{ext}_i(X_t) + \sum_{j} \beta_j g^{ext}_j(X_t),\] where $g^{ext}_j(x)$ and $g^{int}_i(x)$ are known basis functions that capture the extrinsic and intrinsic effects (carried in $X_t \in \reals^m$) on the probability of spiking at time $t$, respectively. The extrinsic and intrinsic associated parameters to be estimated are $\alpha_i, \beta_j \in \reals,$ respectively. \item The {\it time-rescaling theorem} \cite{lenglart1977transformation,DaleyVereJones03} provides a change of measure for arbitrary point processes via their sample paths and CIF, resulting in a unit-rate Poisson process: \begin{theorem} Given a point process $Y \in \calYT$, with associated conditional intensity function $\cndintfn{t}$, the random variables \begin{eqnarray} Z_k &=& \int_{\epochtime{Y}{k-1}}^{\epochtime{Y}{k}} \cndintfn{t} dt \;\;\; k=1, \ldots, Y_T,\label{eqn:d} \end{eqnarray} are independent, identically distributed (i.i.d.) exponential random variables of unit rate. \end{theorem} \item Goodness of fit-tests with point processes are widely performed \cite{BrownBarbieriVenturaKassFrank02} by estimating the conditional intensity, $\hatcndintfn{t}$, performing the time-rescaling \eqref{eqn:d} on the inter-spike intervals to construct the sequence $\{Z_k\}$, constructing an ideally uniform $[0,1]$ sequence $\{U_k=1 -\exp(Z_k)\}$, and performing a Kolmogorov-Smirnov (KS) test. The KS statistic is given \begin{eqnarray} \max_{t \in [0,1]} \abs{\Fempirical(t) - \Funiform(t)}. \label{eqn:KSstatdefn} \end{eqnarray} where ($\Fempirical(u)$) is given by the time-rescaling theorem applied to $\hatcndintfn{t}$, and $\Fempirical(t)$ is the cumulative distribution function (CDF) for a uniform-$[0,1]$ random variable. We use the KS statistic to test absolute goodness of fit for all point process models computed in our examples. It is important to note that the KS plots were generated on test data so that the models were all cross-validated. \end{itemize} \section{The Dynamic Maximum Likelihood Estimation (DMLE) Problem for Point Processes} We now consider a nonparametric estimation procedure where make the following assumption: \begin{eqnarray} \log \cndintfn{t} = z(X_t) \label{eqn:loglambdaAssumption:HistoryFinite} \end{eqnarray} where $X_t \in \reals^m$ encodes all the relevant information about $t$, and $z: \reals^m \to \reals$ is an unknown function only assumed to be Lipschitz continuous with parameter $K$. Given this assumption, we discuss a convex optimization procedure to estimate the structure of $z$. Recall that a function $z: \reals^m \to \reals$ is {\it Lipschitz continuous} with parameter $K \geq 0$ if \begin{eqnarray} \abs{z(x_1) - z(x_2)} \leq K \norm{x_1-x_2} \label{eqn:LipschitzCtsDefn} \end{eqnarray} holds for all $x_1, x_2 \in \reals^m$. \footnote{We note that our procedure allows for {\bf any} norm, and all discussions of complexity and interpolation still hold. Unless mentioned otherwise, we will assume the $\norm{a}=\infnorm{a}=\sup_{i=1,...m} |a_i|$}. This approach is inspired by the methodology of~\cite{BertsimasGamarnikTsitsiklis99}. In this point process setting, we exploit the fact that the point process likelihood is a {\it convex} functional of the conditional intensity, which leads to a computationally efficient Dynamic Maximum Likelihood Estimation (DMLE) problem \cite[Sec. 3]{BertsimasGamarnikTsitsiklis99}. Specifically, we introduce decision variables $\ullambda= (\lambda_1,\ldots,\lambda_n)$, express the negative log-likelihood of the data by way of \eqref{eqn:dtppLikelihood}, and minimize it subject to the Lipschitz continuity constraint \eqref{eqn:LipschitzCtsDefn} at each pair of data points: \begin{subequations} \begin{align} \min_{\ullambda} &\ \sum_{i=1}^{n} -\log \lambda_i dy_i + \lambda_i \Delta \\ s.t. \;\;\;& \abs{\log \lambda_{j_1} - \log \lambda_{j_2}} \leq K \norm {X_{j_1}-X_{j_2}}, \ \ \ j_1, j_2 = 1,\ldots,n. \end{align} \label{eqn:DMLE} \end{subequations} By letting $z_i = \log \lambda_i$, and $\lipschitzconstraint=K \norm {X_{j_1}-X_{j_2}}$, it follows that \eqref{eqn:DMLE} becomes: \begin{subequations} \begin{align} \min_{\ulz} \;\;\;& \sum_{i=1}^{n} -z_i dy_i + \Delta \exp(z_i) \label{eqn:convexOpt:a}\\ s.t. \;\;\;& z_{j_1} - z_{j_2} \leq \lipschitzconstraint\ \ \ j_1, j_2 = 1,\ldots,n. \label{eqn:convexOpt:b} \end{align} \label{eqn:convexOpt} \end{subequations} Note that \eqref{eqn:convexOpt} corresponds to minimizing a convex function over a convex set and thus the problem is, in principle, efficiently solvable. %\paragraph{Interpolation} This procedure, for other likelihoods that are convex functionals of the dynamic function, was shown in \cite{BertsimasGamarnikTsitsiklis99} to have nice properties, including consistency. Another nice property of such a procedure, is that the optimal solution, $(\hat{z}_1, \ldots, \hat{z}_n)$, can be used to interpolate a Lipschitz-continuous function at {\it any} point $x \in \reals^m$: \begin{eqnarray} \hat{z}(x) &\triangleq& \max_{1 \leq i \leq n} \braces{\hat{z}_i-K \norm{x-X_i}}, \label{eqn:interpolationProcedure} \end{eqnarray} where the following properties hold: \begin{itemize} \item $\hat{z}(x_i) = \hat{z}_i$ \item $\hat{z}(x)$ is a Lipschitz continuous function with parameter $K$ \end{itemize} The proof details follow directly from \cite[Proposition 1]{BertsimasGamarnikTsitsiklis99}. \section{Computational Reduction of the DMLE} Although our DMLE~\eqref{eqn:DMLE} is a convex optimization problem, it corresponds to minimizing a cost function with $n$ unknowns and $O(n^2)$ constraints. Typically, neural data involves sample sizes on the order of $10^6$ rendering a need for a computationally efficient way to compute~\eqref{eqn:convexOpt}. In this section, we make a series of observations that lead a computationally efficient algorithm for solving ~\eqref{eqn:convexOpt}. The observations are outlined as follows: \begin{enumerate} \item [a)] We uncover the separable \cite[Sec 5.1.6]{Bertsekas99} structure of \eqref{eqn:convexOpt} and look at the corresponding dual problem, which corresponds to minimizing an unconstrained function of $O(n^2)$ variables. \item [b)] We reduce the complexity of the dual problem by deriving an equivalent non-differentiable convex optimization problem of $n$ variables and $O(n)$. This can be solved by employing well-known subgradient methods \cite[Sec 6.3.1]{Bertsekas99}, and the calculation of each subgradient corresponds to solving a ({\it particularly efficiently solvable}) network flow problem \cite[Chapter 7]{BertsimasTsitsiklis97}. \end{enumerate} The approaches (a) and (b) mentioned above significantly reduce complexity and always converge to the optimal solution. \subsection{Dual Problem Formulation} We now introduce penalties $\mu_{j_1,j_2}$ for each constraint in \eqref{eqn:convexOpt:b} and formulate the Lagrangian: \begin{eqnarray*} \LagrangianAll &=& \sum_{i=1}^{n} f_i(z_i) + \sum_{j_1=1}^{n} \sum_{j_2=1}^{n} \mu_{j_1,j_2} \parenth{z_{j_1}-z_{j_2}-C_{j_1,j_2}}, \\ \end{eqnarray*} where $f_i(z_i)=-z_i dy_i + \Delta \exp(z_i).$ With some rearranging, it follows that \begin{subequations} \begin{align} \LagrangianAll &= \sum_{i=1}^n \Lagrangiani \\ \Lagrangiani &\triangleq -z_i dy_i + \Delta\exp(z_i) + \brackets{\sum_{j_1=1}^{n}\mu_{j_1,i} \parenth{-z_i-\half C_{j_1,i}} + \sum_{j_2=1}^n \mu_{i,j_2} \parenth{z_i-\half C_{i,j_2}} } \end{align} \label{eqn:LagrangianDefn} \end{subequations} From this, we define the dual function $\qmu$ as \begin{eqnarray} \qmu = \min_{\ulz} \LagrangianAll \label{eqn:dualfunctionDefn} \end{eqnarray} For any $\ulmu \geq 0,$ it is straightforward to see that $\qmu$ serves as a {\it lower bound} to the optimal cost in \eqref{eqn:convexOpt}. To make this lower bound as tight as possible, maximizing over all $\ulmu \geq 0$ gives the {\it dual problem}: \begin{subequations} \begin{eqnarray} \max \;\;\; && \qmu \\ s.t.\;\;\; && \ulmu \geq \ulzero. \end{eqnarray} \label{eqn:dualproblem:qfn} \end{subequations} Since the original problem is a convex optimization problem, \begin{itemize} \item the dual problem \eqref{eqn:dualproblem:qfn} is also a convex optimization problem. \item the optimal cost of the dual problem \eqref{eqn:dualproblem:qfn} is equal to the optimal cost to the primal problem \eqref{eqn:convexOpt}. \end{itemize} This follows from standard convex optimization theory \cite{Bertsekas99}. We now note that by separability in \eqref{eqn:LagrangianDefn}, \begin{eqnarray} \qmu &=& \sum_{i=1}^n q_i(\ulmu) \\ q_i(\ulmu) &\triangleq& \min_{z_i} \Lagrangiani \end{eqnarray} Now by defining \begin{eqnarray} W_i(\ulmu) \triangleq dy_i + \sum_{j_1} \mu_{j_1,i} -\sum_{j_2} \mu_{i,j_2} \label{eqn:Wdefn}, \end{eqnarray} it follows that we can characterize $q_i(\ulmu)$: \[ 0 = \partialderiv{\Lagrangiani}{z_i} = -W_i(\ulmu) + \Delta\exp(z_i) \] and so we have $z_i^* = -\log \Delta + \log W_i(\ulmu).$ Note that $W_i(\ulmu) \geq 0$ for all $i,\ulmu$. We now simplify the expression for $\qmu = \sum_{i=1}^n q_i(\ulmu)$: \begin{itemize} \item By \eqref{eqn:Wdefn} it follows that \[ \sum_{i=1}^n W_i(\ulmu) = \sum_{i=1}^n dy_i \] \item The appearance of the term $-\half \lipschitzconstraint \mu_{j_1,j_2}$ in $\qmu$ appears twice, once for $q_{j_1}(\ulmu)$ and once for $q_{j_2}(\ulmu)$. \end{itemize} So it follows that \begin{eqnarray*} \qmu &=& \brackets{\sum_{i=1}^n (1+\log \Delta) dy_i -W_i(\ulmu) \log W_i(\ulmu)} - \brackets{\sum_{j_1=1}^n \sum_{j_2=1}^n \lipschitzconstraint \mu_{j_1,j_2}}, \end{eqnarray*} and thus performing our nonparametric regression is equivalent to the following dual problem:\\ \fbox{\parbox{\columnwidth}{ \textbf{DUAL PROBLEM:} \begin{eqnarray*} \max && \qmu = \brackets{\sum_{i=1}^n (1+\log \Delta) dy_i -W_i(\ulmu) \log W_i(\ulmu)} - \brackets{\sum_{j_1=1}^n \sum_{j_2=1}^n \lipschitzconstraint \mu_{j_1,j_2}} \\ s.t. && \ulmu \geq \ulzero, \\ W_i(\ulmu) &\triangleq& dy_i + \sum_{j_1=1}^n \mu_{j_1,i} - \sum_{j_2=1}^n \mu_{i,j_2} \\ \lipschitzconstraint &\triangleq& K \norm {X_{j_1}-X_{j_2}}. \end{eqnarray*} The optimal solution is given in closed form simply as \begin{eqnarray*} z_i^* &=& -\log \Delta + \log W_i(\ulmu^*) \\ \Rightarrow \hat{\lambda}_i^* &=& \frac{1}{\Delta} W_i(\ulmu^*) \end{eqnarray*} }} \vspace{0.15in} \subsection{Complexity Reduction of an Equivalent Dual Problem} Note that the dual problem above has $O(n^2)$ unknowns. As such, this is also computationally infeasible for moderate data sizes. We now show that the dual can be significantly simplified. Consider, for a fixed $W \geq 0$, the set \[ S(\ulW) = \braces{\ulmu: \ulmu \geq \ulzero, \;\; W_i -dy_i = \sum_{j_1=1}^n \mu_{j_1,i} - \sum_{j_2=1}^n \mu_{i,j_2},\;\;\; i=1, \ldots, n } \] where $\sum_i W_i = \sum_i dy_i$. This can be described succinctly by saying that \begin{eqnarray*} A\ulmu &=& b(\ulW,\uldy), \\ \ulmu &\geq& \ulzero \end{eqnarray*} where $A$ is a matrix consisting of $0$, $1$ and $-1$ and entries, with $n$ rows and $n(n-1)$ columns; and $\underline{b}=b(\ulW,\uldy)=\ulW-\uldy$ is the column constraint matrix. Note that \begin{eqnarray} \sum_{i=1}^n b_i = \sum_{i=1}^n (W_i-dy_i) = (\sum_{i=1}^n W_i)-(\sum_{i=1}^n dy_i) = 0. \label{eqn:sumconstraintsZero} \end{eqnarray} We are fixing the values of $W$ and would like to vary over the remaining free parameters $\ulmu$, consistent with this fixed $W$, to optimize $\qmu$. Note that maximizing $\qmu$ subject to this fixed value of $W$ is equivalent to performing the following linear program (LP): \begin{subequations} \begin{eqnarray} \min & & \sum_{j_1=1}^n \sum_{j_2=1}^n \lipschitzconstraint \mu_{j_1,j_2} \label{eqn:equivalentLP:a}\\ s.t. & & A \ulmu = b(\ulW,\uldy) \label{eqn:equivalentLP:b}\\ & & \ulmu \geq 0 \label{eqn:equivalentLP:c} \end{eqnarray} \label{eqn:equivalentLP} \end{subequations} This LP is in standard form and thus there must exist a basic feasible solution that is optimal \cite[Sec. 2.6]{BertsimasTsitsiklis97}. Also note that any column of $A$ has exactly two nonzero values, with one being $1$ and the other being $-1$. Combining this observation with \eqref{eqn:sumconstraintsZero}, it follows that this is a network flow problem \cite[Chapter 7]{BertsimasTsitsiklis97}. We have a graph $G=(V,E)$ with a set $V$ of vertices, ranging from $1$ to $n$ where each vertex corresponds to a millisecond observation. The edge set $E$ corresponds to directed edges $(j_1,j_2)$ in the graph, and each edge $(j_1,j_2)$ has an associated cost $\lipschitzconstraint$. This is exactly the $\lipschitzconstraint$ corresponding to the Lipschitz constraint. Note that the graph is fully connected, because we want the Lipschitz constraints to be satisfied for all possible pairs. To enforce absolute value constraints, what this means is that the directed edges $(j_1,j_2)$ and $(j_2,j_1)$ are both in the edge set $E$ and they both have the same cost $\lipschitzconstraint$. There is a flow $b_i=W_i-dy_i$ that is supplied to the network at node $i$. As an example, suppose we were looking at problem where $n=3$ and thus \begin{center} $\ulmu = \left [ \begin{array}{cccccc} \mu_{12} & \mu_{13} & \mu_{21} & \mu_{23} & \mu_{31} & \mu_{32} \end{array} \right ]^T$ \end{center} has $n(n-1)=6$ components. Then $A$ will take the form of { \setlength\arraycolsep{0.3em} \begin{eqnarray*} %A=\brackets{\begin{array}{llllllllllll} A=\brackets{\begin{array}{cccccc} -1 & -1 & 1 & 0 & 1 & 0 \\ 1 & 0 & -1 & -1 & 0 & 1 \\ 0 & 1 & 0 & 1 & -1 & -1 \\ \end{array}} \end{eqnarray*}} By virtue of the dual, this can also be expressed as \begin{subequations} \begin{eqnarray} \max & & \sum_{i}^n p_i (W_i-dy_i) \label{eqn:equivalentLPdual:a}\\ s.t. & & p_{j_2} - p_{j_1} \leq C_{j_1,j_2}, \;\;j_1=1,\ldots,n, \;\;\;j_2=1,\ldots,n,\ \ j_1 \neq j_2. \label{eqn:equivalentLPdual:b} \end{eqnarray} \label{eqn:equivalentLPdual} \end{subequations} In general, network flow problems are efficiently solvable and the network simplex method, dual ascent methods, and likewise allow for extremely efficient solutions~\cite{PapadimitriouSteiglitz}. They are often orders of magnitudes faster than implementations of the standard LP simplex method. Thus we now have the equivalent problem to our dual: \fbox{\parbox{\columnwidth}{ \textbf{Computationally Efficient Dual Problem:} \begin{eqnarray*} \min & & \tilde{q}(\ulW) = \sum_{i=1}^n W_i \log W_i + R(\ulW)\\ s.t. & & \ulW \geq 0, \\ & & \sum_{i=1}^n W_i = \sum_{i=1}^n dy_i \\ R(\ulW) &\triangleq& \max_{\ulp} \sum_{i}^n p_i (W_i-dy_i) \\ s.t. & & p_{j_2} - p_{j_1} \leq \lipschitzconstraint, \;\;j_1,j_2=1,\ldots,n, \end{eqnarray*} The optimal solution is given in closed form simply as \begin{eqnarray*} z_i^* &=& -\log \Delta + \log W_i^* \\ \Rightarrow \hat{\lambda}_i^* &=& \frac{1}{\Delta} W_i^* \end{eqnarray*} \vspace{0.1in} }} Note that by exploiting this structure, we have eliminated the $O(n^2)$ $\ulmu$ variables and thus the complexity of the problem has been diminished vastly. Namely, the original problem involved $O(n^2)$ constraints (which for $30$s=$30,000$ms of retinal data cannot even be filled into a Matlab matrix without memory overflow errors), and the dual problem involved involved on the order of $O(n^2)$ variables that suffer the same Matlab fate. By reducing our problem as described above, there are only $n$ variables. Every call to the dual function involves the computation of $R(\ulW)$, which can be done with any special-purpose network flow algorithm. In the Appendix, we describe how one efficiently searches to optimize across $W$. \section{An Information-Theoretic Procedure to Select the Lipschitz Constant from Data} For our nonparametric model specified by \eqref{eqn:loglambdaAssumption:HistoryFinite} it is unclear a priori what the dimension $m$ is and likewise for the Lipschitz constant $K$, and thus they must be learned in a principled way from the data. Selecting the optimal integer $m$ is a well-established problem and can be performed via the use of standard minimum-description length, by penalizing the negative log likelihood by an amount $\frac{m \log n}{2}$ for a consistent estimate \cite{BarronRissanenYu98}. As for optimally selecting a Lipschitz constant, we know that for a sequence of increasing constants $\braces{K_j}$, by virtue of \eqref{eqn:convexOpt}, the feasible sets are nested increasingly in $j$. Our approach is to embed the problem of selecting $\hat{K}=K_{\hat{j}}$ directly into the optimization problem in the spirit of Minimum Description Length \cite{BarronRissanenYu98}, by selecting the candidate conditional intensity function that minimizes the total length in bits required to describe the data \cite{barron1991minimum,BarronRissanenYu98}: \begin{itemize} \item Define the Lipschitz family of functions on $[0,1]$ of parameter $K$ as \[ \mL(K) \triangleq \braces{f: [0,1] \to \mathbb{R}: |f(a)-f(b)| \leq K |a-b|}.\] \item The $\epsilon$-entropy $H_\epsilon(\mathcal{S})$ of a set $\mathcal{S}$ of functions is the logarithm of the smallest set $\mathcal{S}'$ of functions s.t. for any $f \in S$, there exists an $f' \in \mathcal{S}'$ s.t. $\infnorm{f-f'} < \epsilon$. It is known \cite{kolmogorov1959e} that for sufficiently small $\epsilon$: \begin{eqnarray} H_\epsilon(\mL(K)) = \frac{K}{\epsilon} +\log \parenth{\frac{1}{\epsilon}}. \label{eqn:epsilonEntropyLipschitz} \end{eqnarray} For any $\epsilon > 0$, we consider a finite set $\Gamma_K(\epsilon) \subset \mL(K)$, where $\log \abs{\Gamma_K(\epsilon)}=H_\epsilon(\mL(K))$. \item We consider an increasing list $(K_j: j=1,2,\ldots)$ of Lipschitz constants, and consider an associated nested list $(\Gamma_{K_j}(\epsilon): j=1,2,\ldots)$ of Lipschitz functions: \begin{eqnarray} \Gamma(\epsilon) = \bigcup_{j=1}^\infty \Gamma_{K_j}(\epsilon). \label{eqn:gammaEpsilon} \end{eqnarray} %For each $K_j$, we consider $\hat{q}_j$ to be the optimal solution to \eqref{eqn:DMLE} with $K_j$. \item For any $p \in \cup_{K > 0} \mL(K)$, and neural spike train $y \in \calYT$ drawn according to \eqref{eqn:ppLikelihood} with $\log \lambda=p$, we define the length $R_n(p)$ as the minimum number of bits per input symbol required to represent $y$ with a prefix-free code \cite{CoverThomas06}. \item Define $R_n(p,\epsilon)$ as the minimum number of bits per symbol required to describe $y$ with a prefix-free source code using a description $q \in \Gamma(\epsilon)$. Note that this can be done without loss of generality in three stages: first specifying $j$ with a universal prefix-free code for the set of integers (which requires $O(\log j)$ bits \cite{rissanen1983universal}); second specifying the index of $q$ in $\Gamma_{K_j}$ (which requires $H_\epsilon(\mL(K_j))$ bits), and lastly encoding the data $y \in \calYT$ with an optimal prefix-free code - which requires the Shannon code length \cite{CoverThomas06} of $-\log f_{Y|X}(y|x;q)$ bits: \begin{eqnarray} R_n(p,\epsilon) &=& \min_{j \in \integers_+} \min_{q \in \Gamma_{K_j}(\epsilon)} \frac{\log j}{n} + \frac{1}{n} H_\epsilon(\mL(K_j)) -\frac{1}{n} \log f_{Y|X}(y|x;q) \label{eqn:Rnpepsilon:a} \\ &\simeq& \min_{j \in \integers_+} \min_{q \in \Gamma_{K_j}(\epsilon)} \frac{1}{n} H_\epsilon(\mL(K_j)) -\frac{1}{n} \log f_{Y|X}(y|x;p) + \kldist{p}{q}\label{eqn:Rnpepsilon:b} \end{eqnarray} where \eqref{eqn:Rnpepsilon:b} follows for large $n$ and from the Kullback-Leibler divergence interpretation \cite{CoverThomas06} of the excess code length for optimally source coding a source with distribution $p$ under the assumption it arose from $q$: $\kldist{p}{q} \triangleq E_p \braces{\log \frac{f_{Y|X}(y|x;p)}{f_{Y|X}(y|x;q)}}$. \item Note that for a fixed $j$ in \eqref{eqn:Rnpepsilon:b} and for small $\epsilon$, the inner minimization will result in the maximum likelihood solution $\hqKj$ from \eqref{eqn:DMLE} for a fixed $K_j$. Thus we have: \begin{eqnarray} R_n(p,\epsilon) &\simeq& \min_{j \in \integers_+} \frac{1}{n} H_\epsilon(\mL(K_j)) -\frac{1}{n} \log f_{Y|X}(y|x;\hqKj) + \kldist{p}{\hqKj}\label{eqn:Rnpepsilon:c} \end{eqnarray} \item It is well known \cite[eqn 6.23]{barron1991minimum} that for small $\epsilon$, $p \in \mL(K)$, and $q \in \Gamma_K(\epsilon)$: $\kldist{p}{q} \simeq 1/2 \epsilon^2$. So we can optimize how $\epsilon$ varies with $n$ and $K_j$ in $R_n(p,\epsilon)$ to tradeoff the increase of $H_\epsilon(\mL(K_j))$ with the decrease of $\kldist{p}{q}$ as $\epsilon \downarrow 0$. By standard calculus, for large $n$, it follows that $\epsilon^*(n,K_j) \simeq (K_nn)^{-\frac{1}{3}}$. As a consequence, we arrive at: \begin{eqnarray} % R_n(p) &\triangleq& \min_{j} -\frac{1}{n}\log f_{Y|X}\parenth{y|x;\hqKj} + {K_j}^{\frac{4}{3}} n^{-\frac{2}{3}} \label{eqn:MDLselection:a}\\ \hat{j} &=& \arg \min_{j \in \integers_+} -\frac{1}{n}\log f_{Y|X}\parenth{y|x;\hqKj} + {K_j}^{\frac{4}{3}} n^{-\frac{2}{3}} \label{eqn:MDLselection:b}\\ \hat{q} &=& \hqK{\hat{j}} \label{eqn:MDLselection:c} \end{eqnarray} Under statistical assumptions on the almost-sure convergence of $-\frac{1}{n}\log f_{Y|X}\parenth{Y|X;p}$ along with $p \in \Gamma$, it is known that this procedure is consistent: $\hat{q} \to p$ a.s. \cite{barron1991minimum}. Moreover, under stronger statistical assumptions on $-\frac{1}{n}\log f_{Y|X}\parenth{Y|X;p}$, the index of resolvability \cite[eqn 5.3]{barron1991minimum} provides a bound on the convergence rate of this estimation procedure: \[ \kldist{p}{\hat{q}} \leq O \parenth{n^{-\frac{2}{3}}}.\] \end{itemize} \section{Experimental Results} \subsection{Computing Uncertainty} \label{sec:computingUnvertainty} To compute confidence bounds for our estimate of the conditional intensity function, we applied an algorithm based on the time-rescaling theorem for point processes which is described in~\cite{SS08}. This bootstrap method is applicable to any point process model (parametric or nonparametric). Consider a given estimate of a conditional intensity function, $\hatcndintfn{t}$, for all $ t \in (0,T].$ To compute confidence bounds for our estimate $\hatcndintfn{t}$, for all $ t \in (0,T],$ we apply the following bootstrap algorithm, where the $j$th bootstrap sample of the statistic for $j=1,\ldots,B$ is computed as follows: \newcommand{\Ukj}{U_k^{(j)}} \newcommand{\taukj}{\tau_k^{(j)}} \newcommand{\hcndintfnjt}{\hat{\lambda}^{(j)}(t|\History_t)} %{\bf Sri: What is the function `$\theta$'? Is it the identity function? } \begin{enumerate} \item Draw an i.i.d. sequence $\braces{\Ukj}$, of uniformly distributed on $[0,1)$ random variables, for $k \in \integers_+$ and $j=1, \ldots,B$. \item Define the i.i.d. sequence $\braces{\tau_k^{(j)}}$ of unit-rate exponentially distributed random variables according to $\tau_k^{(j)} = -\log\parenth{\Ukj}$. \item Find $t_k^{(j)}$ as the solution to \[ \tau_k^{(j)} = \int_{t_{k-1}^{(j)}}^{t_{k}^{(j)}} \lambda(u|H_u) du \] for $k=1,2,3,…$ ($t_0^{(j)}=0$) and stop when $t_k^{(j)}>T$. Denote the final $k$ value as $n_j$ for bootstrap sample $j$ such that $t_{n_j} \leq T \leq t_{n_{j+1}}$. \item From the bootstrap data $t_k^{(j)}$ estimate $\cndintfn{t}$ (for example, using the DMLE procedure) to produce $\hcndintfnjt.$ %Note that $t_k^{(j)} = t_1^{(j)},\ldots,t_n^{(j)}$. \end{enumerate} Once all $B$ bootstrapped spike train samples are generated along with their corresponding CIF estimates, we compute the 95% confidence bounds by ordering the bootstrapped CIFs, $\hcndintfnjt,$ for each time $t$ and then taking the 5th quantile as the lower bound and the 95th quantile as the upper bound. An example of this is provided in Figure~\ref{fig:bootstrappedConfidenceIntervals}. \begin{figure}[hbtp] \centering \includegraphics[width=0.75\columnwidth]{bootstrappedSinPlot} \caption{Bootstrapped confidence intervals for the simulated sinusoidal dataset.} \label{fig:bootstrappedConfidenceIntervals} \end{figure} \subsection{Simulated Dataset} % possibly change to 15s here???? To test the effectiveness of our approach on a controlled dataset we, constructed $1.5s$ of data. The conditional intensity function of the spike train was constructed as follows: \begin{eqnarray} X_i &\in& [-0.8,-0.2]\\ \log \cndintfn{i\Delta} \simeq \log \lambda(X_i) &=& \theta_0 + \frac{K}{\omega}\sin \parenth{\omega X_i} \label{eqn:CIFModelSimulatedDataset} \end{eqnarray} where $\{X_i: 1 \leq i \leq 1500\}$ are uniformly spaced across $[-0.8,0.2]$; $K=10$; $\theta_0 = 4.7$; $\omega = 6\pi$; and $\Delta = 1ms$. Note that in \eqref{eqn:CIFModelSimulatedDataset}, $\log \lambda \triangleq p \in \mL(10)$. Using a list of increasing candidate $K$ values, starting $0$ with increments of $0.5$, we performed \eqref{eqn:MDLselection:c} with each individual $\hqKj$ solved using the DMLE approach \eqref{eqn:DMLE}. Figure~\ref{fig:simulatedDataSet:a}(top) illustrates the penalized normalized negative log likelihoods pertaining to \eqref{eqn:MDLselection:b} for various values $K_j$. Note that with this finite $n$, the optimal $K_j$ is $9.5$ when the true value is $10$. %In order to cross-validate for goodness-of-fit, we took a remaining different block of $15s$ of data and used the estimated conditional intensity $\hqKj$, and performed time-rescaling and KS plot generation, with $\Funiform(t)$ versus $\Fempirical(t)$. The subsequent KS plot, which lies within the 95\% confidence bounds, is shown in ~\ref{fig:simulatedDataSet:a}(R). We also compare our estimate $\hq{9.5}$ of the CIF to the actual $p=\log \lambda$ in Figure~\ref{fig:simulatedDataSet:a}(bottom) and note that our nonparametric estimate, which has no prior assumption on the shape of $p$, is able to uncover the sinusoid waveform. \begin{figure}[hbtp] \centering \includegraphics[width=0.75\columnwidth]{penalizedNNLLSinusoud070809}\\ \includegraphics[width=0.75\columnwidth]{KSplotSinusoidTestSet} \caption{Penalized normalized negative log likelihood across candidate $K_j$ values for the simulated sinusoid dataset. The true value of $K$ is $10$. $\hat{K}$ is $9.5$. (Top) %(R) KS plot on cross-validated data using the optimal estimated conditional intensity for the simulated sinusoid dataset. The estimated function $\hq{9.5}$ computed from the DMLE, interpolated at any value of $x$, compared with the true function $\lambda \in \mL(10)$. (Bottom). } \label{fig:simulatedDataSet:a} \end{figure} \subsection{Goldfish Retinal Ganglion Data}\label{sec:applications:GoldfishRetinalGanglionData} We study a spike train data series from a goldfish retinal ganglion cell neuron recorded in vitro. The retinae were removed from the goldfish and maintained in a flow of moist oxygen and recordings were made with an extracellular microelectrode under constant illumination (constant stimulus). The data are 975 spikes recorded over 30 seconds from neuron 78 in~\cite{IyengarLiao97}. Figure~\ref{fig:Retinal:a}(top) illustrates the histogram of ISIs in this data. \begin{figure}[htbp] \centering \includegraphics[width=0.75\columnwidth]{ISI_data}\\ \includegraphics[width=0.75\columnwidth]{penalizedNNLLRetinal070809} \caption{Histogram of ISIs for the retinal dataset (Top). Penalized normalized negative log likelihood across candidate $K_j$ values for the retinal ganglion dataset. $\hat{K}=4.5$ (Bottom).} %The history-dependent ISI density using the optimal nonparametric estimated conditional intensity function.} \label{fig:Retinal:a} \end{figure} The nonparametric estimator treats $x$ as follows: each $x_i \in \{0,1\}^{10}$ is modeled to capture history with the interpretation that \begin{eqnarray} x_{i,j} &=& dy_{i-j} \;\;\; j \in \{1, \ldots 10\} \label{eqn:historyDefnRetinalISIs} \end{eqnarray} with $\Delta=1$ms. Using a list of possible $K$ values beginning at $0$ with increments of $0.5$, \eqref{eqn:MDLselection:c} was performed, with each individual $\hqKj$ solved using the DMLE approach \eqref{eqn:DMLE}. Figure~\ref{fig:Retinal:a}(bottom) shows the penalized normalized negative log likelihood values across $K_j$ and the optimal $\hat{K}=4.5$. The subsequent history-dependent inter-spike interval density, from $\hqKj$, is given in Figure~\ref{fig:Retinal:b}(top). With this approach, we can account for bursting phenomena empirically with our approach via estimation of the conditional intensity and application of \eqref{eqn:densityISIfromCIF}. Note that the ISI density has its maximum delayed, to account for refractoriness. Also, a second local peak at $10$ms appears to be reminiscent of bursting. For the purpose of comparison, parametric models were developed with $X_i \in \reals$ is to the time (in ms) since the last spike. The parametric models built in comparison are enumerated below. \begin{enumerate} \item A renewal process model with $\densityISI{t}$ pertaining to an exponential distribution on the inter-spike intervals. \item A renewal process model with $\densityISI{t}$ pertaining to a Gamma distribution on the inter-spike intervals \item The conditional intensity pertains to a generalized linear model: \[ \cndintfn{i\Delta}= \exp\braces{ \sum_{j=1}^{10} \alpha_j \sum_{k = l(j,1)}^{l(j,2)} dy_{i-k}} \] with $l= [1, 3; 4, 6; 7, 8; 9, 10; 11, 20; 21, 30; 31, 40; 41, 50; 51, 60; 61, 70; 71, 80; 81, 90; 91, 100; 101, 110; 111, 120]$.\\ %{\bf to here}\\ The optimal order of the GLM model was estimated using the minimum description length penalized negative log likelihood procedure, where a penalty $\frac{1}{2}\log{d}{n}$ was added for a model of order $d$. \end{enumerate} In order to assess goodness-of-fit and cross-validation, each of the parametric models, along with $\hqKj$, were optimally estimated using maximum likelihood on $15s$ of data and then cross-validated on an adjacent non-overlapping $15s$ of data. For the GLM model, the minimum description length model selection approach was used to estimate the optimal order of $16$. Using the time-rescaling theorem and Kolmogorov-Smirnov plots with $95\%$ confidence intervals, subsequent KS plots on the time-rescaled inter-spike intervals on the cross-validated dataset are shown in Figure~\ref{fig:Retinal:b}(bottom). Note that the non-parametric model has the best KS plot across all cross-validated parametric models, including the GLM model. \begin{figure}[hbtp] \centering \includegraphics[width=0.75\columnwidth]{nonparam_isi_model_overlayed_isi_histogram}\\ \scalebox{0.3}{\includegraphics{cross_validated_ksplot_retinal_10000msec}} %\scalebox{1.5}{\includegraphics[width=0.5\columnwidth]{cross_validated_ksplot_retinal_10000msec}} \caption{History-dependent inter-spike interval density estimated from the optimal non-parametric model, with $\hat{K}=4.5$ (top). KS plot on cross-validated data using the optimal estimated conditional intensity for the nonparametric model as well as optimal estimates from parametric models (bottom). } \label{fig:Retinal:b} \end{figure} \subsection{Rat CA1 Hippocampal Data} We apply our nonparametric method to a spike train recorded from a CA1 hippocampal neuron in a rat foraging in an open circular environment. Both place cell microelectrode array recordings and position data were captured from a Long-Evans rat freely foraging in a circular environment 70 cm in diameter with walls 30 cm high and a fixed visual cue. The data we used are 137 spikes recorded over the first 3 seconds from one CA1 neuron. Details of the experiment can be found in~\cite{BarbieriFrankNguyenQuirkSoloWilsonBrown04}. Figure~\ref{fig:rat_pos_data} illustrates the position trajectory of the rat and the spiking histogram relative to the position of the rat over the first 20 seconds. \begin{figure}[htbp] \centering \includegraphics[width=0.75\columnwidth]{somat_rat_trajectory}\\ \includegraphics[width=0.75\columnwidth]{somat_spikinghistogram} \caption{Rat trajectory over 20 seconds (top), Spiking histogram of single place cell over 20 seconds (bottom) } \label{fig:rat_pos_data} \end{figure} Barbieri et al. built a semi-parametric point process model on the same data set using spatial Zernike polynomials via maximum likelihood estimation~\cite{BarbieriFrankNguyenQuirkSoloWilsonBrown04}. Specifically, the conditional intensity function took on the form \[ \log \lambda_z(x_t) =\sum_{l=0}^{L} \sum_{m=-l}^{l} \alpha_{l,m} Z_{l,m}(\rho(t),\phi(t)) ,\] \noindent where: \begin{itemize} \item $\rho(t) = r^{-\half} [(x(t) - \eta_1)^2 + (y(t) - \eta_2)^2]^{\half}$, $\phi(t) = \tan^{-1} [(x(t) - \eta_1)(y(t) - \eta_2)^{-1}]$, $(\eta_1,\eta_2)=(35\text{cm},35\text{cm})$ is the center of the circular environment, and $r=35$cm is the radius of the circular environment. \item $Z_{l,m}$ is the $m$th component of the $l$th order Zernike polynomial, \begin{eqnarray*} Z_{l,m}(\rho(t),\phi(t)) &=& \left \{ \begin{array}{cc} R_{l,m}(\rho(t)) \sin(m\phi(t)) & m >0 \\ R_{l,m}(\rho(t)) \cos(m\phi(t)) & m < 0 \end{array} \right . \\ R_{l,m}(\rho(t)) &=& \left \{ \begin{array}{cc} \sum_{j=0}^{(l-|m|)/2} \frac{(-1)^j(l-j)!}{j!(\frac{l+m}{2} -j)!(\frac{l-m}{2} -j)!} & (l-m) \ \ even \\ 0 & o.w. \end{array} \right . , \end{eqnarray*} where $0 \leq |m| \leq l$. \end{itemize} More details are found in~\cite{BarbieriFrankNguyenQuirkSoloWilsonBrown04}. We compare our nonparametric method to estimate the CIF as a function of the 2-dimensional position vector $x_t$ and compare to the semi-parametric approach using Zernike model described above. An optimal number of Zernike basis functions was selected using the minimum description length procedure by penalizing the negative log likelihood with $d \log(n)$ for Zernike model with $d$ parameters. The optimal Zernike model which we use for comparison had $4$ basis functions. We used the nonparametric procedure \eqref{eqn:MDLselection:c} with $x_i \in \reals^2$ corresponding to the position within a circle of radius 2 centered at $(0,0)$. Figure~\ref{fig:Zernike:a} plots the penalized normalized negative log likelihood as a function of the Lipschitz candidate constants $K_j$. Note that the optimal nonparametric model is for $K=5$. \begin{figure}[tp] \centering \includegraphics[width=0.75\columnwidth]{penalizedNNLLSomata070809}\\ \includegraphics[width=0.75\columnwidth]{ksplots_zernike} \caption{Penalized normalized negative log likelihood across candidate $K_j$ values for the hippocampal place cell dataset (top). KS plots of optimal nonparametric model and optimal Zernike model on a cross-validated set of 20s of position and spiking (bottom) } \label{fig:Zernike:a} \end{figure} Figure~\ref{fig:Zernike:b} (top) shows the contour plots of each CIF estimate and their corresponding KS plots. \begin{figure}[tp] \centering \includegraphics[width=0.75\columnwidth]{cif_surface_zernike4bases}\\ \includegraphics[width=0.75\columnwidth]{cif_surface_nonparam_model} \caption{CIF Contour for Optimal Zernike Model learned from $4$ bases functions (top). CIF Contour for Nonparametric Model (bottom).} \label{fig:Zernike:b} \end{figure} Note that the nonparametric method appears to provide a model of spiking more visually consistent with the spiking histogram than that of the Zernike approach. This is also manifested quantitatively in a better KS plot for the nonparametric method in Figure~\ref{fig:Zernike:a}. \section*{Acknowledgements} The authors would like to thank Angelia Nedich and Stephen Boyd for thoughtful recommendations on implementation of the subgradient method and cutting plane algorithms, Maxim Raginsky and Munther Dahleh for useful comments on nonparametric statistical learning, and Antonio Frangioni for help with the MCFZIB library for implementing the network flow solution solver in C++. \bibliography{ColemanSarmaNeCoJuly09References} \section{Appendix} \subsection{Searching over $\ulW$} \label{app:W} We now discuss some properties of this problem that allow us to only compute a sequence of network flow problems to attain a solution. %\subsection{Nondifferentiable Optimization Method} By virtue of the dual problem, we have that we would like to perform \begin{subequations} \label{eqn:dualNonDifferentiableProblem} \begin{eqnarray} \max && q_K(\ulW) = \sum_{i=1}^n (1+\log \Delta) dy_i -W_i \log W_i - R_K(\ulW) \label{eqn:dualNonDifferentiableProblem:a} \\ s.t. && \ulW \in \simplex \end{eqnarray} \end{subequations} where $\mathcal{S}$ is the simplex: \[ \simplex = \braces{\ulW: \ulW \geq \ulzero, \sum_{i=1}^n W_i = \sum_{j=1}^n dy_j}\] and $R_K(\ulW)$ is the optimal solution to the network flow problem (in dual form) \begin{subequations} \label{eqn:dualNetworkFlowProblem} \begin{eqnarray} \max && \ulp'(\ulW-\uldy) \\ s.t. && \ulp \in \dualNetFloConstraintSet{K} \end{eqnarray} \end{subequations} where \begin{eqnarray} \dualNetFloConstraintSet{K} = \braces{\ulp: p_i - p_j \leq K \abs{X_i-X_j}, \; \forall\; i,j}. \label{eqn:dualNetFloConstraintSet} \end{eqnarray} Note that: \begin{lemma} \label{lemma:RkLinearInK} For any $K \geq 0$ and any $\ulW \in \simplex$, by virtue of \eqref{eqn:dualNetFloConstraintSet}, $R_K(\ulW) = K R_1(\ulW)$ and if $\ulp^*_1$ is an optimal dual solution associated with $R_1(\ulW)$, then $K \ulp^*_1$ is an optimal dual solution associated with $R_K(\ulW)$. \end{lemma} Note that the dual problem, specified as \eqref{eqn:dualNonDifferentiableProblem}, is convex but is \textbf{nondifferentiable} because of the piecewise linearity of $R_K(\ulW)$ in $W$. We now consider using a subgradient method to solve this. \begin{definition}[Subgradient] given a convex function $f$ we say that $\uld$ is a subgradient of $f$ at $\ulx$ if \[ f(\ulz) \geq f(\ulx) + (\ulz-\ulx)'\uld\] for all z. If f is concave, we say that $\uld$ is a subgradient of $f$ at $\ulx$ if $-\uld$ is a subgradient of the convex function $-f$ at $\ulx$. \end{definition} Note that any dual optimal solution $p^K(\ulW)$ to $R_K(\ulW)$ is a subgradient to $R_K(\ulW)$ and thus, $\underline{g} = -[1 + \log \ulW - p^K(\ulW)]$ is a subgradient to $q_K(\ulW)$. The subgradient method generates a sequence of dual feasible points according to the iteration \[ \ulW^{j+1} = [\ulW^j + s^j \underline{g}^j]_+ \] where $g^j$ is the subgradient and $[\cdot]_+$ denotes projection onto the simplex $\simplex$. \paragraph{Projections onto convex sets} \begin{lemma}%[Projection onto $\simplex$] For any $\ulW_0 \in \reals^n$, the projection of $\ulW_0$ onto $\simplex$, given by \[ [\ulW_0]_+=\arg \min_{\ulW \in \simplex} \norm{\ulW-\ulW_0}^2, \] is given by \[ W_i^* = (W_{0,i}-\nu)^+\] where $\nu$ is chosen to satisfy \cite[Sec. 2.1]{Bertsekas99} $\sum_{i} W_i^* = \sum_j dy_j$. \end{lemma} {\bf Proof} The KKT optimality conditions for convex optimization over a simplex (see Bertsekas Example 2.1.2) are that \[ W_i^* >0 \Rightarrow \frac{\partial f(W^*)}{\partial W_i} \leq \frac{f(W^*)}{\partial W_j} \;\; \forall j\] which equivalently means that all coordinates that are positive at the optimum must have minimal (and equal) partial cost derivatives. In our case, \begin{eqnarray*} f(\ulW) = \sum_{i=1}^n (W_i-W_{0,i})^2 \Rightarrow \frac{\partial f(\ulW)}{\partial W_i} = 2(W_i-W_{0,i}). \end{eqnarray*} By defining $\sumdy = \sum_j dy_j$, it follows that $W_i^*$ must satisfy \begin{subequations} \begin{eqnarray} W_i^* &=& (W_{0,i}-\nu)^+ \label{eqn:optimalityCondsProjSimplex:a}\\ \frac{\partial f(W^*)}{\partial W_i} = 2(W_i^*-W_{i,0})&=& -2\nu, \; \forall\;\;i \;s.t.\; W^*_i > 0 \label{eqn:optimalityCondsProjSimplex:b}\\ \nu &\geq& \max_{i: W_i^*=0} W_{0,i} \label{eqn:optimalityCondsProjSimplex:c}\\ \nu & & \textrm{satisfies }\sum_{i=1}^n W_i^* = \sumdy \label{eqn:optimalityCondsProjSimplex:d} \end{eqnarray} \end{subequations} where the maximum of the empty set is $-\infty$. We can set up an algorithm to do this as follows: \begin{itemize} \item[1] If $\sum_i W_{0,i}^+ = \sumdy$ then set $\nu=0$ and go to 4. \item[2] Define $\ulW_{s}$ to be $\ulW_0$ sorted in descending order so that $W_{s,1} \geq W_{s,2} \geq \ldots \geq W_{s,n} \geq W_{s,n+1}\triangleq-\infty$. Set $k=n$. \item[3] Define $\nu_k$ as \begin{eqnarray} \nu_k = \frac{1}{k} \braces{ \parenth{\sum_{i=1}^{k} W_{s,i}}- \sumdy}. \label{eqn:projsimplexalg:a} \end{eqnarray} If \begin{eqnarray} W_{s,k+1} \leq \nu_k \leq W_{s,k} \label{eqn:projsimplexalg:b} \end{eqnarray} then $\nu=\nu_k$; else set $k=k-1$ and go to 3. \item[4] Set $W^*_i = (W_{0,i}-\nu)^+$. \end{itemize} \textbf{Proof of Optimality of the Algorithm} \begin{itemize} \item [a] \eqref{eqn:optimalityCondsProjSimplex:a} is satisfied because of step 4 in the algorithm. \item [b] \eqref{eqn:optimalityCondsProjSimplex:b} is satisfied because of the second inequality in \eqref{eqn:projsimplexalg:b} and the definition of $W^*_i$ in step 4. \item [c] \eqref{eqn:optimalityCondsProjSimplex:c} is satisfied because of the first inequality in \eqref{eqn:projsimplexalg:b} and the definition of $W^*_i$ in step 4. \item [d] \eqref{eqn:optimalityCondsProjSimplex:d} is satisfied because of \eqref{eqn:projsimplexalg:a}, \eqref{eqn:projsimplexalg:b}, and the definition of $W^*_i$ in step 4. \end{itemize} Lastly, this approach can be also sped up by using a standard cutting plane algorithm \cite[Sec 6.3]{BertsimasTsitsiklis97} to adaptively add constraints in the network flow problem, as required. A certificate of feasibility routine was implemented in C++ to interface with matlab to identify a small set of violated constraints. Adaptively adding more constraints, and stopping once a certificate of feasibility is generated, usually requires significantly fewer constraints and is guaranteed to ultimately return the optimal solution. \newpage \end{document}