\documentclass[12pt,twoside]{scrartcl}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Meta information:
\newcommand{\trauthor}{Jim Martens}
\newcommand{\trtype}{Seminar Paper} %{Seminararbeit} %{Proseminararbeit}
\newcommand{\trcourse}{Neural Networks}
\newcommand{\trtitle}{Catastrophic Forgetting and Neuromodulation}
\newcommand{\trmatrikelnummer}{6420323}
\newcommand{\tremail}{2martens@informatik.uni-hamburg.de}
\newcommand{\trarbeitsbereich}{Knowledge Technology, WTM}
\newcommand{\trdate}{09.07.2018}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Languages:

% If the paper is written in German:
% \usepackage[german]{babel}
% \usepackage[T1]{fontenc}
% \usepackage[latin1]{inputenc}
% \usepackage[latin9]{inputenc}
% \selectlanguage{german}

% If the thesis is written in English:
\usepackage[english]{babel}
\selectlanguage{english}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bind packages:
\usepackage[utf8]{inputenc} % Unicode works on Windows, Linux and Mac
\usepackage[T1]{fontenc}
\usepackage{acronym} % Acronyms
\usepackage{algorithmic} % Algorithms and Pseudocode
\usepackage{algorithm} % Algorithms and Pseudocode
\usepackage{amsfonts} % AMS Math Packet (Fonts)
\usepackage{amsmath} % AMS Math Packet
\usepackage{amssymb} % Additional mathematical symbols
\usepackage{amsthm}
\usepackage{booktabs} % Nicer tables
%\usepackage[font=small,labelfont=bf]{caption} % Numbered captions for figures
\usepackage{color} % Enables defining of colors via \definecolor
\definecolor{uhhRed}{RGB}{254,0,0} % Official Uni Hamburg Red
\definecolor{uhhGrey}{RGB}{122,122,120} % Official Uni Hamburg Grey
\usepackage{fancybox} % Frame equations
%\usepackage{fancyhdr} % Package for nicer headers
\usepackage[automark]{scrlayer-scrpage}
\usepackage[hidelinks]{hyperref}\urlstyle{rm}
%\usepackage{fancyheadings} % Nicer numbering of headlines

%\usepackage[outer=3.35cm]{geometry} % Type area (size, margins...) !!!Release version
%\usepackage[outer=2.5cm]{geometry} % Type area (size, margins...) !!!Print version
%\usepackage{geometry} % Type area (size, margins...) !!!Proofread version
\usepackage[outer=3.15cm]{geometry} % Type area (size, margins...) !!!Draft version
\geometry{a4paper,body={5.8in,9in}}

\usepackage{graphicx} % Inclusion of graphics
%\usepackage{latexsym} % Special symbols
\usepackage{longtable} % Allow tables over several pages
\usepackage{listings} % Nicer source code listings
\usepackage{multicol} % Content of a table over several columns
\usepackage{multirow} % Content of a table over several rows
\usepackage{rotating} % Allows to rotate text and objects
\usepackage[hang]{subfigure} % Allows to use multiple (partial) figures in a fig
%\usepackage[font=footnotesize,labelfont=rm]{subfig} % Pictures in a floating environment
\usepackage{tabularx} % Tables with fixed width but variable rows
\usepackage{url,xspace,boxedminipage} % Accurate display of URLs

\usepackage{csquotes}
\usepackage[
backend=biber,
bibstyle=ieee,
citestyle=ieee,
minnames=1,
maxnames=2
]{biblatex}

\addbibresource{bib.bib}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Configuration:

\hyphenation{whe-ther} % Manually use: "\-" in a word: Staats\-ver\-trag

%\lstloadlanguages{C} % Set the default language for listings
\DeclareGraphicsExtensions{.pdf,.svg,.jpg,.png,.eps} % first try pdf, then svg, jpg, png and eps
\graphicspath{{./src/}} % Path to a folder where all pictures are located
%\pagestyle{fancy} % Use nicer header and footer
\pagestyle{scrheadings}

% Redefine the environments for floating objects:
\setcounter{topnumber}{3}
\setcounter{bottomnumber}{2}
\setcounter{totalnumber}{4}
\renewcommand{\topfraction}{0.9} %Standard: 0.7
\renewcommand{\bottomfraction}{0.5} %Standard: 0.3
\renewcommand{\textfraction}{0.1} %Standard: 0.2
\renewcommand{\floatpagefraction}{0.8} %Standard: 0.5

% Tables with a nicer padding:
\renewcommand{\arraystretch}{1.2}
\MakeOuterQuote{"}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Additional 'theorem' and 'definition' blocks:
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
%\newtheorem{theorem}{Satz}[section] % If written in German.
\newtheorem{axiom}{Axiom}[section]
%\newtheorem{axiom}{Fakt}[chapter] % If written in German.
%Usage:%\begin{axiom}[optional description]%Main part%\end{axiom}

\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]

%Additional types of axioms:
\newtheorem{lemma}[axiom]{Lemma}
\newtheorem{observation}[axiom]{Observation}

%Additional types of definitions:
\theoremstyle{remark}
%\newtheorem{remark}[definition]{Bemerkung} % If written in German.
\newtheorem{remark}[definition]{Remark}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Provides TODOs within the margin:
\newcommand{\TODO}[1]{\marginpar{\emph{\small{{\bf TODO: } #1}}}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Abbreviations and mathematical symbols
\newcommand{\modd}{\text{ mod }}
\newcommand{\RS}{\mathbb{R}}
\newcommand{\NS}{\mathbb{N}}
\newcommand{\ZS}{\mathbb{Z}}
\newcommand{\dnormal}{\mathit{N}}
\newcommand{\duniform}{\mathit{U}}

\newcommand{\erdos}{Erd\H{o}s}
\newcommand{\renyi}{-R\'{e}nyi}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Document:
\begin{document}
\renewcommand{\headheight}{14.5pt}

%\fancyhead{}
%\fancyhead[LE]{ \slshape \trauthor}
%\fancyhead[LO]{}
%\fancyhead[RE]{}
%\fancyhead[RO]{ \slshape \trtitle}
\lehead{\slshape \trauthor}
\rohead{\slshape \trtitle}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Cover Header:
\begin{titlepage}
\begin{flushleft}
Universit\"at Hamburg\\
Department Informatik\\
\trarbeitsbereich\\
\end{flushleft}
\vspace{3.5cm}
\begin{center}
\huge \trtitle\\
\end{center}
\vspace{3.5cm}
\begin{center}
\normalsize\trtype\\[0.2cm]
\Large\trcourse\\[1.5cm]
\Large \trauthor\\[0.2cm]
\normalsize Matr.Nr. \trmatrikelnummer\\[0.2cm]
\normalsize\tremail\\[1.5cm]
\Large \trdate
\end{center}
\vfill
\end{titlepage}

%backside of cover sheet is empty!
\thispagestyle{empty}
\hspace{1cm}
\newpage

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Abstract:

% Abstract gives a brief summary of the main points of a paper:
\section*{Abstract}
Catastrophic forgetting is a major problem for neural networks, in particular
for autonomous systems. This paper showcases three approaches that use
diffusion-based neuromodulation and compares them with respect to catastrophic
forgetting. The comparison shows that modulated random search is not useful
for combating catastrophic forgetting, whereas the modulated Gaussian walk
performs significantly better and is likely useful for single-task settings or
settings with combined feedback. Localized learning overcomes catastrophic
forgetting in a very bespoke setup and, more generally, could be useful for
situations with combined tasks and distinct feedback for each task.

% Lists:
\setcounter{tocdepth}{2} % depth of the table of contents (for seminars 2 is recommended)
\tableofcontents
\pagenumbering{arabic}
\clearpage

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Content:

% the actual content, usually separated over a number of sections
% each section is assigned a label, in order to be able to put a
% crossreference to it

\section{Introduction}
\label{sec:introduction}

Autonomous robots need to adapt to new situations. They need to learn
throughout their entire lifetime. To do so, they require a second
environmental feedback loop that tells them when to
learn~\cite{Toutounji2016}.

Learning itself is also described as plasticity. In the context of this paper,
the definition of synaptic plasticity given by Citri~\cite{Citri2008} is used.
In short, the process of learning itself, i.e.\ changing the weights, is
already considered plasticity. This can occur throughout the lifetime of a
network or during the training phase of networks that use, for example,
supervised learning and backpropagation.

When a network has to adapt to new situations, it has to learn new tasks.
Usually the previously learned weights are then largely forgotten. This
phenomenon is called catastrophic forgetting~\cite{French1999,McCloskey1989}.
It is highly problematic, because the weights encode what a network has
learned. If they are forgotten, or rather overwritten, the previously learned
tasks cannot be fulfilled anymore.

Since catastrophic forgetting is a key problem for autonomous learning, it is
crucial to overcome it. In this paper I present several approaches for
learning in an autonomous setup and analyse which of them, if any, can
overcome catastrophic forgetting.

\section{Catastrophic Forgetting}
\label{sec:catastrophicforgetting}

French~\cite{French1999} reviewed the existing research on catastrophic
forgetting. The following paragraphs follow this review and highlight the
major developments in research related to catastrophic forgetting.

McCloskey and Cohen~\cite{McCloskey1989} originally discovered the problem of
catastrophic forgetting, which they referred to as catastrophic interference.
This discovery of a fundamental limitation of the classic neural network was
as important as the work of Minsky and Papert~\cite{Minsky1969}, who had
described the limitations of the perceptron twenty years earlier. The key
finding of McCloskey and Cohen was that previously learned patterns were
completely forgotten after just a few training cycles on a new pattern. The
reason behind this behaviour was the real problem: they identified the single
set of shared weights as responsible for it.

A few minutes of thought show why this makes sense. The classic
backpropagation algorithm works by modifying the weights that have contributed
the most to a bad outcome. When the set of targets changes, the network will
perform badly on the new pattern. To rectify this, the backpropagation
algorithm changes many of the weights so that the network delivers a good
result for the new pattern. This, on the other hand, results in an
increasingly worse performance on previously learned patterns. If this
worsening were gradual, it would still be unfortunate but understandable. It
is called catastrophic because the performance change is not gradual but
abrupt. Even small changes in the weights can have a huge impact on the
output.
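
To make this concrete, consider the following minimal sketch (a hypothetical
illustration, not taken from the cited literature): a single set of shared
weights is trained on one pattern set and then on a second one, and the error
on the first set rises sharply once the second set is learned.

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)

# Two unrelated pattern sets; one shared weight matrix must serve both.
X_a, Y_a = rng.normal(size=(20, 8)), rng.normal(size=(20, 4))
X_b, Y_b = rng.normal(size=(20, 8)), rng.normal(size=(20, 4))
W = np.zeros((8, 4))

def train(X, Y, W, steps=200, lr=0.05):
    for _ in range(steps):
        W -= lr * X.T @ (X @ W - Y) / len(X)  # gradient of the MSE
    return W

def mse(X, Y, W):
    return float(np.mean((X @ W - Y) ** 2))

W = train(X_a, Y_a, W)
print("error on A after learning A:", mse(X_a, Y_a, W))
W = train(X_b, Y_b, W)  # learning B overwrites the shared weights
print("error on A after learning B:", mse(X_a, Y_a, W))
\end{lstlisting}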

In fact, catastrophic forgetting is only a very radical instance of a more
general problem for all models of memory, the so-called
"stability-plasticity" dilemma~\cite{Grossberg1982}. This problem raises the
question of how to design a system in such a way that it is both sensitive to
new input and not radically disrupted by it. In other words: how can it learn
new things without largely or completely forgetting already learned things?

Early attempts to alleviate or overcome catastrophic forgetting relied on
sparser representations. This means that not every weight is responsible for
all possible inputs. The downside is a worse ability to generalize to new
input and, overall, a worse ability to discriminate. In an extreme form this
can lead to catastrophic remembering~\cite{Sharkey1995}. The idea here is that
a network learns the function describing the inputs too well and therefore
loses its ability to differentiate between new and already learned input. This
can be understood well with the example given by French~\cite{French1999},
where a network has the task of reproducing its input at the output. It can
detect a new input if the output diverges by a large margin. It has learned
too well if it has learned the identity function: it can then reproduce any
input perfectly at the output and hence loses the ability to detect new input.

Significant improvements were made by rehearsing previously learned input.
Robins~\cite{Robins1995} found a way to rehearse prior input even when it is
no longer available and called the rehearsed items "pseudo-patterns". The idea
is that the weights of the trained network represent a function. A random
input and the corresponding predicted output together approximately describe
this function and form such a pattern. Robins interleaved many of them with
new input, and the results were promising, as the forgetting became more
gradual. This insight, together with the findings of
McClelland~\cite{McClelland1995}, resulted in the development of dual-network
models. In short, one network models the hippocampus and is able to quickly
learn new information without disrupting previously learned regularities. This
network then serves as a teacher for the second network, which models the
neocortex and is responsible for generalizing.
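
The pseudo-pattern idea can be sketched as follows; \texttt{predict} and
\texttt{train\_step} are hypothetical stand-ins for the trained network's
forward pass and for one training update, and the exact rehearsal schedule
used by Robins differs in detail.

\begin{lstlisting}[language=Python]
import numpy as np

def make_pseudo_patterns(predict, n_patterns, input_dim, rng):
    """Label random inputs with the network's own outputs, capturing
    the currently stored function as a set of rehearsable patterns."""
    X_pseudo = rng.uniform(-1.0, 1.0, size=(n_patterns, input_dim))
    return X_pseudo, predict(X_pseudo)

def train_with_rehearsal(train_step, predict, X_new, Y_new,
                         n_pseudo=100, epochs=50, seed=0):
    rng = np.random.default_rng(seed)
    X_p, Y_p = make_pseudo_patterns(predict, n_pseudo, X_new.shape[1], rng)
    X = np.concatenate([X_new, X_p])  # interleave old (pseudo) and new
    Y = np.concatenate([Y_new, Y_p])
    for _ in range(epochs):
        order = rng.permutation(len(X))
        train_step(X[order], Y[order])
\end{lstlisting}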

Between 1999 and 2018, more work was done on catastrophic forgetting. Among
the most recent contributions, three deserve mention: Kirkpatrick et
al.~\cite{Kirkpatrick2017}, who slow down the learning for weights important
to older tasks; Velez and Clune~\cite{Velez2017}, whose work is showcased
later; and Shmelkov et al.~\cite{Shmelkov2017}, who introduce a loss function
intended to keep catastrophic forgetting at bay.

\section{Plasticity}
\label{sec:plasticity}

Catastrophic forgetting requires learned weights that can be forgotten. Every
neural network learns and therefore deals with plasticity, given our
definition of it. In this section, three approaches to plasticity using
diffusion-based neuromodulation are presented in more detail. Modulated random
search and the modulated Gaussian walk use linearly-modulated neural networks;
they are taken from Toutounji and Pasemann~\cite{Toutounji2016}. The third
approach was introduced by Velez and Clune~\cite{Velez2017} and uses
diffusion-based neuromodulation for localized learning, hence the name of the
corresponding subsection.

\subsection{Modulated Random Search}
\label{subsec:mrs}

\subsubsection*{Modulated Neural Network}

Since both approaches from Toutounji and Pasemann use linearly-modulated
neural networks, the structure of these networks is described first.
Linearly-modulated neural networks (LMNNs) are a specific variant of modulated
neural networks (MNNs). Any artificial neural network (ANN), or simply neural
network in the context of computer science, can become a modulated neural
network by adding a neuromodulator layer. This neuromodulator layer is the
second environmental feedback loop mentioned earlier.

Toutounji and Pasemann describe a variant of this layer that uses
neuromodulator cells (NMCs). Each NMC produces a specific type of
neuromodulator (NM) and stores its own concentration level of it. The
network-wide concentration level at a certain point in space and time can be
obtained by summing up the concentration levels stored in all NMCs for that
point in space. Produced neuromodulators usually impact nearby network parts.
This type of spatial impact requires a spatial representation of the network
in which every network element has a location in space.
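
This bookkeeping can be sketched as follows; the exponential decay with
distance is an assumption of the sketch, since the exact spatial kernel is not
reproduced here.

\begin{lstlisting}[language=Python]
import math
from dataclasses import dataclass

@dataclass
class NMC:
    x: float                    # location in the network's space
    y: float
    nm_type: str                # neuromodulator type this cell produces
    stimulation: float = 0.0
    concentration: float = 0.0  # the cell's own stored level
    mode: str = "reduction"

def concentration_at(nmcs, nm_type, x, y, decay=1.0):
    """Network-wide concentration of one NM type at point (x, y):
    the sum of all matching cells' stored levels, attenuated with
    distance so that mainly nearby network parts are affected."""
    total = 0.0
    for cell in nmcs:
        if cell.nm_type == nm_type:
            dist = math.hypot(x - cell.x, y - cell.y)
            total += cell.concentration * math.exp(-decay * dist)
    return total
\end{lstlisting}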

An NMC has a production mode and a reduction mode. During production mode the
concentration of the neuromodulator can increase, and during reduction mode it
can decrease. A cell enters production mode if it has been stimulated for some
time, while it falls back to reduction mode when this stimulation is absent
for some time.

\subsubsection*{Linearly-Modulated Neural Network}

A linearly-modulated neural network uses discrete time and stimulates NMCs
with a simple linear model. Each NMC is connected to a carrier cell or neuron,
which itself is part of a modulatory subnetwork. The NMC is stimulated if the
output of the carrier neuron is within a specified range
(\(\text{S}^{\text{min}}\), \(\text{S}^{\text{max}}\)). In every time step it
is checked whether the output of the carrier cell falls within this range and
thus stimulates the NMC. If it does, the stimulation level of the NMC
increases; otherwise it decreases. Once the stimulation level reaches the
threshold \(\text{T}^{\text{prod}}\), the cell enters production mode. If it
falls below \(\text{T}^{\text{red}}\), the cell goes back into reduction mode.
Over time the neuromodulator diffuses into the surrounding control subnetwork,
where it initiates plasticity that depends on its concentration at the
respective synapse.
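
Extending the \texttt{NMC} sketch from above, one discrete time step of this
linear stimulation model could look as follows; the step sizes are placeholder
values and not taken from the paper.

\begin{lstlisting}[language=Python]
def nmc_step(nmc, carrier_output, s_min, s_max, t_prod, t_red,
             stim_step=0.01, nm_step=0.005):
    """One discrete time step of a linearly-modulated NMC (sketch)."""
    # Stimulation rises while the carrier neuron's output is in range.
    if s_min <= carrier_output <= s_max:
        nmc.stimulation += stim_step
    else:
        nmc.stimulation = max(0.0, nmc.stimulation - stim_step)
    # Mode switching via the two thresholds (t_red < t_prod).
    if nmc.stimulation >= t_prod:
        nmc.mode = "production"
    elif nmc.stimulation <= t_red:
        nmc.mode = "reduction"
    # Concentration can grow in production mode and shrink otherwise.
    if nmc.mode == "production":
        nmc.concentration += nm_step
    else:
        nmc.concentration = max(0.0, nmc.concentration - nm_step)
\end{lstlisting}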

\subsubsection*{Modulated Random Search}

\begin{table}
\begin{tabular}{l|l}
\textbf{Parameter} & \textbf{Description} \\
\hline
\(Type\) & The neuromodulator type the synapse is sensitive to \\
\(W\) & Weight change probability \\
\(D\) & Disable / enable probability \\
\(W^{min}, W^{max}\) & Minimum and maximum weight of the synapse \\
\(M\) & Maximum neuromodulator sensitivity limit of the synapse
\end{tabular}
\caption{Parameters stored for each synapse.
Replication of Table 1 in Toutounji and Pasemann~\cite{Toutounji2016}.}
\label{tab:mrs-synapse}
\end{table}

Modulated random search essentially means random weight changes. Each synapse
\(i\) stores a number of parameters (see Table~\ref{tab:mrs-synapse}). The
weight change probability \(p_i^w\) at time \(t\) is the product of the
intrinsic weight change probability \(W_i\) and the concentration
\(c(t, x_i, y_i)\) of the neuromodulator the synapse is sensitive to at its
location \((x_i, y_i)\). Additionally, the maximum neuromodulator sensitivity
\(M_i\) acts as a ceiling for the concentration factor of that product
\eqref{eq:weightchangeprob}. This means there is a maximum weight change
probability for each synapse. Weight changes can happen at any time step;
therefore the intrinsic weight change probability has to be very small. Should
a weight change occur, a new weight \(w_i\) is chosen randomly from the
interval \([W_i^{min}, W_i^{max}]\).

The weight change probability \(p_i^w\) tells the network when to learn and
leaves room for variation, as it is a probability rather than a binary
learn/do-not-learn decision. In this approach, this probability constitutes
the second environmental feedback loop.

\begin{equation}\label{eq:weightchangeprob}
p_i^w = \min(M_i, c(t, x_i, y_i)) \cdot W_i,\; 0 < W_i \lll 1
\end{equation}

Moreover, a synapse can disable or enable itself. The actual disable/enable
probability \(p_i^d\) is the product of the intrinsic value \(D_i\), stored as
a parameter, and the neuromodulator concentration \(c(t, x_i, y_i)\)
\eqref{eq:enableprob}. The concentration is again capped by the maximum
sensitivity limit \(M_i\) given as a parameter, so there is a maximum
disable/enable probability as well. The intrinsic enable/disable probability
must be smaller than the intrinsic weight change probability. A disabled
synapse is treated as having weight 0, but the actual value is stored so that
it can be restored when the synapse is enabled again.

\begin{equation}\label{eq:enableprob}
p_i^d = \min(M_i, c(t, x_i, y_i)) \cdot D_i,\; 0 \leq D_i < W_i
\end{equation}
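
Taken together, \eqref{eq:weightchangeprob} and \eqref{eq:enableprob} amount
to the following per-synapse update, sketched with a hypothetical synapse
object that holds the parameters of Table~\ref{tab:mrs-synapse}.

\begin{lstlisting}[language=Python]
import random

def mrs_step(syn, c):
    """One modulated-random-search step for synapse syn, where c is
    the neuromodulator concentration at the synapse's location."""
    p_w = min(syn.M, c) * syn.W  # weight change probability p_i^w
    p_d = min(syn.M, c) * syn.D  # disable/enable probability p_i^d
    if random.random() < p_w:
        syn.weight = random.uniform(syn.W_min, syn.W_max)
    if random.random() < p_d:
        # The stored weight survives disabling and is restored on enable.
        syn.enabled = not syn.enabled
\end{lstlisting}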

Given a so-called neural network structure or substrate, this makes it easier
to find different network topologies (structure and weights combined).

\subsection{Modulated Gaussian Walk}
\label{subsec:mgw}

The modulated Gaussian walk was also introduced by Toutounji and Pasemann. The
key differences start with the parameters: there is no maximum sensitivity for
the neuromodulator concentration. When a weight change occurs, the new weight
is not chosen randomly; instead, the difference added to the current weight is
sampled from a normal distribution with zero mean and variance \(\sigma^2\)
\eqref{eq:gausswalk}. The sampled value can be arbitrarily large, so the new
weight could lie outside of the given bounds. Therefore the value is resampled
until the sum of the current weight and the sampled value lies within the
interval \([W_i^{min}, W_i^{max}]\).

\begin{equation}\label{eq:gausswalk}
w_i (t + 1) = w_i (t) + \Delta w_i \;\text{where}\; \Delta w_i \sim \mathcal{N}(0, \sigma^2)
\end{equation}
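
In code, this bounded update is a simple rejection-sampling loop; the sketch
below is one possible implementation, not necessarily the one used by
Toutounji and Pasemann.

\begin{lstlisting}[language=Python]
import random

def mgw_weight_update(w, w_min, w_max, sigma):
    """Modulated-Gaussian-walk update: resample the increment until
    the new weight stays within [w_min, w_max]."""
    while True:
        delta = random.gauss(0.0, sigma)
        if w_min <= w + delta <= w_max:
            return w + delta
\end{lstlisting}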

Toutounji and Pasemann implemented a mechanism for disabling synapses in the
modulated Gaussian walk as well, but they did not make use of it later and
therefore did not describe how it works.

\subsection{Localized Learning}
\label{subsec:diffusion}

Velez and Clune use a small network to solve a foraging task. The network
represents an agent that has a lifetime of three years. Each year consists of
the two seasons summer and winter. During each season the agent is presented
with food items and has to either eat them or not. Half of the food is
nutritious and the other half poisonous. The target is a fitness value which
is highest if the agent eats all nutritious food and none of the poisonous
food. The associations of nutritious and poisonous differ between summer and
winter but remain the same within a season over the whole lifetime. Therefore,
a food item that is nutritious in summer will always be nutritious in summer.
This setup makes it easy to measure whether the agent is able to remember the
associations learned in previous seasons.

The initial weights of the network are derived from an evolutionary algorithm.
All later learning uses neuromodulation. The neurons of the network are
spatially located, and there are two sources of neuromodulators in the
network, one on either side. The sources are only active in their respective
season and encode whether the previously eaten food was nutritious (1) or
poisonous (-1). If they are not active, their value is zero. As soon as the
sources are activated, the neuromodulators fill a space within a radius of 1.5
units of distance from the source and potentially trigger weight changes of
neurons inside this radius. The strength of the neuromodulators decreases with
distance from the source. The sources are the second environmental feedback
loop in this example, as they tell the network, or a part of it, when to
learn.

How does the actual learning happen? The weight change between two neurons
depends on the activation of both neurons, the learning rate and the
concentration of neuromodulators \eqref{eq:hebbian}. In short, Hebbian
learning is employed.

\begin{equation}\label{eq:hebbian}
\Delta w_{ij} = \eta \cdot m_i \cdot a_i \cdot a_j
\end{equation}
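
A sketch of this update and of the seasonal gating is given below; the linear
decay of the modulatory signal within the 1.5-unit radius is an assumption of
the sketch.

\begin{lstlisting}[language=Python]
def modulation_at(source_value, dist, radius=1.5):
    """Concentration seen at distance dist from a seasonal source.
    Inactive sources emit 0, so synapses outside the radius of the
    currently active source are left untouched."""
    if dist >= radius:
        return 0.0
    return source_value * (1.0 - dist / radius)

def hebbian_update(w, eta, m, a_pre, a_post):
    """Neuromodulated Hebbian step: eta is the learning rate, m the
    local neuromodulator concentration, a_* the two activations."""
    return w + eta * m * a_pre * a_post
\end{lstlisting}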

This explanation should suffice for a general understanding of their method.
The neurons within the vicinity of one of these sources only update their
weights in one of the seasons. Therefore they only learn during one season and
are unaffected by the other. This results in localized learning.

\section{Comparison Regarding Catastrophic Forgetting}
\label{sec:comparison}

Given the presentation of the three approaches, it is interesting to compare
them with regard to their ability to mitigate or overcome catastrophic
forgetting. For both modulated random search and the modulated Gaussian walk,
this aspect was analyzed in the experiments conducted by Toutounji and
Pasemann~\cite{Toutounji2016}; therefore the results of their work are
utilized for this comparison. Velez and Clune~\cite{Velez2017} introduced the
presented approach of localized learning specifically to analyze its
capability of overcoming catastrophic forgetting, hence their results are used
as well.

The performance of modulated random search and modulated Gaussian walk was
tested in multiple experiments of increasing difficulty. The difficulty ranged
from a positive light-tropism task in the first experiment (E1), over an
obstacle-avoidance task in the second experiment (E2) and a combination of E1
and E2 in the third experiment (E3), to a more difficult variant of E3 in the
fourth experiment (E4). The fifth experiment (E5) was a pendulum experiment.
In each experiment a robot had to learn the task from scratch. A pre-designed
LMNN was given in each case and defined the boundaries within which the
learning took place. If a temporary solution was discarded, the learning
started again.

Modulated random search was able to find successful behaviours in almost all
cases in E1, despite a short training time of only two hours. The slightly
longer training time of four hours for E2, however, was apparently far too
short to find consistently good solutions. Both in E3 and E4 the number of
intermediate temporary solutions was significantly higher than the final
number of solutions. The pendulum experiment was an easier task, and therefore
many successful behaviours were found.

Toutounji and Pasemann note that even almost stable networks are destroyed if
they have the slightest weakness. Therefore, modulated random search does not
help at all against catastrophic forgetting.

The modulated Gaussian walk, contrary to the random search, tends to improve
temporary solutions when they have weaknesses. For E3 the random search
resulted in 34 temporary solutions which lasted longer than five minutes,
averaging \(5.7\) minutes per solution. The Gaussian walk found roughly twice
as many temporary solutions and averaged \(12.5\) minutes per solution. This
indicates that the Gaussian walk mitigates catastrophic forgetting, although
it does not completely remove it.

After the experiments related to modulated random search and the modulated
Gaussian walk, the experiment related to localized learning is considered. The
setup of this experiment was already described above. After performing some
tests, Velez and Clune discovered that two functional modules had formed: one
set of connections learns during summer and the other during winter. The
connections learning in summer do not change in winter, and vice versa. This
completely removes catastrophic forgetting.

If catastrophic forgetting is the only measure, then localized learning seems
to be the supreme solution to the problem. But Velez and Clune only showed
that it works in a very bespoke setup with a priori information about the
linear separability of the learning areas and the correct solution. It has yet
to be shown that localized learning can be generalized to larger problems.
Modulated random search can be completely discarded as a potential solution.
The modulated Gaussian walk is a clear improvement over the random search in
the analyzed experiments.

While all three approaches use diffusion-based neuromodulation, the first two
and the third are quite different in their setup. First, the general
neuromodulation architecture differs (each neuron can diffuse neuromodulators
vs. two stationary sources); second, the actual weight change differs as well.
Both the modulated Gaussian walk and the presented localized learning improve
previous weights instead of completely replacing them. The Gaussian walk draws
the weight change from a normal distribution, while localized learning uses
Hebbian learning: it therefore depends on the activations of the two connected
neurons and incorporates the neuromodulator concentration directly into the
weight change formula itself.

It is important to note that the advantage of the Gaussian walk had nothing to
do with the architecture of the neuromodulation, as that was identical for
both random search and Gaussian walk. The improvement originated in the
learning rule. In their experiments, Toutounji and Pasemann used a homogeneous
diffusion, but it would have been possible to use different diffusion
strengths, decays and so on for every neuron.

For the future it would be interesting to compare the LMNN architecture with
the "sources" architecture of localized learning in order to understand the
impact of the neuromodulation architecture. In addition, the Gaussian walk
learning rule should be compared with the Hebbian learning rule used by
localized learning.

For the E3 experiment used by Toutounji and Pasemann, it is safe to assume
that this kind of a priori placement of neuromodulator sources will not work.
The experiment requires the robot to solve two tasks at the same time: it has
to approach the lights and avoid obstacles. Since both tasks need to be solved
simultaneously and at all times, it is not possible to devise two "seasons" or
a similar separation of learning time. Therefore, the LMNN architecture is
likely better suited.

Nevertheless, it would make sense to separate the learning for these two
tasks, as a robot might already be very good at approaching lights but only
mediocre at avoiding obstacles. In that case the improvements for the second
task should not impact the first task. The Hebbian learning rule is likely
better suited to achieve this effect, as it ties the weight change to the
correlation of the connected neurons. Simply using a value sampled from a
normal distribution, as the Gaussian walk does, probably does not result in
localized learning. On the other hand, localized learning will likely only
work if it is possible to give the robot distinct feedback about its
performance in each task. If it only receives combined feedback, it is more
difficult to utilize localized learning, as it is then not easy to find out
which part (and therefore which weights) performed badly.

In situations where there is only one task to solve (E2), or where the
feedback is only given as a total without distinct information about each
subtask, it is very likely that localized learning will not work; therefore
the Gaussian walk is better suited than Hebbian learning.

\section{Conclusion}
\label{sec:concl}

The second environmental feedback loop is used to tell autonomous systems when
to learn. However, the mere existence of such a loop is not enough: it matters
how this feedback loop works and how it is connected with the rest of the
network. The weight change probability of both modulated random search and the
modulated Gaussian walk constitutes the second environmental feedback loop,
yet it was shown that these two approaches differ vastly in their performance.
Therefore it is equally important how the learning actually works. The
comparison has shown that localized learning utilizing neuromodulator sources
can overcome catastrophic forgetting for small networks in a very restricted
setup. Furthermore, the comparison revealed that modulated random search is
not part of a solution to catastrophic forgetting. In a more general case it
is likely that the LMNN architecture is better than the sources architecture
and that Hebbian learning is better suited for combined tasks and localized
learning than the modulated Gaussian walk. For single-task environments, or
those where localized learning is not an option, the modulated Gaussian walk
is likely better suited than Hebbian learning.

Future work should examine the assumptions made here and analyze which network
architecture and which learning rule are better for the kind of autonomous
robot experiments that were conducted by Toutounji and Pasemann. In general,
the applicability of localized learning to bigger problems, for example in the
area of deep neural networks, should be researched.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Here, at the end of the text, the bibliographic references are included.
%
% The actual entries are stored in the file ``bib.bib''.
%
\newpage
\printbibliography
\addcontentsline{toc}{section}{Bibliography}% Add to the TOC

\end{document}