% Figure fig:system -- block diagram of the joint object recognition network.
%   Upper row : image `lowerbed` -> Conv 1 -> ReLU + Pool -> Conv 2 ->
%               ReLU + Pool -> Conv 3 -> ReLU -> FC 2
%   Lower row : image `upperbed` -> "2D VGG on ImageNet" box -> FC 1
%   Both rows : -> Concatenation -> FC 3 -> { FC Class  -> Softmax,
%                                             FC 3D Box -> L1 Smooth }
\begin{figure}
  % \DisableQuotes / \EnableQuotes are defined outside this block.
  % NOTE(review): presumably they neutralise an active `"` so that the
  % width("...") expressions below parse -- confirm against the preamble.
  \DisableQuotes
  \centering
  \begin{tikzpicture}[
    every node/.style={
      outer sep=0,
      inner sep=0,
      node distance=0.2cm
    },
    % ---- Shared building blocks for the rotated layer boxes ----
    % Common look: 3mm thick rectangle, sans-serif label, turned by 90 degrees.
    layer base/.style={
      rectangle,
      minimum height=3mm,
      font=\sffamily,
      rotate=90,
    },
    % Box length derived from the "ReLU + Pool" label.
    layer std/.style={
      layer base,
      minimum width={width("ReLU + Pool") - 2em},
    },
    % Box length derived from the "FC 3D Box" label.
    layer box/.style={
      layer base,
      minimum width={width("FC 3D Box") - 1.7em},
    },
    % ---- Styles used by the nodes below (names unchanged) ----
    % The fill colours conv, relu, fc, softmax and gray2 are defined elsewhere.
    layerconv/.style={layer std, fill=conv},
    layerrelu/.style={layer std, fill=relu},
    layerfc/.style={layer std, fill=fc},
    layerfc2/.style={layer box, fill=fc},
    layerfclarge/.style={
      layer base,
      minimum width={width("Concatenation") + 1.5em},
      fill=fc,
    },
    layersoftmax/.style={layer std, fill=softmax},
    layersmooth/.style={layer box, fill=softmax},
    % Unrotated box for the 2D network; its height equals the length of a
    % `layer box` so that it lines up with the FC 1 box next to it.
    layergray/.style={
      rectangle,
      minimum width={width("2D VGG on ImageNet") - 3em},
      minimum height={width("FC 3D Box") - 1.7em},
      fill=gray2,
      font=\sffamily,
      anchor=south west,
    }]

    % ---- Input images ----
    \node (bedB) at (0,0) {\includegraphics[scale=0.12]{upperbed}};
    \node (bedT) [above=of bedB] {\includegraphics[scale=0.15]{lowerbed}};

    % ---- Upper row ----
    % Each rotated box is wrapped in its own inner tikzpicture, so the outer
    % node that gets positioned is the box around that inner picture.
    % NOTE(review): nesting tikzpicture environments is not officially
    % supported by TikZ; it is kept here so the layout stays unchanged.
    \node (conv1) [right=0.35 of bedT]
      {\begin{tikzpicture} \node [layerconv] {\tiny Conv 1}; \end{tikzpicture}};
    \node (relu1) [right= of conv1]
      {\begin{tikzpicture} \node [layerrelu] {\tiny ReLU + Pool}; \end{tikzpicture}};
    \node (conv2) [right= of relu1]
      {\begin{tikzpicture} \node [layerconv] {\tiny Conv 2}; \end{tikzpicture}};
    \node (relu2) [right= of conv2]
      {\begin{tikzpicture} \node [layerrelu] {\tiny ReLU + Pool}; \end{tikzpicture}};
    \node (conv3) [right= of relu2]
      {\begin{tikzpicture} \node [layerconv] {\tiny Conv 3}; \end{tikzpicture}};
    \node (relu3) [right= of conv3]
      {\begin{tikzpicture} \node [layerrelu] {\tiny ReLU}; \end{tikzpicture}};

    % ---- Lower row ----
    % layergray comes last in the option list, so its anchor=south west
    % replaces the anchor chosen by `below right`.
    \node (vgg) [below right=-0.01 and 0.4 of bedB,layergray]
      {\scriptsize 2D VGG on ImageNet};
    \node (fc1) [right= of vgg]
      {\begin{tikzpicture} \node [layerfc2] {\tiny FC 1}; \end{tikzpicture}};
    \node (fc2) [right= of relu3]
      {\begin{tikzpicture} \node [layerfc] {\tiny FC 2}; \end{tikzpicture}};

    % ---- Fusion of both rows and the two output heads ----
    \node (concat) [above right=0.25 and 0.3 of fc1.north east,anchor=west]
      {\begin{tikzpicture} \node [layerfclarge] {\tiny Concatenation}; \end{tikzpicture}};
    \node (fc3) [right=0.3 of concat]
      {\begin{tikzpicture} \node [layerfclarge] {\tiny FC 3}; \end{tikzpicture}};
    % The two heads are aligned with the top and bottom corners of FC 3.
    \node (fcclass) [right=0.3 of fc3.north east,anchor=north west]
      {\begin{tikzpicture} \node [layerfc] {\tiny FC Class}; \end{tikzpicture}};
    \node (fc3dbox) [right=0.3 of fc3.south east,anchor=south west]
      {\begin{tikzpicture} \node [layerfc2] {\tiny FC 3D Box}; \end{tikzpicture}};
    \node (softmax) [right=of fcclass]
      {\begin{tikzpicture} \node [layersoftmax] {\tiny Softmax}; \end{tikzpicture}};
    \node (smooth) [right=of fc3dbox]
      {\begin{tikzpicture} \node [layersmooth] {\tiny L1 Smooth}; \end{tikzpicture}};

    % ---- Plain connectors between adjacent layers ----
    % \path is enough: the main path holds only move-tos and every connection
    % is drawn by its own `edge`.
    \path
      (vgg.east) edge (fc1.west)
      (conv1)    edge (relu1)
      (relu1)    edge (conv2)
      (conv2)    edge (relu2)
      (relu2)    edge (conv3)
      (conv3)    edge (relu3)
      (relu3)    edge (fc2)
    ;

    % ---- Arrows marking the data flow ----
    % (a -| b) / (a |- b) pick the point on the horizontal line through one
    % coordinate and the vertical line through the other, which keeps these
    % arrows horizontal.
    \path[->]
      (bedB)                edge (bedB -| vgg.west)
      (bedT)                edge (conv1)
      (fc1)                 edge (fc1 -| concat.west)
      (fc2)                 edge (fc2 -| concat.west)
      (concat)              edge (fc3)
      (fcclass)             edge (softmax)
      (fc3dbox)             edge (smooth)
      (fc3.east |- fcclass) edge (fcclass)
      (fc3.east |- fc3dbox) edge (fc3dbox)
    ;
  \end{tikzpicture}
  \EnableQuotes
  \caption{\textbf{Joint Object Recognition Network:}
    For each 3D region proposal, the 3D volume from depth is fed to a 3D ConvNet and the 2D projection of the 3D proposal is fed to a 2D ConvNet.
    Jointly they learn the object category and 3D box regression.}
  \label{fig:system}
\end{figure}