% This is the master BibTeX file for iLab.usc.edu
% Copyright (C) 2000-2007 by iLab at the University of Southern California.
% All rights reserved.

% Special entry types:
% --------------------
% <at>invited for invited/plenary talks
% <at>press for press coverage, interviews, etc
% <at>misc for misc stuff
%
% all in all:
%   'article' => 'Journal Articles',
%   'incollection' => 'Book Chapters',
%   'inproceedings' => 'Proceedings from International Conferences',
%   'book' => 'Books',
%   'phdthesis' => 'Ph.D. Theses',
%   'mastersthesis' => 'Master Theses',
%   'techreport' => 'Technical Reports',
%   'booklet' => 'Booklets',
%   'manual' => 'Technical Documentations',
%   'proceedings' => 'Edited Conference Proceedings',
%   'unpublished' => 'Unpublished Documents',
%   'patent' => 'Patents and Copyrights',
%   'press' => 'Press Coverage',
%   'misc' => 'Miscellaneous',
%   'invited' => 'Selected Plenary and Invited Talks',

% Special fields:
% ---------------
% - type: semicolon-separated list of classifications.
%         See http://ilab.usc.edu/publications/src/bibTOhtml for definitions
%         As of 10/22/2007:
%         bu   = 'Model of Bottom-Up Saliency-Based Visual Attention';
%         td   = 'Model of Top-Down Attentional Modulation';
%         psy  = 'Human Psychophysics';
%         mod  = 'Computational Modeling';
%         mip  = 'Medical Image Processing';
%         fmri = 'Functional Neuroimaging';
%         med  = 'Medical Research';
%         cv   = 'Computer Vision';
%         rev  = 'Review Articles and Chapters';
%         bb   = 'Beobots';
%         sc   = 'Scene Understanding';
%         eye  = 'Human Eye-Tracking Research';
%         su   = 'Bayesian Theory of Surprise';
%         phy  = 'Monkey Electrophysiology';
%
% - review: For ALL inproceedings entries, choose from:
%         review = {full/conf}  international conference with 
%                  full-paper review (e.g., CVPR, NIPS)
%         review = {abs/conf}  international conference with 
%                  abstract review (e.g., SPIE, VSS, SfN)
%         review = {full/wkshp}  international workshop with 
%                  full-paper review (e.g., WAPCV)
%         review = {abs/wkshp}  international workshop with 
%                  abstract review (e.g., some obscure workshop)
% - if: impact factor. For journal papers,
%         if = {200X impact factor: x.xxx}
%         for conference papers,
%         if = {200X acceptance rate: xx.x\%}
%
% - file: Put a URL to a PDF. We don't use the standard URL field as
%         often we don't want the URL to show up in the reference lists
%         when we cite our papers. Give a copy of your PDF to Laurent so that
%         downloads of it will count towards our "top downloads" list.

% Miscellaneous:
% --------------
%
% - please STRICTLY adhere to the key naming rules. See
%   http://ilab.usc.edu/bibTOhtml/ for the general logic, plus, for
%   ilab.bib, always add some abbreviated initials of the
%   journal/conference/etc. For example, Peters_Itti08nips
%
% - make sure you enter a month!
%
% - give your PDF to Laurent for upload to the server if you put
%   something in the 'file' field. STRICTLY same name as the BibTeX key!
%
% - ONLY standard ASCII please, no windows characters. Plus you have
%   to be careful that it should work for HTML (some chars will be
%   escaped for you, like <) and BibTeX (need to escape % as \% or percent).
%   Have a look at http://ilab.usc.edu/publications/src/bibTOhtml but
%   please don't edit this one unless you know exactly what you are doing.
%
% - Keep entries roughly ordered by date of publication. Keep in mind
%   that your entry will be indexed by Google Scholar immediately. So if
%   you put an 'in press' paper, try to make the right guess for the year,
%   put no page numbers, etc to avoid ending up with several entries later
%   on in Google Scholar.
%
% - Chances are that an entry in the same journal or conference
%   already exists in this file for a previous year. Then look for it and
%   copy it to get started. This will get you the same standardized
%   conference name as the other entries, etc.

@article{Marat_Itti12vc,
  author  = {S. Marat and L. Itti},
  title   = {Influence of the amount of context learned for improving object classification when simultaneously learning
                  object and contextual cues},
  journal = {Visual Cognition},
  year    = {2012},
  type    = {mod;cv;sc},
  if      = {2010 impact factor: 1.828}
}

@inproceedings{Borji_etal12icra,
  author    = {A. Borji and D. N. Sihite and L. Itti},
  title     = {Modeling the Influence of Action on Spatial Attention in Visual Interactive Environments},
  booktitle = {Proc. IEEE International Conference on Robotics and Automation (ICRA)},
  pages     = {1-6},
  year      = {2012},
  month     = {May},
  review    = {full/conf},
  type      = {mod;td;cv},
  if        = {2012 acceptance rate: 40\%}
}

@inproceedings{Windaw_Itti11iros,
  author    = {J. Windaw and L. Itti},
  title     = {Multilayer real-time video image stabilization},
  booktitle = {Proc. IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  pages     = {2397-2402},
  year      = {2011},
  month     = {Sep},
  abstract  = {In many camera-based robotics applications, stabilizing video images in real-time is often critical for
                  successful performance. In particular vision-based navigation, localization and tracking tasks cannot
                  be performed reliably when landmarks are blurry, poorly focused or disappear from the camera view due
                  to strong vibrations. Thus a reliable video image stabilization system would be invaluable for these
                  applications. This paper presents a real-time video image stabilization system (VISS) primarily
                  developed for aerial robots. Its unique architecture combines four independent stabilization
                  layers. Layer 1 detects vibrations via an inertial measurement unit (IMU) and performs external
                  counter-movements with a motorized gimbal. Layer 2 damps vibrations by using mechanical devices. The
                  internal optical image stabilization of the camera represents Layer 3, while Layer 4 filters remaining
                  vibrations using software. VISS is low-cost and robust. It has been implemented on a 'Photoship One'
                  gimbal, using GUMBOT hardware for processing Sparkfun-IMU data (Layer 1). Lord Mount vibration
                  isolators damp vibrations (Layer 2). Video images of Panasonic's Lumix DMC-TZ5 camera are optically
                  stabilized with Panasonic's 'Mega O.I.S.' technique (Layer 3) and digitally stabilized with 'Deshaker'
                  software (Layer 4). VISS significantly improved the stability of shaky video images in a series of
                  experiments.},
  review    = {full/conf},
  type      = {bb},
  file      = {http://ilab.usc.edu/publications/doc/Windaw_Itti11iros.pdf},
  if        = {2011 acceptance rate: 32\%}
}

@invited{Itti11iros,
  author    = {L. Itti},
  title     = {Biologically-inspired attention and scene understanding algorithms for mobile robots},
  booktitle = {Sixth international cognitive vision workshop of IROS 2011},
  year      = {2011},
  month     = {Sep},
  type      = {bu;td;mod}
}


@article{Boehnke_etal11ejn,
  author   = {S. E. Boehnke and D. J. Berg and R. A. Marino and P. F. Baldi and L. Itti and D. P. Munoz},
  title    = {Visual adaptation and novelty responses in the superior colliculus},
  journal  = {European Journal of Neuroscience},
  volume   = {34},
  number   = {5},
  pages    = {766-779},
  year     = {2011},
  month    = {Sep},
  abstract = {The brain's ability to ignore repeating, often redundant, information while enhancing novel information
                  processing is paramount to survival. When stimuli are repeatedly presented, the response of visually
                  sensitive neurons decreases in magnitude, that is, neurons adapt or habituate, although the mechanism
                  is not yet known. We monitored the activity of visual neurons in the superior colliculus (SC) of
                  rhesus monkeys who actively fixated while repeated visual events were presented. We dissociated
                  adaptation from habituation as mechanisms of the response decrement by using a Bayesian model of
                  adaptation, and by employing a paradigm including rare trials that included an oddball stimulus that
                  was either brighter or dimmer. If the mechanism is adaptation, response recovery should be seen only
                  for the brighter stimulus; if the mechanism is habituation, response recovery ('dishabituation')
                  should be seen for both the brighter and dimmer stimuli. We observed a reduction in the magnitude of
                  the initial transient response and an increase in response onset latency with stimulus repetition for
                  all visually responsive neurons in the SC. Response decrement was successfully captured by the
                  adaptation model, which also predicted the effects of presentation rate and rare luminance
                  changes. However, in a subset of neurons with sustained activity in response to visual stimuli, a
                  novelty signal akin to dishabituation was observed late in the visual response profile for both
                  brighter and dimmer stimuli, and was not captured by the model. This suggests that SC neurons
                  integrate both rapidly discounted information about repeating stimuli and novelty information about
                  oddball events, to support efficient selection in a cluttered dynamic world.},
  type     = {su;phy},
  file     = {http://ilab.usc.edu/publications/doc/Boehnke_etal11ejn.pdf},
  if       = {2009 impact factor: 3.658}
}

@inproceedings{Borji_etal11bmvc,
  author    = {A. Borji and D. N. Sihite and L. Itti},
  title     = {Computational Modeling of Top-down Visual Attention in Interactive Environments},
  booktitle = {Proc. British Machine Vision Conference (BMVC 2011)},
  pages     = {85.1-85.12},
  year      = {2011},
  month     = {Sep},
  abstract  = {Modeling how visual saliency guides the deployment of attention over visual scenes has attracted much interest
                  recently - among both computer vision and experimental/computational researchers - since visual
                  attention is a key function of both machine and biological vision systems. Research efforts in
                  computer vision have mostly been focused on modeling bottom-up saliency. Strong influences on
                  attention and eye movements, however, come from instantaneous task demands. Here, we propose models of
                  top-down visual guidance considering task influences.  The new models estimate the state of a human
                  subject performing a task (here, playing video games), and map that state to an eye position. Factors
                  influencing state come from scene gist, physical actions, events, and bottom-up saliency. Proposed
                  models fall into two categories. In the first category, we use classical discriminative classifiers,
                  including Regression, kNN and SVM. In the second category, we use Bayesian Networks to combine all
                  the multi-modal factors in a unified framework. Our approaches significantly outperform 15 competing
                  bottom-up and top-down attention models in predicting future eye fixations.},
  review    = {full/conf},
  type      = {bu;td;mod;psy},
  file      = {http://ilab.usc.edu/publications/doc/Borji_etal11bmvc.pdf},
  if        = {2011 acceptance rate: 31.8\%}
}

@invited{Itti11ss,
  author    = {L. Itti},
  title     = {Visual attention: theories and models},
  booktitle = {Series of lectures for the CoSMo 2011 summer school in computational sensory-motor neuroscience, Kingston,
                  ON, Canada},
  year      = {2011},
  month     = {Aug},
  type      = {bu;td;mod}
}

@invited{Itti11de,
  author    = {L. Itti},
  title     = {Modeling of bottom-up and top-down visual attention in natural environments},
  booktitle = {USC Discover Engineering guest lecture},
  year      = {2011},
  month     = {Aug},
  type      = {bu;td;mod}
}

@inproceedings{Noori_Itti11ms,
  author    = {N. Noori and L. Itti},
  title     = {Symbolic Simulation: a grounded mechanistic account for processing symbolic information},
  booktitle = {Proc. 44th Annual Meeting of the Society for Mathematical Psychology (MathPsych 2011)},
  year      = {2011},
  month     = {Jul},
  abstract  = {Cognition by means of abstract symbolic concepts in an algorithmic manner is one of the tenets of mathematical
                  cognition.  Identifying the relationship between this evolutionarily newly emerged symbolic
                  machinery and rudimentary older modal systems has motivated numerous studies mostly focused on
                  grounding representation of symbolic concepts (Barsalou 2008). However recent evidences emerging from
                  neuroimaging and patient studies suggest that modal systems for visually guiding actions in space play
                  a role in mental operations on symbolic information that is beyond representation of symbolic concepts
                  (Koenigs M. et al 2009, Knops A.  et al. 2009). Motivated by these findings we posit a grounded
                  mechanistic model for algorithmic controlled information processing in human brain. We propose a
                  critical role for a spatially organized short-term memory which is used for anchoring task relevant
                  items into the space. These anchors are used for selective processing of the maintained
                  information. Selective processing of information (such as deletion of item from memory) in turn is made
                  possible through shifts in spatial attention towards registry location of the item of interest in the
                  space. This registry system along with an articulatory system for hashing items into phonological
                  codes, and a system for performing and monitoring sequential actions provide necessary mechanisms for
                  employing overly-trained networks for processing limited set of activated items in arbitrary
                  algorithms.  We have evaluated our hypothesis by detecting process related traces of mental symbolic
                  operations in both eye movements of human subjects and visuospatial short-term memory of objects in
                  the environment.},
  review    = {abs/conf},
  type      = {eye;psy},
  if        = {}
}

@inproceedings{Noori_Itti11ms2,
  author    = {N. Noori and L. Itti},
  title     = {Symbolic Simulation: a neural account for algorithmic and controlled information processing in human brain},
  booktitle = {Proc. 44th Annual Meeting of the Society for Mathematical Psychology (MathPsych 2011)},
  year      = {2011},
  month     = {Jul},
  abstract  = {Recent findings of neuroimaging and patient studies suggest that brain regions with visual-spatial
                  characteristics are involved in a wide range of memory tasks including those with no immediate
                  visual-spatial features. Yet, exactly how these regions contribute to such tasks remains an open
                  question. To address this question here we propose a framework for manipulation of items in memory
                  which relies on registering memory items in a spatially-organized short-term memory store. Switching
                  executive attention to memory items that need processing may then be embodied through shifting spatial
                  attention towards those registry locations. This assumption suggests that a secondary executive
                  memory task may interfere with visuospatial short-term memory selectively and independent of the load on
                  executive attention. Experiments with human subjects verified these predictions.  Our findings suggest
                  that visuospatial short-term memory serves domain independent processes for memory manipulation in
                  addition to domain specific functions that require temporary maintaining of spatial information.},
  review    = {abs/conf},
  type      = {eye;psy},
  if        = {}
}

@inproceedings{Noori_Itti11cs,
  author    = {N. Noori and L. Itti},
  title     = {Modeling forward and backward serial recall using a spatial registry assumption},
  booktitle = {Proc. Conference on Cognitive Science (CogSci 2011)},
  year      = {2011},
  month     = {Jul},
  abstract  = {Inspired by recent observations of involvement of brain regions with visuospatial characteristics in mental
                  tasks featuring memory manipulation, we offer a model based on a spatial registry of working memory
                  items to explain the error patterns in both forward and backward recall. We hypothesize this spatial
                  registry is made possible by recruiting visuospatial resources. This spatial registry facilitates
                  selective processing of memory items determined by the algorithmic features of the mental task. We
                  assume that retrieving working memory items from spatial registry locations is subject to errors which
                  are determined by biological characteristics of a population encoding of spatial registry
                  locations. In this model, spatial-temporal factors of the population encoding of space, captures the
                  nature of visuospatial short-term memory. Our simulation results match both the positional and the
                  displacement error distributions for forward and reverse recall tasks in addition to fill-in errors in
                  forward recall.},
  review    = {abs/conf},
  type      = {eye;psy},
  file      = {http://ilab.usc.edu/publications/doc/Noori_Itti11cs.pdf},
  if        = {}
}

@inproceedings{Noori_Itti11cs2,
  author    = {N. Noori and L. Itti},
  title     = {Spatial Registry Model: Towards a Grounded Account for Executive Attention},
  booktitle = {Proc. Conference on Cognitive Science (CogSci 2011)},
  pages     = {1-6},
  year      = {2011},
  month     = {Jul},
  abstract  = {Mental tasks that feature algorithmic processing with symbolic items are shown to rely on brain regions known
                  for visual-spatial functions. Yet, exactly how these functions may help execution of amodal tasks
                  remains an open question. Here we propose a hypothesis for manipulation of items in working memory,
                  which relies on registering items in a spatially-organized short-term memory store. Switching
                  executive attention to items that need processing may then be embodied through shifting spatial
                  attention towards those registry locations. We studied gaze shifts of human subjects during memory
                  tasks as a proxy for shifts in spatial attention. Analysis of gaze shifts during sorting random
                  sequences of five decimal digits indicates that sorting in memory elicits gaze shifts that correlate
                  with sorting procedure. Our proposal establishes a functional relationship between general-purpose
                  production mechanisms that support algorithmic memory tasks with amodal items, and modal systems for
                  perception and action.},
  review    = {full/conf},
  type      = {eye;mod;psy},
  file      = {http://ilab.usc.edu/publications/doc/Noori_Itti11cs2.pdf},
  if        = {}
}

@invited{Itti11bor,
  author    = {L. Itti},
  title     = {Computational modelling of attention and visual behaviour},
  booktitle = {25 years of research on saliency - state of the art and new directions},
  year      = {2011},
  month     = {Jul},
  type      = {bu;td;mod}
}

@inproceedings{Noori_Itti11ecs,
  author    = {N. Noori and L. Itti},
  title     = {Eye-Movement Signatures of Abstract Mental Tasks},
  booktitle = {Proc. European Conference on Cognitive Science (EuroCogSci 2011)},
  editor    = {B. Kokinov and A. Karmiloff-Smith and N. J. Nersessian},
  pages     = {110:1-110:6},
  year      = {2011},
  month     = {May},
  abstract  = {Brain regions with visual-spatial characteristics are known to be recruited in mental tasks featuring
                  algorithmic information processing with symbolic concepts. Yet, exactly how they contribute to such
                  processing remains an open question. Here we propose a framework for manipulation of items in
                  memory, which relies on registering memory items in a spatially-organized short-term memory
                  store. Switching executive attention to memory items that need processing may then be embodied
                  through shifting spatial attention towards those registry locations. We studied gaze shifts during
                  memory tasks as a proxy for shifts in spatial attention. Analysis of gaze shifts during sorting
                  semi-random sequences of five decimal digits indicates that sorting in memory elicits gaze shifts that
                  correlate with sorting procedure. Our proposal establishes a functional relationship between those
                  general-purpose production mechanisms that support algorithmic memory tasks with amodal symbolic
                  information and modal systems for perception and action.},
  review    = {full/conf},
  type      = {eye;psy},
  file      = {http://ilab.usc.edu/publications/doc/Noori_Itti11ecs.pdf},
  if        = {}
}

@inproceedings{Borji_etal11vss,
  author    = {A. Borji and D. N. Sihite and L. Itti},
  title     = {Quantifying the relative influence of photographer bias and viewing strategy on scene viewing},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS11)},
  year      = {2011},
  month     = {May},
  abstract  = {Saccade distributions while observers freely watch natural scenes and videos are often found to be highly
                  biased toward the image center (center-bias effect) (Tatler, 2007). Our quantitative comparison of 30
                  saliency models over three standard datasets of still images (Bruce & Tsotsos 2006, Kootstra et al.,
                  2008 and Judd et al., 2009), shows that model rankings do not agree. Interestingly, a trivial central
                  Gaussian blob saliency model outperforms many models in regard to predicting where humans look. Two
                  main sources of center-bias are: photographer bias (natural tendency of photographers to place objects
                  of interest near the center) and viewing strategy (tendency of subjects to look at the center to
                  extract more information) (Tseng et al., 2009). In this study, we measure the relative influence of
                  these causes and introduce a less center-biased dataset as a benchmark for fair evaluation of
                  models. From four datasets (three aforementioned and Le Meur et al., 2006), we chose those images with
                  the lowest center-bias index (a defined measure of tendency of human saccade density maps to be
                  concentrated toward center) and selected just 187 out of overall 1250 stimuli. The average center-bias
                  index of accepted images, all original stimuli and Gaussian blob were 0.61, 0.76 (0.88 for Judd) and
                  1, respectively. Next, to remove the variability in eye recording parameters in datasets, we recorded
                  eye movements of 30 subjects watching these images. The center-bias index for recorded eye movements
                  over selected images and the Judd dataset were 0.76 and 0.861, respectively. After removing the first
                  saccade, these values dropped to 0.68 and to 0.845. Although selected images had less objects at the
                  center, there was still a great amount of saccade density at the center. Our results suggest that, 1)
                  Widely used datasets are center-biased, 2) Photographer bias could be reduced, and 3) Viewing strategy
                  has a higher influence than photographer bias on overall center-bias since removing photographer bias
                  does not reduce overall center-bias significantly.},
  review    = {abs/conf},
  type      = {psy;mod;eye}
}

@inproceedings{Tseng_etal11vss,
  author    = {P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
  title     = {Effects of development on low-level feature processing during natural viewing of dynamic scenes},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS11)},
  year      = {2011},
  month     = {May},
  abstract  = {Eye movements have been widely used to examine many aspects of brain functions, such as reflexive response,
                  inhibitory controls, and working memory, in normal development.  However, it is unclear how normal
                  development affects eye movements of natural viewing behavior.  This study specifically examined the
                  developmental trajectory of low-level features processing while participants freely viewed videos of
                  natural scenes. These videos are composed of short (2-4 seconds), unrelated clips. This design was to
                  reduce top-down expectation and to magnify the difference in gaze allocation at every scene change.
                  Gazes of 3 groups of participants (18 children, 10.7 +/- 1.8 yr; 18 young adults, 23.2 +/- 2.6 yr; 24
                  elderly, 70.3 +/- 7.5 yr) were tracked while they watched the videos for 20 minutes.  First, we used a
                  computational saliency model (Itti & Koch, 2001) to compute bottom-up saliency maps for each video
                  frame.  These saliency maps can be computed from a single feature (e.g. color contrast, motion
                  contrast) or a combination of them.  Next, we computed the correlation between salience and gaze of
                  each population.  To reveal the developmental trajectory of low-level features processing, classifiers
                  were built to differentiate (1) children vs. young adults, and (2) young adults vs. elderly. In the
                  mean time, a feature selection method was performed to identify the most discriminative features for
                  differentiating the populations.  Using this method, we found that during normal maturation (children
                  to young adults), there was a reduction in saccade interval and an increase in correlation between
                  gaze and texture contrast, orientated edges, and color contrast.  On the other hand, during normal
                  aging (young adults to elderly), we found an increase in saccade interval and a decrease in
                  correlation between gaze and oriented edges.  In conclusion, this study revealed for the first time
                  the differences between age groups in low-level feature processing during natural viewing of dynamic
                  scenes.},
  review    = {abs/conf},
  type      = {psy;mod;eye},
  file      = {http://ilab.usc.edu/publications/doc/Tseng_etal11vss.pdf}
}

@inproceedings{Shen_Itti11vss,
  author    = {J. Shen and L. Itti},
  title     = {Top-down Visual Attention and Gender in a Focused Listening Task},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS11)},
  year      = {2011},
  month     = {May},
  abstract  = {Visual attention and patterns of eye movements may be influenced by individual characteristics, such as gender
                  or culture, in specific stimulus and task environments. From psychology studies of social interaction,
                  we know that gaze behavior varies between men and women when in conversation. Using eye-tracking in
                  naturalistic settings, we found that men and women orient attention differently during conversational
                  listening. Thirty-four subjects (15 men and 19 women) had their eyes tracked while watching and
                  listening to twelve videotaped speakers in 84 different clips. While listening, we found that men gaze
                  more often at the mouth (p = 0.009) and women at the eyes (p = 0.028) of the speaker. In addition, we
                  measured the static and dynamic feature saliency according to a previously verified model of attention
                  (Itti, 2004). When we measured the correlation (in ROC score) of each subject's eye movements to
                  feature, we found that the fixations of men correlated more strongly with dynamic saliency (p <
                  0.0001), even at regions inside the face, i.e. the eyes (p = 0.023). We attribute overall gaze gender
                  differences in social interactions to a male preference for motion and a female preference for
                  features that are socially defined. We also propose that these gender differences arise from different
                  integration strategies of visual cues in selecting the final target of attention. Our findings
                  illuminate how the character of social interactions may vary by gender, and may also suggest more
                  predictive models of visual attention that take into account individual characteristics.},
  review    = {abs/conf},
  type      = {psy;mod;eye}
}

@inproceedings{Chang_etal11vss,
  author    = {C. K. Chang and C. Siagian and L. Itti},
  title     = {Mobile robot vision navigation and obstacle avoidance based on gist and saliency algorithms},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS11)},
  year      = {2011},
  month     = {May},
  abstract  = {Two of the important capabilities needed for scene understanding are extracting the gist of the scene and
                  identifying salient regions in the image. Here we present a robotic vision system that utilizes these
                  two modules to understand its surrounding from an image. That is, we would like the robot to be able
                  to localize and navigate in its environment. We present a vision-based navigation and localization
                  system using the two biologically-inspired scene understanding models. For localization, gist, which
                  captures the holistic characteristics and layout of an image, coarsely localize the robot to within
                  the general vicinity. Then, saliency, which emulates the visual attention of primates, refine the
                  location information by recognizing the detected conspicuous regions in the image.  For the vision
                  navigation sub-system, we use the gist features to identify the road region. Here, the image is
                  segmented into multiple regions, which are then classified using the gist features to find most likely
                  road region. By incorporating knowledge about the road geometry, the system is able to locate the
                  center of the road as well as avoid obstacles. At the same time, we also use the recognized salient
                  regions to prime the location of the road in the image. Furthermore these regions provides high level
                  navigation parameters such as distance to the junction and overall heading of the road (Chang et al.,
                  2010). The navigation system then uses the estimated road parameters to perform visual feedback
                  control to direct the robot's heading and to go to a user-provided goal location.  We test the vision
                  localization and navigation system at four sites (one indoor and three outdoor environments) using our
                  mobile robot, Beobot 2.0. The system is able to keep robot in the center of the lane with a route
                  length over 138.27 m. },
  review    = {abs/conf},
  type      = {cv;bb;bu}
}

@inproceedings{Chiang_etal11vss,
author={A-Y. D. Chiang and D. J. Berg and L. Itti},
title={Saliency, Memory, and Attention Capture in Marketing},
abstract={Visual attention is considered to have great value in marketing. The AIDA (Attention – Interest – Desire –
                  Action) advertising model suggests that attention capture is the first and most important step before
                  the desired consumer consumption behavior takes action. Pre-attentive visual processing accounts
                  largely for building up brand preferences in consumer schema: People tend to choose one brand over the
                  other because they feel familiar with it (mere exposure effect), even though they don't consciously
                  remember having seen the brand or its advertisements before. Marketers have been spending a lot of
                  money and time designing, and choosing effective publicity materials for consumer attention
                  capture. An efficient evaluation tool is thus considered necessary. We propose that saliency map (the
                  computational model of vision) can serve as the useful tool to predict people's eye fixation locations
                  in an advertisement, and help marketers to make strategic decisions choosing the most effective ad for
                  publicity through an objective manner. To test saliency map's efficiency, eye movements from fourteen
                  naive subjects were recorded while eighteen images from scenes of shopping environments were showed to
                  them for two seconds followed by a random mask. Subjects were then asked to recall whether
                  subsequently presented image contained items that were presented in the scene. We found no significant
                  correlation between subjects' recall rates and computed saliency of objects from the scenes; however,
                  the computed saliency has predicted eye locations three standard deviations above chance. The result
                  has supported other marketing studies on pre-attentive visual processing, and further demonstrated the
                  potential of saliency map in marketing.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS11)},
year={2011},
month={May},
type={cv;bu;psy;eye},
review={abs/conf}
}

@inproceedings{Noori_Itti11vss,
author={N. Noori and L. Itti},
title={Visuospatial attention shifts during non-visual mental tasks},
abstract={We experimentally validate the hypothesis that seemingly irregular eye movements during mental tasks reflect
                  shifts in spatial attention, in turn employed by executive attention to manage short-term memory
                  through a spatial registry system. We build upon recent findings of neuroimaging (Knops, 2009) and
                  patient studies (Koenigs, 2009) that suggest a crucial role for brain regions with visual-spatial
                  characteristics in abstract mnemonic tasks that require memory manipulation. To validate our
                  hypothesis, we recorded human gaze shifts during two mental tasks: either passively maintaining or
                  sorting sequences of five random digits. We manipulated spatial binding of items to visual locations
                  through different visual presentations of the stimuli; however, subjects were instructed and monitored
                  for performing the tasks in memory and in front of a blank screen. In our first experiment we primed
                  spatial binding of items along horizontal and vertical directions and saw when the task is sorting,
                  the difference in normalized distribution of gaze-shift directions (GSD) along the horizontal
                  direction, for horizontal relative to vertical priming, on average is +4.41\% +/- 1.51\% (mean +/- SE)
                  which is significant (t-test, n = 9, p < 0.0192). However, when the task is passive maintaining the
                  difference in GSDs along the on average is 1.3\% +/- 1.76\% (mean +/- SE), n.s. (t-test, n = 9, p >=
                  0.9288). In our second experiment we showed that reversing the order of stimuli for mental sorting,
                  leads to a horizontal symmetry in spatial distribution of gaze shifts amplitudes. We chose two
                  categories of 5 digits, identified by these canonical strings: 41230 and 03214. Exemplars for each
                  category were generated by using different digit values while preserving relative
                  ordering. Subtracting amplitude distributions of gaze-shifts leads to an antisymmetric distribution
                  measured by linear correlation of data point of two sides of distributions (t(−0.78) = −4.318, df =
                  12, p < 0.0005). Our findings establish a functional relationship between presumably amodal mnemonic
                  tasks and visual-spatial systems in human brain and might help explain the notable impact of executive
                  memory tasks on visual processing.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS11)},
year={2011},
month={May},
type={psy;eye},
review={abs/conf}
}


@article{Li_Itti11tip,
author={Z. Li and L. Itti},
title={Saliency and Gist Features for Target Detection in Satellite Images},
journal={IEEE Transactions on Image Processing},
abstract={Reliably detecting objects in broad-area overhead or satellite images has become an increasingly pressing
                  need, as the capabilities for image acquisition are growing rapidly.  The problem is particularly
                  difficult in the presence of large intraclass variability, e.g., finding 'boats' or 'buildings,'
                  where model-based approaches tend to fail because no good model or template can be defined for the
                  highly variable targets. This paper explores an automatic approach to detect and classify targets in
                  high-resolution broad-area satellite images, which relies on detecting statistical signatures of
                  targets, in terms of a set of biologically-inspired low-level visual features. Broad-area images are
                  cut into small image chips, analyzed in two complementary ways: 'attention/saliency' analysis
                  exploits local features and their interactions across space, while 'gist' analysis focuses on global
                  nonspatial features and their statistics. Both feature sets are used to classify each chip as
                  containing target(s) or not, using a support vector machine. Four experiments were performed to find
                  'boats' (Experiments 1 and 2), 'buildings' (Experiment 3) and 'airplanes' (Experiment 4). In
                  experiment 1, 14 416 image chips were randomly divided into training (300 boat, 300 non-boat) and
                  test sets (13 816), and classification was performed on the test set (ROC area: 0.977 +/- 0.003). In
                  experiment 2, classification was performed on another test set of 11 385 chips from another
                  broad-area image, keeping the same training set as in experiment 1 (ROC area: 0.952 +/- 0.006). In
                  experiment 3, 600 training chips (300 for each type) were randomly selected from 108 885 chips, and
                  classification was conducted (ROC area: 0.922 +/- 0.005). In experiment 4, 20 training chips (10 for
                  each type) were randomly selected to classify the remaining 2581 chips (ROC area: 0.976 +/-
                  0.003). The proposed algorithm outperformed the state-of-the-art SIFT, HMAX, and hidden-scale salient
                  structure methods, and previous gist-only features in all four experiments. This study shows that
                  the proposed target search method can reliably and effectively detect highly variable target objects
                  in large image datasets.},
year={2011},
volume={20},
number={7},
pages={2017-2029},
type={bu;mod;cv},
file={http://ilab.usc.edu/publications/doc/Li_Itti11tip.pdf},
if = {2009 impact factor: 2.848}
}

@article{Siagian_etal11jfr,
author = {C. Siagian and C.-K. Chang and R. Voorhies and L. Itti},
title = {Beobot 2.0: Cluster Architecture for Mobile Robotics},
journal = {Journal of Field Robotics},
volume = {28},
number = {2},
pages = {278-302},
month = {March/April},
year = {2011},
abstract = { With the recent proliferation of robust but computationally demanding robotic algorithms, there is now a need
                  for a mobile robot platform equipped with powerful computing facilities. In this paper, we present the
                  design and implementation of Beobot 2.0: an affordable research-level mobile robot equipped with a
                  cluster of sixteen 2.2GHz processing cores. Beobot 2.0 uses compact Computer on Module (COM)
                  processors with modest power requirements, thus accommodating various robot design constraints while
                  still satisfying the requirement for computationally intensive algorithms.  In the paper, we discuss
                  issues involved in utilizing multiple COM Express modules on a mobile platform such as inter-processor
                  communication, power consumption, cooling, and protection from shocks, vibrations, and other
                  environmental hazards such as dust and moisture. We have applied Beobot 2.0 to the following
                  computationally demanding tasks: laser-based robot navigation, SIFT object recognition, finding
                  objects in a cluttered scene using visual saliency, and vision-based localization, wherein the robot
                  has to identify landmarks from a large database of images in a timely manner.  For the last task, we
                  tested the localization system in three large-scale outdoor environments, which provide 3583, 6006,
                  and 8823 test frames, respectively. The localization errors for the three environments were 4.12ft,
                  7.81ft, and 13.40ft respectively. The per-frame processing times were 421.45ms, 794.31ms, and 884.74ms
                  respectively, representing speedup factors of 2.80, 3.00, and 3.58 when compared to a single dual-core
                  computer performing localization.  },
type = {mod;bu;sc;bb},
file = {http://ilab.usc.edu/publications/doc/Siagian_etal11jfr.pdf},
if = {2010 2-year impact factor: 3.580 }
}

@article{Baluch_Itti11tins,
author={F. Baluch and L. Itti},
title={Mechanisms of Top-Down Attention},
abstract={Attention exhibits characteristic neural signatures in brain regions that process sensory signals. An
                  important area of future research is to understand the nature of top-down signals that facilitate
                  attentional guidance towards behaviorally relevant locations and features. In this review, we discuss
                  recent studies that have made progress towards understanding: (i) the brain structures and circuits
                  involved in attentional allocation; (ii) top-down attention pathways, particularly as elucidated by
                  microstimulation and lesion studies; (iii) top-down modulatory influences involving subcortical
                  structures and reward systems; (iv) plausible substrates and embodiments of top-down signals; and (v)
                  information processing and theoretical constraints that might be helpful in guiding future
                  experiments. Understanding top-down attention is crucial for elucidating the mechanisms by which we
                  can filter sensory information to pay attention to the most behaviorally relevant events.},
journal={Trends in Neurosciences},
volume={34},
number={4},
month={March},
year={2011},
pages={210-224},
file={http://ilab.usc.edu/publications/doc/Baluch_Itti11tins.pdf},
type={td;bu;psy;mod;fmri;eye;phy},
note={Front cover, April 2011},
if={2010 impact factor: 13.320}
}

@invited{Itti11alamos,
author = {L. Itti},
title = {Bayesian modeling of bottom-up and top-down visual attention in natural environments},
booktitle = {Grand Challenges in Neural Computation, Los Alamos},
year = {2011},
month = {Feb},
type = {bu;td;mod}
}

@inproceedings{Borji_Itti11icra,
author={A. Borji and L. Itti},
title={Scene Classification with a Sparse Set of Salient Regions},
abstract={This work proposes an approach for scene classification by extracting and matching visual features only at the
                  focuses of visual attention instead of the entire scene. Analysis over a database of natural scenes
                  demonstrates that regions proposed by the saliency-based model of visual attention are robust to image
                  transformations. Using a nearest neighbor classifier and a distance measure defined over the salient
                  regions, we obtained 97.35\% and 78.28\% classification rates with SIFT and C2 features from the HMAX
                  model at 5 salient regions covering at most 31\% of the image. Classification with features extracted
                  from the entire image results in 99.3\% and 82.32\% using SIFT and C2 features,
                  respectively. Comparing attentional and adhoc approaches shows that classification rate of the first
                  approach is 0.95 of the second. Overall, our results prove that efficient scene classification, in
                  terms of reducing the complexity of feature extraction is possible without a significant drop in
                  performance.},
booktitle={Proc. IEEE International Conference on Robotics and Automation (ICRA)},
pages={1902-1908},
year={2011},
month={Feb},
review={full/conf},
type={bb;bu;td},
if = {2010 acceptance rate: 49\%}
}
% Disabled file link kept outside the entry: '%' is not a comment character inside a BibTeX entry.
%file={http://ilab.usc.edu/publications/doc/Borji_Itti11icra.pdf},

@invited{Itti11uci,
author = {L. Itti},
title = {Bayesian modeling of bottom-up and top-down visual attention in natural environments},
booktitle = {University of California, Irvine, Distinguished Lecture Series},
year = {2011},
month = {Jan},
type = {bu;td;mod}
}

@invited{Itti11hvei,
author={L. Itti},
title={Statistical modeling of surprise with applications to predicting attention and gaze},
booktitle={SPIE Human Vision and Electronic Imaging, Special session: attention and gaze in constructing the visual
                  world},
month={Jan},
year={2011},
type={bu;td;mod}
}

@article{Li_etal11ivc,
author = {Z. Li and S. Qin and L. Itti},
title = {Visual attention guided bit allocation in video compression},
journal = {Image and Vision Computing},
volume = {29},
number = {1},
pages = {1-14},
month = {Jan},
year = {2011},
type = {bu;cv;eye},
file = {http://ilab.usc.edu/publications/doc/Li_etal11ivc.pdf},
if = {2009 Impact Factor: 1.474}
}

@invited{Itti10nest,
author = {L. Itti},
title = {Neuromorphic visual system for intelligent unmanned sensors},
booktitle = {DARPA Neural Engineering, Science and Technology Forum, San Diego, CA},
year = {2010},
month = {Nov},
type = {bu;td;mod}
}

@inproceedings{Tseng_etal10SfN,
author = {P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
title = {Differentiating patients (ADHD, FASD, Parkinson's Disease) from
                  controls by gazing patterns},
booktitle = {Society for Neuroscience Annual Meeting (SfN10)},
month = {Nov},
year = {2010},
abstract = {Dysfunction in inhibitory control of attention was shown in
                  children with Attention Deficit Hyperactivity
                  Disorder (ADHD), Fetal Alcohol Spectrum Disorder
                  (FASD), and elderly with Parkinson's Disease
                  (PD). Previous studies explored the deficits in
                  top-down (goal oriented) and bottom-up (stimulus
                  driven) attention with a series of visual
                  tasks. This study investigates the difference in
                  attentional selection mechanism while patients
                  freely viewed natural scene videos without
                  performing specific tasks, and the difference is
                  utilized to develop classifiers to differentiate
                  patients from controls. These specially designed
                  videos are composed of short (2-4 seconds),
                  unrelated clips to reduce top-down expectation and
                  emphasize the difference in gaze allocation at every
                  scene change. Gaze of six groups of observers
                  (control children, ADHD children, FASD children,
                  control young adults, control elderly, and PD
                  elderly) were tracked while they watched the
                  videos. A computational saliency model computed
                  bottom-up saliency maps for each video
                  frame. Correlation between salience and gaze of each
                  population was computed and served as features for
                  classifiers. Leave-one-out was used to train and
                  test the classifiers. The classifier differentiates
                  ADHD, FASD, and control children with 72\% accuracy;
                  another classifier differentiates PD and control
                  elderly with 89\% accuracy. A feature selection
                  method was also used to identify the features that
                  differentiate the populations the most. This study
                  demonstrates that attentional selection mechanisms
                  are influenced by PD, ADHD, and FASD, and the
                  behavioral difference is captured by the correlation
                  between salience and gaze. Furthermore, this
                  task-free method shows promise toward future
                  screening tools. },
type = {psy;mod;med;eye},
review = {abs/conf},
file = {http://ilab.usc.edu/publications/doc/Tseng_etal10SfN.pdf}
}

@invited{Itti10duke,
author = {L. Itti},
title = {Bayesian modeling of bottom-up and top-down visual attention in natural environments},
booktitle = {Distinguished Lecture Series, Duke University, Durham, NC},
year = {2010},
month = {Oct},
type = {bu;td;mod}
}

@inproceedings{Borji_etal10iros,
author = {A. Borji and M. N. Ahmadabadi and B. N. Araabi},
title = {Simultaneous Learning of Spatial Visual Attention and Physical Actions},
booktitle = {Proc. IEEE/RSJ International Conference on Intelligent Robots and
Systems (IROS)},
month = {Oct},
year = {2010},
abstract = {This paper introduces a new method for learning top-down and 
		task-driven visual attention control along with physical 
		actions in interactive environments. Our method is based 
		on the Reinforcement Learning of Visual Classes(RLVC) 
		algorithm and adapts it for learning spatial visual selection 
		in order to reduce computational complexity. Proposed algorithm 
		also addresses aliasings due to not knowing previous actions and 
		perceptions. Continuing learning shows our method is robust to 
		perturbations in perceptual information. Our method also allows 
		object recognition when class labels are used instead of physical 
		actions. We have tried to gain maximum generalization while 
		performing local processing. Experiments over visual navigation 
		and object recognition tasks show that our method is more efficient 
		in terms of computational complexity and is biologically more plausible.},
type = {bb},
review = {full/conf},
if = {2010 acceptance rate: 58.2\%}
}
%file={http://ilab.usc.edu/publications/doc/Borji_etal10iros.pdf},

@inproceedings{Chang_etal10iros,
author = {C.-K. Chang and C. Siagian and L. Itti},
title = {Mobile Robot Vision Navigation \& Localization Using Gist and Saliency},
booktitle = {Proc. IEEE/RSJ International Conference on Intelligent Robots and
Systems (IROS)},
month = {Oct},
year = {2010},
note = {Both first authors contributed equally},
abstract = { We present a vision-based navigation and localization
                  system using two biologically-inspired scene
                  understanding models which are studied from human
                  visual capabilities: (1) Gist model which captures
                  the holistic characteristics and layout of an image
                  and (2) Saliency model which emulates the visual
                  attention of primates to identify conspicuous
                  regions in the image. Here the localization system
                  utilizes the gist features and salient regions to
                  accurately localize the robot, while the navigation
                  system uses the salient regions to perform visual
                  feedback control to direct its heading and go to a
                  user-provided goal location. We tested the system on
                  our robot, Beobot2.0, in an indoor and outdoor
                  environment with a route length of 36.67m (10,890
                  video frames) and 138.27m (28,971 frames),
                  respectively. On average, the robot is able to drive
                  within 3.68cm and 8.78cm (respectively) of the
                  center of the lane.  },
type = {bb},
review = {full/conf},
file = {http://ilab.usc.edu/publications/doc/Chang_etal10iros.pdf},
if = {2010 acceptance rate: 58.2\%}
}

@inproceedings{Yoshida_etal10jnss,
author = {M. Yoshida and L. Itti and D. J. Berg and T. Ikeda and R. Kato
                  and K. Takaura and T. Isa},
title = {Visually-guided eye movements based on color saliency in monkeys with unilateral lesion of primary visual cortex},
booktitle = {Proc. Japanese Neuroscience Society annual meeting, Kobe, Japan},
year = {2010},
month = {Sep},
type = {bu;td;mod},
review = {abs/conf}
}

@misc{Siagian_etal10ar,
title={Cover Photo of journal Autonomous Robots},
author={C. Siagian and C.-K. Chang and R. Voorhies and L. Itti},
month={Aug},
year={2010},
volume={29},
number={2},
pages={1},
abstract={The front cover shows Beobot 2.0 developed at iLab,
                  University of Southern California, USA. This
                  wheelchair-based robot is a mobile high performance
                  parallel computing platform equipped with a Beowulf
                  cluster of sixteen 2.2 GHz processing cores. The
                  robot features sensors including Laser Range Finder,
                  sonars, low-latency camera, Inertial Measurement
                  Unit (IMU), Global Positioning System (GPS), and a
                  compass. The robot is used to implement
                  computationally hungry, neuroscience-inspired
                  algorithms for visual attention, object recognition,
                  indoor/outdoor localization and navigation, and
                  cognitive visual scene analysis. Image used with
                  permission from iLab, University of Southern
                  California, USA.},
file={http://ilab.usc.edu/publications/doc/Siagian_etal10ar.pdf},
type={mod;bu;sc;bb},
if = {2008 impact factor: 1.235 }
}

@patent{Tseng_etal10patent,
author = { P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
title = { Eye-tracking method and system for screening human diseases},
organization = { University of Southern California and Queen's University},
month = { Aug },
year = 2010,
note = { Patent pending. Filed Aug 19, 2010, US 2010/0208205 A1},
type = { bu;mod;cv }
}

@incollection{Moser_etal10dcb,
title={Coordination in Brain Systems},
author={E. Moser and M. Corbetta and R. Desimone and Y. Fregnac and
                  P. Fries and A. Graybiel and J. D. Haynes and
                  L. Itti and L. Melloni and H. Monyer and W. Singer
                  and C. von der Malsburg and M. Wilson},
booktitle={Dynamic Coordination in the Brain: From Neurons to Mind -
                  Struengmann Forum Report, vol. 5.},
editor={C. von der Malsburg and W. A. Phillips and W. Singer},
year={2010},
month={Jul},
pages={193-214},
publisher={MIT Press},
address={Cambridge, MA},
type={mod;rev;td}
}

@invited{Itti10suvlpr,
author={L. Itti},
title={Full-Day Tutorial: Visual attention - Bottom-up, top-down and applications},
booktitle={Sino-USA Summer School on Vision, Learning and Pattern-Recognition, Xi'an, China},
month={Jul},
year={2010},
type={bu;td;mod}
}

@article{Elazary_Itti10vr,
author = {L. Elazary and L. Itti},
title = {A Bayesian model for efficient visual search and recognition},
journal = {Vision Research},
volume = {50},
number = {14},
pages = {1338-1352},
month = {Jun},
year = {2010},
abstract = {Humans employ interacting bottom-up and top-down processes
                  to significantly speed up search and recognition of
                  particular targets. We describe a new model of
                  attention guidance for efficient and scalable
                  first-stage search and recognition with many objects
                  (117,174 images of 1147 objects were tested, and 40
                  satellite images). Performance for recognition is on
                  par or better than SIFT and HMAX, while being,
                  respectively, 1500 and 279 times faster. The model
                  is also used for top-down guided search, finding a
                  desired object in a 5x5 search array within four
                  attempts, and improving performance for finding
                  houses in satellite images.},
type = {bu;td;eye},
file = {http://ilab.usc.edu/publications/doc/Elazary_Itti10vr.pdf},
if = {2008 impact factor: 2.051}
}

@article{Baldi_Itti10nn,
title={Of bits and wows: A Bayesian theory of surprise with applications to attention},
author={P. F. Baldi and L. Itti},
abstract={The amount of information contained in a piece of data can
                  be measured by the effect this data has on its
                  observer. Fundamentally, this effect is to transform
                  the observer's prior beliefs into posterior beliefs,
                  according to Bayes theorem. Thus the amount of
                  information can be measured in a natural way by the
                  distance (relative entropy) between the prior and
                  posterior distributions of the observer over the
                  available space of hypotheses. This facet of
                  information, termed ''surprise'', is important in
                  dynamic situations where beliefs change, in
                  particular during learning and adaptation. Surprise
                  can often be computed analytically, for instance in
                  the case of distributions from the exponential
                  family, or it can be numerically
                  approximated. During sequential Bayesian learning,
                  surprise decreases as the inverse of the number of
                  training examples. Theoretical properties of
                  surprise are discussed, in particular how it differs
                  and complements Shannon's definition of
                  information. A computer vision neural network
                  architecture is then presented capable of computing
                  surprise over images and video
                  stimuli. Hypothesizing that surprising data ought to
                  attract natural or artificial attention systems, the
                  output of this architecture is used in a
                  psychophysical experiment to analyze human eye
                  movements in the presence of natural video
                  stimuli. Surprise is found to yield robust
                  performance at predicting human gaze (ROC-like
                  ordinal dominance score of 0.7 compared to 0.8 for
                  human inter-observer repeatability, 0.6 for simpler
                  intensity contrast-based predictor, and 0.5 for
                  chance). The resulting theory of surprise is
                  applicable across different spatio-temporal scales,
                  modalities, and levels of abstraction.},
journal={Neural Networks},
volume={23},
number={5},
pages={649-666},
year={2010},
month={Jun},
type={su;eye},
file={http://ilab.usc.edu/publications/doc/Baldi_Itti10nn.pdf},
if = {2008 impact factor: 2.656}
}

@invited{Itti10ndn,
author = {L. Itti},
title = {Statistical modeling of surprise with applications to images and videos},
booktitle = {Computational modelling of attention and visual behaviour},
year = {2010},
month = {Jun},
type = {bu;td;mod}
}

@inproceedings{Yoshida_etal10assc,
author={M. Yoshida and L. Itti and D. J. Berg and T. Ikeda and R. Kato and K. Takaura and T. Isa},
title={Guidance of gaze based on color saliency in monkeys with blindsight},
booktitle={Proc. 14th annual meeting of the Association for the
Scientific Study of Consciousness (ASSC14), Toronto, Canada},
month={Jun},
year={2010},
type={bu;mod;sc;eye},
review={abs/conf}
}

@invited{Itti10cirm,
author = {L. Itti},
title = {Statistical modeling of surprise with applications to images and videos},
booktitle = {CIRM Workshop on Statistical Models for Images, Luminy, France},
year = {2010},
month = {May},
type = {bu;td;mod}
}

@inproceedings{Parks_etal10vss,
author = {D. F. Parks and A. Jain and J. McInerney and L. Itti},
title = {GPGPU-based real-time object detection and recognition system},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS10)},
month = {May},
year = {2010},
abstract = {Many neuroscience inspired vision algorithms have been
                  proposed over the past few decades.  However, it is
                  difficult to easily compare the various algorithms
                  that have been proposed by investigators.  Many are
                  very computationally intensive and are thus hard to
                  run at or near real time.  This makes it difficult
                  to rapidly compare different algorithms.  Further,
                  it makes it difficult to tweak existing algorithms
                  and to design new algorithms due to the training and
                  testing framework that must be constructed around
                  it.  With the advent of GPGPU computing significant
                  speedups on the order of 10-50 times are achievable
                  if the computations are intensive, local, and
                  massively parallel.  Many object recognition systems
                  fit this description, so the GPGPU provides an
                  attractive platform.  We describe an implemented
                  GPGPU-based system that uses saliency (Itti, Koch,
                  1998) to detect interesting regions of a scene, and
                  a generic backend that can run various object
                  recognition systems such as HMAX (Riesenhuber,
                  Poggio 1999) or SIFT (Lowe, 2004).  The less
                  intensive front end system only achieved a speed up
                  of 2x, but HMAX was sped up by 10x (Chikkerur,
                  2008).  We believe that this framework will allow
                  rapid testing and improvement of novel recognition
                  algorithms.  },
type = {mod;cv},
review = {abs/conf},
file = {http://ilab.usc.edu/publications/doc/Parks_etal10vss.pdf}
}

@inproceedings{Tseng_etal10vss,
author={P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
title={Differentiating Patients from Controls by Gazing Patterns},
abstract={Dysfunction in inhibitory control of attention was shown in
                  children with Attention Deficit Hyperactivity
                  Disorder (ADHD), Fetal Alcohol Spectrum Disorder
                  (FASD), and elderly with Parkinson's Disease
                  (PD). Previous studies explored the deficits in
                  top-down (goal oriented) and bottom-up (stimulus
                  driven) attention with a series of visual
                  tasks. This study investigates the difference in
                  attentional selection mechanism while patients
                  freely viewed natural scene videos without
                  performing specific tasks, and the difference is
                  utilized to develop classifiers to differentiate
                  patients from controls. These specially designed
                  videos are composed of short (2-4 seconds),
                  unrelated clips to reduce top-down expectation and
                  emphasize the difference in gaze allocation at every
                  scene change. Gaze of six groups of observers
                  (control children, ADHD children, FASD children,
                  control young adults, control elderly, and PD
                  elderly) were tracked while they watched the
                  videos. A computational saliency model computed
                  bottom-up saliency maps for each video
                  frame. Correlation between salience and gaze of each
                  population was computed and served as features for
                  classifiers. Leave-one-out was used to train and
                  test the classifiers. With eye traces of less than 4
                  minutes of videos, the classifier differentiates
                  ADHD, FASD, and control children with 84\% accuracy;
                  another classifier differentiates PD and control
                  elderly with 97\% accuracy. A feature selection
                  method was also used to identify the features that
                  differentiate the populations the most. Moreover,
                  videos with higher inter-observer variability in
                  gaze were more useful in differentiating
                  populations. This study demonstrates attentional
                  selection mechanisms are influenced by PD, ADHD, and
                  FASD, and the behavioral difference is captured by
                  the correlation between salience and
                  gaze. Furthermore, this task-free method shows
                  promise toward future screening tools. },
booktitle={Proc. Vision Science Society Annual Meeting (VSS10)},
year={2010},
month={May},
type={psy;mod;med;eye},
file={http://ilab.usc.edu/publications/doc/Tseng_etal10vss.pdf},
review={abs/conf}
}

@inproceedings{Voorhies_etal10vss,
  author    = {R. C. Voorhies and L. Elazary and L. Itti},
  title     = {Application of a Bottom-Up Visual Surprise Model for Event Detection in Dynamic Natural Scenes},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS10)},
  month     = {May},
  year      = {2010},
  abstract  = {We present an application of a neuromorphic visual attention
               model to the field of large-scale video surveillance
               and show that it outperforms a state-of-the-art
               method at the task of event detection. Our work
               extends Itti and Baldi's Surprise framework as
               described by 'A Principled Approach to Detecting
               Surprising Events in Video' in CVPR 2005. The
               Surprise framework is a biologically plausible and
               validated model of primate visual attention which
               uses a new Bayesian model of information to detect
               unexpected changes in feature detectors modeled
               after those in the mammalian primary visual cortex.
               We extend this model to cover extremely large fields
               of view, and present methods for processing and
               aggregating such large amounts of visual data.  Our
               system is tested on real-world data in which events
               containing both pedestrians and vehicles are staged
               in an outdoor environment and are shot on a 16
               mega-pixel camera at 3 frames per second. In these
               tests, we show that our system is able to provide a
               greater than 12.5\% gain in an ROC AUC analysis over
               a reference (OpenCV) algorithm ('Foreground Object
               Detection from Videos Containing Complex
               Background,' Li, et al, 2003). Furthermore, our
               system is rigorously tested and compared against the
               same algorithm on artificially generated target
               events in which image noise and target size is
               independently controlled. In these tests, we show an
               approximately 27\% improvement in noise invariance,
               and an approximately 10\% improvement in scale
               invariance over the comparison algorithm. The
               results from these tests suggest the importance of
               strong collaboration between the neuroscience and
               computer science communities in developing the next
               generation of vision algorithms.},
  type      = {bu;mod;cv;su},
  review    = {abs/conf}
}


@inproceedings{Baluch_Itti10vss,
author={F. Baluch and L. Itti},
title={The effects of 2nd-order feature interactions in predicting human gaze},
abstract={How do features interact to guide human gaze? Models of
                  human attention have assumed a linear combination of
                  features to construct a final saliency map that
                  drives human overt/covert attention. We asked what
                  role, if any, do second order feature interactions
                  play in attracting attention.  We examined the eye
                  movements of 8 subjects while they watched videos
                  containing natural and synthetic scenes. A set of
                  five low-level feature channels including Color (C),
                  Intensity (I), Orientation (O), Flicker (F) and
                  Motion (M) using center-surround differences were
                  computed for each of 46,489 video frames shown to
                  the subjects. A total of 11,430 saccades were
                  analyzed. We compared 4 models including i) simple
                  unweighted sum of 1st-order terms, i.e., C, I, O, F,
                  M; ii) a weighted linear sum of 1st-order terms;
                  iii) unweighted sum of 1st-order and all 2nd-order
                  multiplicative feature interaction terms (e.g., CC,
                  CI, CO, CF, CM, etc.); and iv) a weighted linear
                  combination of 1st and 2nd order terms. For the
                  weighted combinations, the weights were learned
                  using a genetic algorithm (GA) that optimizes a cost
                  function defined as the difference between the
                  distribution of human saccade end points and salient
                  locations as computed by the respective models. The
                  optimal solution was found from a large search space
                  of size 2^20 for the model incorporating 1st order
                  terms and 2^80 for the model incorporating both 1st
                  and 2nd order terms respectively.  We found that the
                  optimized 1st order model performed significantly
                  better than all other models (p<0.05). Further we
                  found that models using 2nd-order interactions did
                  not improve the predictive power of a model in
                  explaining eye movements of human subjects.  },
booktitle={Proc. Vision Science Society Annual Meeting (VSS10)},
year={2010},
month={May},
type={bu;td;psy;eye},
review={abs/conf},
file={http://ilab.usc.edu/publications/doc/Baluch_Itti10vss.pdf}
}

@article{Fallani_etal10ijbc,
  title={Functional networks from EEG signals during motor learning tasks},
  author={F. D. E. V. Fallani and F. Baluch and L. Astolfi and
                  D. Subramanian and G. Zouridakis and F. Babiloni},
  abstract={The evaluation of the topological properties of brain
                  networks is an emerging research topic, since the
                  estimated cerebral connectivity patterns often have
                  relatively large size and complex structure. Since a
                  graph is a mathematical representation of a network,
                  the use of a theoretical graph approach would
                  describe concisely the topological features of the
                  functional network estimated from neuroimaging
                  techniques. In particular, by applying the process
                  of coherence analysis to high-density EEG
                  recordings, rich visualizations can be developed
                  that provide a means for spatiotemporal analysis of
                  changes in synchronous brain activity. In the
                  present work, we studied the changes in brain
                  synchronization networks during performance of a
                  complex visuomotor task with strategic components in
                  normal subjects. In particular, we evaluated the
                  differences in the functional network topology
                  associated with human learning by calculating global
                  and local efficiency indexes. Our results suggest
                  that during an initial period of learning, which is
                  probably related to the most significant cognitive
                  processes, the particular organization of functional
                  links in the alpha frequency band (8-12 Hz) tends to
                  increase the efficiency of communication within the
                  cerebral network. Such evidence could be interpreted
                  as due to the need for a new strategy
                  formulation. Overall, this approach enabled us to
                  capture a shift in topology made during the process
                  of learning and thus helped us to shed more light on
                  the neural correlates of strategy formulation. Our
                  findings provide strong support for the efficacy of
                  theoretical graph analysis to study complex brain
                  networks. },
  journal={International Journal of Bifurcation and Chaos},
  volume={20},
  number={3},
  pages={905-912},
  month={Mar},
  year={2010},
  type={psy},
  if={}
}

@inproceedings{Elazary_Itti10vss,
  author    = {L. Elazary and L. Itti},
  title     = {Framework and implementation for perception},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS10)},
  month     = {May},
  year      = {2010},
  abstract  = {A biologically-inspired framework for perception is proposed
               and implemented, which helps guide the systematic
               development of machine vision algorithms and
               methods. The core is a hierarchical Bayesian
               inference system. Hypotheses about objects in a
               visual scene are generated 'bottom-up' from sensor
               data. These hypotheses are refined and validated
               'top-down' when complex objects, hypothesized at
               higher levels, impose new feature and location
               priors on the component parts of these objects at
               lower levels. To efficiently implement the
               framework, an important new contribution is to
               systematically utilize the concept of bottom-up
               saliency maps to narrow down the space of
               hypotheses. In addition, we let the system
               hallucinate top-down (manufacture its own data) at
               low levels given high-level hypotheses, to overcome
               missing data, ambiguities and noise. The implemented
               system is tested against images of real scenes
               containing simple 2D objects against various
               backgrounds. The system correctly recognizes the
               objects in 98.71\% of 621 video frames, as compared
               to SIFT which achieves 38.00\%.},
  type      = {bu;td;mod;cv;sc},
  review    = {abs/conf},
  file      = {http://ilab.usc.edu/publications/doc/Elazary_Itti10vss.pdf}
}

@inproceedings{Shen_Itti10vss,
author={J. Shen and L. Itti},
title={Gender Differences in Visual Attention During Listening as Measured By Neuromorphic Saliency: What Women (and
                  Men) Watch},
abstract={Predictive models of eye movements often do not address population differences.  Different tasks may play an
                  important role in differentiating eye movements among discrete groups.  For example, eye movement
                  behavior is known to vary by gender for an emotion-perception task (Vassallo, 2009).  We explore
                  observed differences in eye movements between genders by eye-tracking subjects during an audio-visual
                  listening task, as compared to a free-viewing task.  Thirty-four subjects, balanced by gender, are
                  eye-tracked while watching eighty-five videos of different people who give answers to conversational
                  questions.  Videos are filmed outdoors with a natural background of distractors, such as pedestrians
                  and vehicles. After viewing each clip, subjects answer questions about the video to measure any
                  attentional differences. To control for task effects, a separate group of ten control subjects are
                  asked to free view the clips. Interestingly, the main sequence of collected saccades significantly
                  differs across gender (n=33806, peak velocity: p<1e-15, amplitude: p=0.0076). Saccade sequences are
                  scored by examining the values of the saliency model output of the corresponding video (Itti, 2004) at
                  saccade endpoints.  Correlation to saliency is measured by comparing saccade scores to randomly
                  sampled saliency scores with an AUC (area under the curve) metric.  Saccades are also scored for their
                  correlation to the component features of saliency (color, orientation, intensity, flicker, and motion)
                  in a similar manner. We also find that correlations to saliency are significantly greater for male
                  viewers over female (p<1e-143) and are also significantly greater for female speakers (p<1e-143).
                  Furthermore, there is a two-way interaction on saliency correlations between the gender of the viewer
                  and speaker (2-way ANOVA, df=1, F=15123.48).  Gender differences persist across all features,
                  suggesting a broad gender difference in attentional allocation during listening.  We also investigate
                  the interplay of gender and saliency with fixations to the viewer's eyes, face, and background
                  objects.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS10)},
year={2010},
month={May},
type={bu;td;mod;eye},
file={http://ilab.usc.edu/publications/doc/Shen_Itti10vss.pdf},
review={abs/conf}
}

@inproceedings{Chang_etal10vss,
author={C.-K. Chang and C. Siagian and L. Itti},
title={Hardware and software computing architecture for robotics applications of neuroscience-inspired vision and navigation algorithms},
abstract={Biologically-inspired vision algorithms have thus far not
                  been widely applied to real-time robotics because of
                  their intensive computation requirements. We present
                  a biologically-inspired visual navigation and
                  localization system which is implemented in
                  real-time using a cloud computing framework. We
                  create a visual computation architecture on a
                  compact wheelchair-based mobile platform. Our work
                  involves both a new design of cluster computer
                  hardware and software for real-time vision.  The
                  vision hardware consists of two custom-built carrier
                  boards that host eight computer modules (16
                  processor cores total) connected to a camera. For
                  all the nodes to communicate with each other, we use
                  ICE (Internet Communication Engine) protocol which
                  allows us to share images and other intermediate
                  information such as saliency maps (Itti and Koch
                  2001), and scene 'gist' features (Siagian and Itti
                  2007). The gist features, which coarsely encode the
                  layout of the scene, are used to quickly identify
                  the general whereabouts of the robot in a map, while
                  the more accurate but time consuming salient
                  landmark recognition is used to pin-point its
                  location to the coordinate level.  Here we extend
                  the system to also be able to navigate in its
                  environment (indoors and outdoors) using these same
                  features. That is, the robot has to identify the
                  direction of the road, use it to compute movement
                  commands, perform visual feedback control to ensure
                  safe driving over time.  We utilize four out of
                  eight computers for localization (salient landmark
                  recognition system) while the remainder are used to
                  compute navigation strategy. As a result, the
                  overall system performs all these computing tasks
                  simultaneously in real-time at 10 frames per
                  second. In short, with the new design and
                  implementation of the highly-capable vision
                  platform, we are able to apply computationally
                  complex biologically-inspired vision algorithms on
                  the mobile robot.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS10)},
year={2010},
month={May},
type={bu;td;mod;eye},
file={http://ilab.usc.edu/publications/doc/Chang_etal10vss.pdf},
review={abs/conf}
}

@invited{Itti10nvis,
  author    = {L. Itti},
  title     = {Biologically-inspired vision and attention for cognitive robots},
  booktitle = {Workshop on hybrid neuro-computer vision systems, Columbia University, NY},
  year      = {2010},
  month     = {Apr},
  type      = {bu;td;mod}
}

@invited{Itti10usc,
  author    = {L. Itti},
  title     = {Computational models of visual attention and gaze in primates},
  booktitle = {University of Southern California Annual Vision Symposium, Los Angeles, CA},
  year      = {2010},
  month     = {Apr},
  type      = {bu;td;mod}
}

@invited{Itti10dag,
  author    = {L. Itti},
  title     = {Biologically-inspired vision and attention for cognitive robots},
  booktitle = {Schloss Dagstuhl Seminar Series, Seminar 10081 on Cognitive Robotics, Germany},
  year      = {2010},
  month     = {Feb},
  type      = {bu;td;mod}
}

@invited{Itti10aach,
author={L. Itti},
title={Modeling bottom-up and top-down visual attention in humans and monkeys},
booktitle={Invited seminar, Aachen University, Germany},
month={Feb},
year={2010},
type={bu;td;mod}
}

@article{Baluch_Itti10pone,
author={F. Baluch and L. Itti},
title={Training top-down attention improves performance on a triple conjunction search task},
journal={PLoS One},
month={Feb},
year={2010},
volume={5},
number={2},
pages={e9127},
abstract={Training has been shown to improve perceptual performance on
                  limited sets of stimuli. However, whether training
                  can generally improve top-down biasing of visual
                  search in a target-nonspecific manner remains
                  unknown. We trained subjects over 10 days on a
                  visual search task, challenging them with a novel
                  target (top-down goal) on every trial, while
                  bottom-up uncertainty (distribution of distractors)
                  remained constant. Subjects became experts at this
                  task, with performance increased two-fold, decreased
                  fixation duration, and stronger tendency to guide
                  gaze towards items with color and spatial frequency
                  (but not necessarily orientation) that resembled the
                  target, suggesting improved general top-down biasing
                  of search.},
type={mod;bu;psy;eye},
file={http://ilab.usc.edu/publications/doc/Baluch_Itti10pone.pdf},
if = {2009 impact factor: 4.351}
}

@invited{Itti10ucla,
  author    = {L. Itti},
  title     = {Exploiting bottom-up and top-down information in a Bayesian framework for vision},
  booktitle = {Mathematics Dept. Seminar, UCLA, Westwood, CA},
  year      = {2010},
  month     = {Feb},
  type      = {bu;td;mod}
}

@press{Ricketts10ss,
  author    = {S. Ricketts},
  title     = {Today's Tech, Tomorrow's Data},
  journal   = {Star Trek Nemesis (Blu-Ray) Bonus Feature},
  year      = {2010},
  month     = {Feb},
  type      = {bu;eye;mod;bb}
}

@article{Parikh_etal10jne,
  author    = {N. Parikh and L. Itti and J. Weiland},
  title     = {Saliency-based image processing for retinal prostheses},
  journal   = {Journal of Neural Engineering},
  volume    = {7},
  pages     = {1-10},
  month     = {Jan},
  year      = {2010},
  abstract  = {We present a computationally efficient model for detecting
               salient regions in an image frame.  The model when
               implemented on a portable, wearable system can be
               used in conjunction with a retinal prosthesis, to
               identify important objects that a retinal prosthesis
               patient may not be able to see due to implant
               limitations. The model is based on an earlier
               saliency detection model but has a reduced number of
               parallel streams. Results of a comparison between
               the areas detected as salient by the algorithm and
               areas gazed at by human subjects in a set of images
               show a correspondence which is greater than what
               would be expected by chance.  Initial results for a
               comparison of the execution speed of the two
               algorithm models for each frame on the TMS320 DM642
               Texas Instruments Digital Signal Processor suggest
               that the proposed model is approximately ten times
               faster than the original saliency model. },
  type      = {mod;bu;med},
  if        = {2008 Impact Factor: 2.737},
  file      = {http://ilab.usc.edu/publications/doc/Parikh_etal10jne.pdf}
}

@invited{Itti09ucsd,
  author    = {L. Itti},
  title     = {Computational models of attention and eye movements in natural scenes},
  booktitle = {Cognitive Science 200: Visual salience: neurophysiology and
               models, invited lecture, UCSD, La Jolla, CA},
  year      = {2009},
  month     = {Nov},
  type      = {bu;td;mod}
}

@inproceedings{Boehnke_etal09sfn,
author    = {S. E. Boehnke and D. J. Berg and K. Kaneda and Y. Yanagawa
                  and L. Itti and T. Isa and D. P. Munoz},
title     = {Temporal characteristics and mechanisms of visual
                  adaptation in the superior colliculus},
abstract  = {Repeated visual stimulation leads to reduced sensory
                  responsiveness (adaptation) in neurons across the
                  visual system. Adaptation of visual responses also
                  occurs in superior colliculus neurons following
                  repeated stimulation, and it has the behavioral
                  consequence of delayed visual orienting. It is
                  currently unknown which temporal characteristics of
                  repeating stimuli affect the degree of adaptation in
                  SC visual neurons (stimulus duration, onset-onset
                  time or offset-onset time) and it remains unclear if
                  adaptation in the SC is the result of an intrinsic
                  mechanism or simply inherited through adapted
                  inputs. We addressed these issues by comparing
                  adaptation of SC superficial (SCS) neurons in
                  behaving monkeys and in the mouse slice preparation,
                  following sequences of four repetitive visual
                  stimuli (flashes and electrical microstimulation of
                  visual afferents to the SCS, respectively) that
                  varied in several temporal parameters. The degree of
                  adaptation across temporal configurations was
                  remarkably similar between awake monkey neurons and
                  mouse slice neurons. Neurons recorded from the SCS
                  layers in both monkey and slice responded
                  transiently or in a sustained fashion, with
                  transient responders showing greater adaptation than
                  sustained responders. Adaptation was generally
                  characterized by a decrease in onset firing rate and
                  an increase in visual response onset latency. The
                  amount of adaptation was dependent to varying
                  degrees on both the duration of the stimulus and the
                  time between stimuli (longer recovery time reducing
                  adaptation). Although these factors interacted in a
                  complex way, a simple Bayesian model (Surprise)
                  which takes stimulus history at several time scales
                  into account could well explain the patterns of
                  adaptation observed across temporal
                  configurations. In non-GABAergic neurons, identified
                  by lack of GFP fluorescence in GAD67-GFP knock-in
                  mice, adaptation was examined before and after bath
                  application of GABAB blocker (CGP52432). GABAB
                  blockage significantly reduced the amount of
                  adaptation observed (p<0.05), suggesting that at
                  least part of the adaptation is caused by an
                  intrinsic mechanism in the SCS circuitry. These data
                  are important for understanding visual temporal
                  processing in general, and guiding stimulus design
                  and interpretation of results from studies in
                  visual/cognitive neuroscience in which sequences of
                  stimuli are presented.},
month     = {Oct},
year      = {2009},
booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'09)},
type      = {mod;phy;su},
review    = {abs/conf}
}

@inproceedings{Itti_etal09jnss,
  author    = {L. Itti and M. Yoshida and D. J. Berg and T. Ikeda and R. Kato
               and K. Takaura and T. Isa},
  title     = {Saliency-based guidance of eye movements in monkeys with
               unilateral lesion of primary visual cortex},
  booktitle = {Proc. Japanese Neuroscience Society annual meeting, Nagoya, Japan},
  year      = {2009},
  month     = {Sep},
  type      = {bu;td;mod},
  review    = {abs/conf}
}


@article{White_etal09jn,
author={B. J. White and S. E. Boehnke and R. A. Marino and L. Itti and D. P. Munoz},
title={Color-related Signals in the Primate Superior Colliculus},
abstract={Color is important for segmenting objects from backgrounds,
                  which can in turn facilitate visual search in
                  complex scenes. However, brain areas involved in
                  orienting the eyes toward colored stimuli in our
                  environment are not believed to have access to color
                  information. Here, we show that neurons in the
                  intermediate layers of the monkey superior
                  colliculus (SC), a critical structure for the
                  production of saccadic eye movements, can respond to
                  isoluminant color stimuli with the same magnitude as
                  a maximum contrast luminance stimulus. In contrast,
                  neurons from the superficial SC layers showed little
                  color-related activity. Crucially, visual onset
                  latencies were 30-35ms longer for color, implying
                  that luminance and chrominance information reach the
                  SC through distinct pathways and that the observed
                  color-related activity is not the result of residual
                  luminance signals. Furthermore, these differences in
                  visual onset latency translated directly into
                  differences in saccadic reaction time. The results
                  demonstrate that the saccadic system can signal the
                  presence of chromatic stimuli only one stage from
                  the brainstem premotor circuitry that drives the
                  eyes.},
journal={Journal of Neuroscience},
year={2009},
month={Sep},
volume={29},
number={39},
pages={12159-12166},
if={2008 Impact Factor: 7.452},
file={http://ilab.usc.edu/publications/doc/White_etal09jn.pdf},
type={phy;psy}
}

@invited{Berg_Itti09ecem,
  author    = {D. J. Berg and L. Itti},
  title     = {Modeling bottom-up and top-down guidance of eye movements in humans and monkeys},
  booktitle = {European Conference on Eye Movements (ECEM), Southampton, England},
  year      = {2009},
  month     = {Aug},
  abstract  = {Active visual processing of complex natural environments
               requires animals to combine, in a highly dynamic and
               adaptive manner, sensory signals that originate from
               the environment (bottom-up) with behavioral goals
               and priorities dictated by the task at hand
               (top-down). Together, bottom-up and top-down
               influences combine to serve the many tasks which
               require that we direct attention to the most
               ''relevant'' entities in our visual
               environment. While much progress has been made in
               investigating experimentally how humans and other
               primates may operate such goal-based attentional
               selection, very little is understood of the general
               mathematical principles and neuro-computational
               architectures that subserve the observed behavior. I
               will describe recent computational work which
               attacks the problem of developing models of visual
               attentional selection and eye movement programming
               that are more flexible and can be strongly modulated
               by the task at hand. I will back the proposed
               architectures up by comparing their predictions to
               behavioral recordings from humans and monkeys. I
               will show examples of applications of these models
               to real-world vision challenges, using complex
               stimuli from television programs or modern immersive
               video games.},
  type      = {bu;td;mod}
}

@inproceedings{Voorhies_etal09iros,
author = {R. C. Voorhies and C. Siagian and L. Elazary and L. Itti},
title = {Centralized Server Environment for Educational Robotics},
booktitle = {Proc. IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
month = {Oct},
year = {2009},
review = {full/conf},
type = {bb},
file = {http://ilab.usc.edu/publications/doc/Voorhies_etal09iros.pdf},
if = {2009 acceptance rate: 54.5\%},
abstract = {One of the main challenges when creating an undergraduate
  introduction to robotics course is connecting the theory taught in the
  lectures with the current practices of research. The primary cause of
  this difficulty is an inability to find a hardware solution that is
  powerful enough to run complex cutting-edge algorithms yet inexpensive
  enough to be purchased by an undergraduate class budget. An ideal system
  needs to have a gentle learning curve to allow students with minimal
  background in the field to get a robot up and running. Lastly, a fleet of
  classroom robots needs to be easy to administrate and maintain given the
  limited time of a Teaching Assistant. Our approach is to implement a
  centralized server system. In this system individual robots are
  inexpensive yet capable of establishing a WiFi link to a main server so
  that all the compilation and system administration, as well as much of
  the computationally intensive processing, are done on that server.  We
  find that this solution saves both time and money and provides an
  effective teaching tool. This paper describes the hardware and software
  architecture of the system, and example applications implemented by
  undergraduate students. }
}

@invited{Itti09stru,
author = {L. Itti},
title = {Untitled},
booktitle = {Ernst Struengmann forum on Dynamic Coordination in the Brain: From Neurons to Mind, Frankfurt, Germany},
year = {2009},
month = {Aug},
type = {bu;td;mod}
}

@article{Siagian_Itti09tro,
title={Biologically Inspired Mobile Robot Vision Localization},
author={C. Siagian and L. Itti},
journal={IEEE Transactions on Robotics},
month={July},
year={2009},
volume={25},
number={4},
pages={861-873},
abstract={We present a robot localization system using biologically
                  inspired vision. Our system models two extensively
                  studied human visual capabilities: 1) extracting the
                  'gist' of a scene to produce a coarse localization
                  hypothesis and 2) refining it by locating salient
                  landmark points in the scene. Gist is computed here
                  as a holistic statistical signature of the image,
                  thereby yielding abstract scene classification and
                  layout. Saliency is computed as a measure of
                  interest at every image location, which efficiently
                  directs the time-consuming landmark-identification
                  process toward the most likely candidate locations
                  in the image. The gist features and salient regions
                  are then further processed using a Monte Carlo
                  localization algorithm to allow the robot to
                  generate its position. We test the system in three
                  different outdoor environments - building complex
                  (38.4m x 54.86m area, 13,966 testing images),
                  vegetation-filled park (82.3m x 109.73 m area,
                  26,397 testing images), and open field park (137.16m
                  x 178.31m area, 34,711 testing images) - each with
                  its own challenges. The system is able to localize,
                  on average, within 0.98, 2.63, and 3.46 m,
                  respectively, even with multiple kidnapped-robot
                  instances.  },
file={http://ilab.usc.edu/publications/doc/Siagian_Itti09tro.pdf},
type={mod;bu;sc;bb},
if = {2008 impact factor: 2.656 }
}

@article{Itti_etal09nmc,
title={Dopamine transporter imaging under high-dose transdermal
                  nicotine therapy in Parkinson's disease: an
                  observational study},
author={E. Itti and G. Villafane and Z. Malek and P. Brugieres and
                  D. Capacchione and L. Itti and P. Maison and
                  P. Cesaro and M. Meignan},
abstract={OBJECTIVES: Nicotine therapy might improve the course of
                  Parkinson's disease. This observational study
                  evaluated the performance of dopamine transporter
                  imaging in follow-up patients under nicotine
                  therapy. METHODS: Six Hoehn and Yahr stage III
                  patients underwent 123I-FP-CIT imaging prior to, 3
                  months, and 1 year after the onset of nicotine
                  therapy. Nicotine was administered transdermally
                  with increasing daily doses during 3 months (up to
                  105 mg/day) and decreased progressively. On
                  co-registered magnetic resonance imaging, striatal
                  regions of interest were drawn and binding
                  potentials of 123I-FP-CIT were calculated. Changes in
                  Unified Parkinson's Disease Rating Scale-III over
                  time were compared with binding potentials using
                  regression analysis. RESULTS: All patients improved
                  motor scores at 3 months (-65 +/- 22\% 'off', -89
                  +/- 12\% 'on') and most received fewer dopaminergic
                  drugs (-30\% dosage in average). Motor improvement
                  persisted to a lesser extent at 1 year (-39 +/- 31\%
                  'off', -13 +/- 43\% 'on'), partly because one
                  patient stopped the treatment. Interestingly, the
                  decrease in binding potentials (-4.0 +/- 10.5\%) was
                  slower than that expected in Parkinsonian patients
                  (usually -10\% per year) and was inversely
                  correlated with Unified Parkinson's Disease Rating
                  Scale-III improvement, r= 0.83 'off' and 0.91
                  'on'. CONCLUSION: This observational study
                  emphasizes a potential effect of nicotine therapy on
                  striatal dopamine transporter density, which may be
                  interpreted as direct pharmacological effect or
                  deceleration of neuronal loss.},
journal={Nuclear Medicine Communications},
volume={30},
number={7},
pages={513-518},
month={Jul},
year={2009},
type={med;mip},
if={2008 impact factor: 1.706 }
}
%file={http://ilab.usc.edu/publications/doc/Itti_etal09nmc.pdf},

@article{Tseng_etal09jov,
title={Quantifying center bias of observers in free viewing of dynamic
                  natural scenes},
author={P. Tseng and R. Carmi and I. G. M. Cameron and D. P. Munoz and
                  L. Itti},
abstract={Human eye-tracking studies have shown that gaze fixations
                  are biased toward the center of natural scene
                  stimuli ('center bias'). This bias contaminates the
                  evaluation of computational models of attention and
                  oculomotor behavior. Here we recorded eye movements
                  from 17 participants watching 40 MTV-style video
                  clips (with abrupt scene changes every 2-4 s), to
                  quantify the relative contributions of five causes
                  of center bias: photographer bias, motor bias,
                  viewing strategy, orbital reserve, and screen
                  center. Photographer bias was evaluated by five
                  naive human raters and correlated with eye
                  movements. The frequently changing scenes in
                  MTV-style videos allowed us to assess how motor bias
                  and viewing strategy affected center bias across
                  time. In an additional experiment with 5
                  participants, videos were displayed at different
                  locations within a large screen to investigate the
                  influences of orbital reserve and screen center. Our
                  results demonstrate quantitatively for the first
                  time that center bias is correlated strongly with
                  photographer bias and is influenced by viewing
                  strategy at scene onset, while orbital reserve,
                  screen center, and motor bias contribute
                  minimally. We discuss methods to account for these
                  influences to better assess computational models of
                  visual attention and gaze using natural scene
                  stimuli.},
journal={Journal of Vision},
volume={9},
number={7:4},
pages={1-16},
year={2009},
month={July},
type={bu;td;sc;eye;mod;psy},
file={http://ilab.usc.edu/publications/doc/Tseng_etal09jov.pdf},
if = {2007 impact factor: 3.791}
}

@article{Mundhenk_etal09vr,
author = {T. N. Mundhenk and W. Einhaeuser and L. Itti},
title = {Automatic Computation of an Image's Statistical Surprise Predicts
  Performance of Human Observers on a Natural Image Detection Task},
journal = {Vision Research},
volume = {49},
number = {13},
pages = {1620-1637},
month = {Jun},
year = {2009},
type = {bu;su;mod},
file = {http://ilab.usc.edu/publications/doc/Mundhenk_etal09vr.pdf},
if = {2007 impact factor: 2.055},
abstract = {To understand the neural mechanisms underlying humans'
  exquisite ability at processing briefly flashed visual scenes, we present
  a computer model that predicts human performance in a Rapid Serial Visual
  Presentation (RSVP) task. The model processes streams of natural scene
  images presented at a rate of 20Hz to human observers, and attempts to
  predict when subjects will correctly detect if one of the presented
  images contains an animal (target). We find that metrics of Bayesian
  surprise, which models both spatial and temporal aspects of human
  attention, differ significantly between RSVP sequences on which subjects
  will detect the target (easy) and those on which subjects miss the target
  (hard). Extending beyond previous studies, we here assess the
  contribution of individual image features including color opponencies and
  Gabor edges. We also investigate the effects of the spatial location of
  surprise in the visual field, rather than only using a single aggregate
  measure. A physiologically plausible feed-forward system, which optimally
  combines spatial and temporal surprise metrics for all features, predicts
  performance in 79.5\% of human trials correctly. This is significantly
  better than a baseline maximum likelihood Bayesian model (71.7\%). We can
  see that attention as measured by surprise, accounts for a large
  proportion of observer performance in RSVP. The time course of surprise
  in different feature types (channels) provides additional quantitative
  insight in rapid bottom-up processes of human visual attention and
  recognition, and illuminates the phenomenon of attentional blink and
  lag-1 sparing. Surprise also reveals classical Type-B like masking
  effects intrinsic in natural image RSVP sequences. We summarize these
  with the discussion of a multistage model of visual attention.}
}

@inproceedings{Berg_etal09crcns,
title = {Free viewing of dynamic stimuli by humans and monkeys},
author = {D. J. Berg and S. E. Boehnke and R. A. Marino and D. P. Munoz and L. Itti },
booktitle = {Collaborative Research in Computational Neuroscience Annual Meeting, Pittsburgh, PA},
year = {2009},
month = {Jun},
review = {abs/wkshp},
type = {td;mod;psy}
}

@article{Berg_etal09jov,
title={Free viewing of dynamic stimuli by humans and monkeys},
author={D. J. Berg and S. E. Boehnke and R. A. Marino and D. P. Munoz and L. Itti},
abstract={Due to extensive homologies, monkeys provide a sophisticated
                  animal model of human visual attention. However, for
                  electrophysiological recording in behaving animals
                  simplified stimuli and controlled eye position are
                  traditionally used. To validate monkeys as a model
                  for human attention during realistic free viewing,
                  we contrasted human (n = 5) and monkey (n = 5) gaze
                  behavior using 115 natural and artificial video
                  clips. Monkeys exhibited broader ranges of saccadic
                  endpoints and amplitudes and showed differences in
                  fixation and intersaccadic intervals. We compared
                  tendencies of both species to gaze toward scene
                  elements with similar low-level visual attributes
                  using two computational models: luminance contrast
                  and saliency. Saliency was more predictive of both
                  human and monkey gaze, predicting human saccades
                  better than monkey saccades overall. Quantifying
                  interobserver gaze consistency revealed that while
                  humans were highly consistent, monkeys were more
                  heterogeneous and were best predicted by the
                  saliency model. To address these discrepancies, we
                  further analyzed high-interest gaze targets - those
                  locations simultaneously chosen by at least two
                  monkeys. These were on average very similar to human
                  gaze targets, both in terms of specific locations
                  and saliency values. Although substantial
                  quantitative differences were revealed, strong
                  similarities existed between both species,
                  especially when focusing analysis onto high-interest
                  targets.},
journal={Journal of Vision},
volume={9},
number={5:19},
pages={1-15},
year={2009},
month={May},
type={bu;sc;eye;su},
file={http://ilab.usc.edu/publications/doc/Berg_etal09jov.pdf},
if = {2007 impact factor: 3.791}
}

@invited{Itti09nd,
title = {Computational models of vision and scene understanding in humans and monkeys},
author = {L. Itti},
booktitle = {USC Neurodinner, Los Angeles, CA},
year = {2009},
month = {May},
type = {bu;td;mod}
}

@inproceedings{Tseng_etal09vss,
author = {P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
title = {Screening Attentional-related Diseases based on Correlation between Salience and Gaze},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS09)},
month = {May},
year = {2009},
review = {abs/conf},
type = {bu;td;mod;psy;med},
abstract = {Several studies have shown that eye movements and certain
  complex visual functions are influenced by diseases such as Parkinson's
  Disease (PD) and Attention Deficit Hyperactivity Disorder (ADHD). Here we
  examine how bottom-up (stimulus-driven) attentional selection mechanisms
  may differ between patient and control populations, and we take advantage
  of the difference to develop classifiers to differentiate patients from
  controls. We tracked gaze of four groups of observers (15 control
  children, aged 7-14; 6 ADHD children, aged 9-15; 12 control elderly, aged
  66-79; and 9 PD elderly, aged 53-68) while they freely viewed MTV-style
  videos. These stimuli are composed of short (2-4 seconds), unrelated
  clips of natural scenes to reduce top-down (contextual) expectations and
  emphasize bottom-up influences on gaze allocations at the scene
  change. We used a saliency model to compute bottom-up saliency maps for
  every video frame. Saliency maps can be computed from a full set of
  features (color, intensity, orientation, flicker, motion) or from
  individual features. Support-vector-machine classifiers (with
  Radial-Basis Function Kernel) were built for each feature contributing
  the saliency map and for the combination of them. Leave-one-out was used
  to train and test the classifiers. Two classification experiments were
  performed: (1) between ADHD and control children; (2) between PD and
  control elderly. Saliency maps computed with all features can well
  differentiate patients and control populations (correctness: experiment 1
  - 100\%; experiment 2 - 95.24\%). Additionally, saliency maps computed
  from any one feature performed nearly as well (both experiments' results
  are 0-5\% worse). Moreover, 0-250 ms after scene change is the most
  discriminative period for the classification. This study demonstrates
  that the bottom-up mechanism is greatly influenced by PD and ADHD, and
  the difference can serve as a probable diagnosis tool for clinical
  applications. }
}
%file={http://ilab.usc.edu/publications/doc/Tseng_etal09vss.pdf},

@inproceedings{Li_Itti09vss,
author={Z. Li and L. Itti},
title={Gist Based Top-Down Templates for Gaze Prediction},
abstract={People use focal visual attention and rapid eye movements to
                  analyze complex visual inputs in a manner that
                  highly depends on current scene's properties. Here
                  we propose a top-down attention model which exploits
                  visual templates associated with different types of
                  scenes. During training, an image set has been
                  manually classified into several scene categories and
                  for each category we define a corresponding top-down
                  map which highlights locations likely to be of
                  interest empirically. Then 'gist' feature vectors of
                  each category's images are computed to generate a
                  Gaussian gist feature distribution, or signature of
                  that category. During testing, the input image's
                  gist feature vector is computed first, based on this
                  feature vector and the already generated scene
                  categories' gist feature distributions, a group of
                  corresponding weights are computed using the
                  probability density functions. The top-down map is
                  then the weighted summation of those pre-defined
                  templates. Finally, the top-down map is combined
                  with a bottom-up saliency map (Itti \& Koch 2001) to
                  generate a final attention guidance map. In
                  eye-tracking validation experiments, two video types
                  are adopted as testing data, one is an original set
                  of captured video clips and the other one is built
                  by cutting the original clips into 1-3s small clips
                  and re-assembling. Results show that in the original
                  clips, the area under curve (AUC) score and the KL
                  distance of the standard bottom-up saliency map is
                  0.665 and 0.185 (higher is better) while the
                  attention guidance map result is 0.688 and 0.242,
                  respectively; with the re-assembled clips, the
                  standard bottom-up model result is 0.648 and 0.145
                  while the combined model result is 0.718 and
                  0.327. Our results suggest that the attention
                  selection can be more accurate with the proposed
                  top-down component. [1] Itti, L. and Koch, C. 2001
                  Computational Modeling of Visual Attention, Nature
                  Reviews Neuroscience, 2(3),
                  194-203. Acknowledgement: The authors gratefully
                  acknowledge the contribution of NSF, HFSP, NGA,
                  DARPA and China Scholarship Council.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS09)},
year={2009},
month={May},
type={bu;td;mod;cv},
review={abs/conf}
}

@inproceedings{Mundhenk_etal09vss,
author={T. N. Mundhenk and W. Einhaeuser and L. Itti},
title={What the Searchlight saw: revealing the extent of natural image
                  information that passes through bottom-up visual
                  attention mechanisms to higher visual processing},
abstract={In order to optimize information utilization and prevent
                  bottlenecking during visual processing, bottom-up
                  information is triaged by selectively gating image
                  features as they are observed. Here we demonstrate
                  for the first time a biologically-plausible,
                  information-theoretic model of the visual gating
                  mechanism which works efficiently with natural
                  images. From this, we give a neurophysiological
                  preview of what image information is passing to
                  higher levels of processing. We do this by
                  processing information given in a natural image
                  Rapid Serial Visual Presentation (RSVP) task by its
                  spatio-temporal statistical surprise (Einhaeuser,
                  Mundhenk, Baldi, Koch \& Itti, 2007).  From this, we
                  obtain an attention-gate mask over each of the RSVP
                  image frames derived from the map of attentional
                  capture provided by the surprise system. The mask
                  itself accounts for the degree to which distracter
                  images that proceed or follow a target image are
                  able to take attention away from it and vice
                  versa. Attention is also accounted for within an
                  image so that targets need to be salient both across
                  frames and within the target image in order to be
                  detected. Additionally, stronger target capture
                  leads to better masking of rival information
                  decreasing later visual competition. The
                  surprise-based attention-gate is validated against
                  the performance of eight observers. We find that 29
                  unique RSVP targets from 29 different sequences
                  which are easy to detect precisely overlap to a far
                  greater extent with open regions in the attention
                  gate compared with 29 unique targets which are
                  difficult to detect (P<.001). This means that when a
                  target is easy to detect, more target regions are
                  passing through the attention-gate increasing the
                  availability of relevant features to visual
                  recognition facilities. Additionally, this allows
                  us to surmise what parts of any given image in an
                  RSVP task can plausibly be detected since regions
                  which are gated at this stage cannot be processed
                  any further.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS09)},
year={2009},
month={May},
type={bu;td;mod;su},
review={abs/conf}
}

@inproceedings{Baluch_Itti09vss,
author = {F. Baluch and L. Itti},
title = {Learning in an attentionally-demanding triple-conjunction task},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS09)},
month = {May},
year = {2009},
review = {abs/conf},
type = {bu;td;psy;eye},
abstract = {Several studies have shown improvement in perceptual
  discrimination over the course of training sessions with spatial or
  feature range specificity.  This type of specific learning does not
  address the question of whether and how general visual task expertise can
  be gained. We designed a conjunction search task in which a different
  target was previewed on each trial and was then randomly positioned in a
  search array, thus eliminating both feature and spatial specificity of
  learning. 5 subjects performed 10 sessions of 100 trials each over the
  course of 10 days. In each trial subjects had to find a color gabor patch
  target uniquely identified by its hue, orientation and spatial frequency
  among an array of 32 patches.  All subjects showed marked improvement
  over the course of training. A highly significant (p<0.005) change in
  performance was observed from session 1 (44\% correct) to session 5 (73\%
  correct), beyond which performance plateaued. In order to isolate changes
  in visual behavior resulting from learning we construct feature
  similarity maps that define the similarity between items in a search
  array and a target, in individual feature dimensions. High values in the
  similarity maps indicate large differences between the target and an
  item, and vice-versa. We find small but negative correlations between
  saccade endpoint distributions (SED) and feature similarity maps,
  indicating that subjects were preferentially looking at distractor items
  more similar to the target. Moreover, the negative correlations grow
  stronger over the course of the sessions, for the hue and frequency
  features but not for orientation. We then correlate SED with linear
  combinations of individual feature maps as well as a map derived from a
  simple minimum distance rule. The results indicate that subjects had an
  increased tendency to look toward items similar to the target in the hue
  and frequency dimensions. Acknowledgement: NGA. }
}

@inproceedings{White_etal09vss,
author={B. J. White and S. E. Boehnke and R. A. Marino and L. Itti and D. P. Munoz},
title={Color Signals in the Primate Superior Colliculus},
abstract={Color is important for segmenting objects from backgrounds,
                  which can in turn facilitate visual search in
                  complex scenes. However, brain areas that control
                  overt visual orienting (i.e., saccadic eye
                  movements) are not believed to have access to color
                  (Schiller et al., 1979), despite massive visual
                  corticotectal projections (Lock et al. 2003), which
                  include areas traditionally associated with color
                  processing (e.g., V4). Here, we show the first
                  evidence that neurons from the intermediate layers
                  of the monkey superior colliculus (SC), a critical
                  structure for both overt and covert visual orienting
                  (Fecteau and Munoz, 2006; Ignashchenkova et al.,
                  2004), can respond to pure chromatic stimuli with
                  the same magnitude as a maximum contrast luminance
                  stimulus. In contrast, neurons from the superficial
                  SC layers showed little color response. Crucially,
                  visual onset latencies were approximately 30ms
                  longer for color, implying that luminance and
                  chrominance information reach the SC through
                  distinct pathways, and that the color response
                  cannot be due to residual luminance
                  signals. Furthermore, these differences in visual
                  latency translated directly into differences in
                  saccadic reaction time (SRT) between color and
                  luminance, which closely match SRT differences
                  reported in humans (White et al., 2006). These
                  results demonstrate that the saccadic eye movement
                  system can signal the presence of pure chromatic
                  stimuli only one stage from the brainstem premotor
                  circuitry that drives the eyes.  Acknowledgement:
                  The authors thank Ann Lablans, Becky Cranham, Donald
                  Brien, Sean Hickman and Mike Lewis and for technical
                  assistance. This project was funded by the Human
                  Frontiers Science Program, Grant RGP0039-2005-C, and
                  the Canadian Institutes of Health Research. DPM was
                  supported by Canada Research Chair Program.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS09)},
year={2009},
month={May},
type={phy},
review={abs/conf}
}

@article{Itti_Baldi09vr,
author = {L. Itti and P. F. Baldi},
title = {Bayesian Surprise Attracts Human Attention},
journal = {Vision Research},
volume = {49},
number = {10},
pages = {1295-1306},
month = {May},
year = {2009},
type = {su;eye},
file = {http://ilab.usc.edu/publications/doc/Itti_Baldi09vr.pdf},
if = {2007 impact factor: 2.055},
abstract = {We propose a formal Bayesian definition of surprise to capture
  subjective aspects of sensory information. Surprise measures how data
  affects an observer, in terms of differences between posterior and prior
  beliefs about the world.  Only data observations which substantially
  affect the observer's beliefs yield surprise, irrespectively of how rare
  or informative in Shannon's sense these observations are.  We test the
  framework by quantifying the extent to which humans may orient attention
  and gaze towards surprising events or items while watching television. To
  this end, we implement a simple computational model where a low-level,
  sensory form of surprise is computed by simple simulated early visual
  neurons. Bayesian surprise is a strong attractor of human attention, with
  72\% of all gaze shifts directed towards locations more surprising than
  the average, a figure rising to 84\% when focusing the analysis onto
  regions simultaneously selected by all observers. The proposed theory of
  surprise is applicable across different spatio-temporal scales,
  modalities, and levels of abstraction.}
}

@invited{Itti09ncm,
title = {Bottom-up and top-down control of gaze in natural viewing},
author = {L. Itti},
booktitle = {Society for the Neural Control of Movement Annual Meeting, Waikoloa, HI},
year = {2009},
month = {Apr},
type = {bu;td;mod}
}


@inproceedings{Parikh_etal09gro,
author = {N. Parikh and L. Itti and M. Humayun and J. Weiland},
title = {Comparison of regions detected salient by an image processing
  algorithm with regions of human eye movements},
booktitle = {Proceedings of the 13th Annual Grodins Graduate Research
  Symposium, Department of Biomedical Engineering, University of Southern
  California},
pages = {119-120},
month = {Apr},
year = {2009},
type = {bu;med},
review = {abs/wkshp}
}

@invited{Itti09cos,
  author    = {L. Itti},
  title     = {Integrating context to computational models of saliency-based attention},
  booktitle = {Cosyne Workshop: The role of spatial context in biological and computational vision, Salt Lake City, UT},
  year      = {2009},
  month     = {Mar},
  type      = {bu;td;mod}
}


@press{Mitchell09pp,
  author  = {K. Mitchell},
  title   = {Do Men and Women Take Different Photos?},
  journal = {Popular Photography},
  year    = {2009},
  month   = {Jan},
  type    = {bu;cv;eye},
  file    = {http://ilab.usc.edu/publications/doc/Mitchell09pp.pdf}
}

@invited{Itti09mit,
  author    = {L. Itti},
  title     = {Bottom-up and top-down guidance of visual attention in natural environments},
  booktitle = {Symposium on Natural Scenes, MIT, Cambridge, MA},
  year      = {2009},
  month     = {Jan},
  type      = {bu;td;mod}
}

@invited{Itti09har,
  author    = {L. Itti},
  title     = {Modeling bottom-up and top-down visual attention in humans and monkeys},
  abstract  = {Visual processing of complex natural environments requires
                  animals to combine, in a highly dynamic and adaptive
                  manner, sensory signals that originate from the
                  environment (bottom-up) with behavioral goals and
                  priorities dictated by the task at hand
                  (top-down). Together, bottom-up and top-down
                  influences combine to serve the many tasks which
                  require that we direct attention to the most
                  ''relevant'' entities in our visual
                  environment. While much progress has been made in
                  investigating experimentally how humans and other
                  primates may operate such goal-based attentional
                  selection, very little is understood of the general
                  mathematical principles and neuro-computational
                  architectures that subserve the observed behavior. I
                  will describe recent computational work which
                  attacks the problem of developing models of visual
                  attentional selection that are more flexible and can
                  be strongly modulated by the task at hand. I will
                  back the proposed architectures up by comparing
                  their predictions to behavioral recordings from
                  humans and monkeys. I will show examples of
                  applications of these models to real-world vision
                  challenges, using complex stimuli from television
                  programs or modern immersive video games.},
  booktitle = {Harvard Medical School weekly seminar, Cambridge, MA},
  year      = {2009},
  month     = {Jan},
  type      = {bu;td;mod}
}

@press{Grasso09yi,
  author  = {E. Grasso},
  title   = {Come rimanere 'a bocca aperta' da adulti},
  journal = {Yahoo Italia Notizie},
  year    = {2009},
  month   = {Jan},
  type    = {bu;cv;su},
  file    = {http://ilab.usc.edu/publications/doc/Grasso09yi.pdf}
}

@press{Anonymous09ihu,
  author  = {A. Anonymous},
  title   = {Huha-faktorral vizsgaljak a meglepetest},
  journal = {Index.hu},
  year    = {2009},
  month   = {Jan},
  type    = {bu;cv;su},
  file    = {http://ilab.usc.edu/publications/doc/index-hu-surprise.pdf}
}

@invited{Itti09jhu,
  author    = {L. Itti},
  title     = {Modeling bottom-up and top-down visual attention in humans and monkeys},
  booktitle = {Center for Language and Speech Processing, Johns Hopkins University, Baltimore, MD},
  year      = {2009},
  month     = {Jan},
  type      = {bu;td;mod}
}

@press{Geddes09ns,
  author  = {L. Geddes},
  title   = {Model of surprise has 'wow' factor built in},
  journal = {New Scientist},
  year    = {2009},
  month   = {Jan},
  type    = {bu;cv;su},
  file    = {http://ilab.usc.edu/publications/doc/Geddes09ns.pdf}
}

@press{Byrne08v1,
  author  = {E. Byrne},
  title   = {Surprise moves eyes},
  journal = {Primary Visual Cortex},
  year    = {2008},
  month   = {Oct},
  type    = {bu;cv;su},
  file    = {http://ilab.usc.edu/publications/doc/Byrne08v1.pdf}
}

@invited{Itti08rov,
  author    = {L. Itti},
  title     = {Computational models of attention and eye movements in natural scenes},
  booktitle = {Workshop on attention and motor control, Rovereto, Italy},
  year      = {2008},
  month     = {Oct},
  type      = {bu;td;mod}
}

@invited{Itti08humc,
  author    = {L. Itti},
  title     = {Bottom-up and top-down guidance of visual attention in health
and disease},
  booktitle = {Harbor-UCLA Neurology Department Grand Rounds, Torrance,
California},
  year      = {2008},
  month     = {Sep},
  type      = {bu;td;mod}
}

% NOTE(review): this entry had no type classification; bu;mod is inferred
% from the title only (bottom-up attention, computational modeling) -- confirm.
@invited{Berg08ngp,
  author    = {D. J. Berg},
  title     = {Modeling bottom-up visual attention in humans and monkeys},
  booktitle = {USC Neuroscience Retreat, Laguna Beach, CA},
  year      = {2008},
  month     = {Sep},
  type      = {bu;mod}
}

@press{Matthews08pp,
  author  = {N. Matthews},
  title   = {The Photographer's Guide to the Eye},
  journal = {Popular Photography},
  year    = {2008},
  month   = {Aug},
  type    = {bu;cv;eye},
  file    = {http://ilab.usc.edu/publications/doc/Matthews08pp.pdf}
}

@press{Anonymous08oka,
  author   = {A. Anonymous},
  title    = {Professors Win Okawa Research Awards},
  abstract = {Two USC Viterbi School of Engineering faculty Laurent Itti
of the computer science department and Shrikanth Narayanan of the Ming
Hsieh Department of Electrical Engineering have won 2008 Okawa
Foundation grants to support their research in human-machine speech
processing and neuroscience-inspired robotic vision systems,
respectively.},
  journal  = {USC Press release},
  year     = {2008},
  month    = {Aug},
  type     = {bu;cv;bb},
  file     = {http://ilab.usc.edu/publications/doc/Anonymous08oka.pdf}
}

@invited{Itti08vssa,
  author    = {L. Itti},
  title     = {Modeling bottom-up and top-down visual attention and search in
humans and monkeys},
  booktitle = {International Symposium on Visual Search and Selective
Attention, Muntelier-Loewenberg, Switzerland},
  year      = {2008},
  month     = {Jul},
  type      = {bu;td;mod}
}

@invited{Itti08icp,
  author    = {L. Itti},
  title     = {Bottom-up and top-down guidance of visual attention in natural
environments},
  abstract  = {The natural world affords a complexity which makes its
comprehension highly complex. To cope with this complexity, biological
systems have evolved attentional strategies which rapidly focus
processing resources on the most important and relevant aspects of the
incoming sensory data. Here I describe several exciting new research
directions that study the joint stimulus-driven (or bottom-up) and
goal-driven (or top-down) influences on attentional allocation. I
describe a new computational model which processes video inputs and
predicts where observers look under different task conditions. I
discuss results of testing this model against human eye movement
recordings over several hours of video stimuli.},
  booktitle = {International Conference of Psychology (ICP),
 Symposium on Current Views in Scene Perception},
  year      = {2008},
  month     = {Jul},
  type      = {bu;td;eye}
}

@inproceedings{Siagian_Itti08gcer,
  author    = {C. Siagian and L. Itti},
  title     = {Search and Rescue: An Educational Test-bed for Robotics Systems
                  Integration},
  abstract  = { We present a search and rescue problem as a final project
for an undergraduate-level Introduction to Robotics course. To
successfully complete the project, a participating robot has to solve
four sub-problems that are currently among the most actively
researched areas in robotics: computer vision, manipulative robotics,
localization, and multi-robot cooperation. By immersing the students
in the details of these sub-problems, we aim to have them develop a
deeper appreciation of the difficulties. In many cases, this
experience also motivates the students to pursue research in those
areas. We describe several example systems that use easily obtained
hardware components such as LEGO pieces, the Handyboard, sonar,
compass, CMUCam, and servomotors. However, each of the sub-problems is
easily extendable to increase the complexity/creativity of the
solution as well as accommodate for new and more powerful devices.},
  booktitle = {Global Conference on Educational Robotics (GCER),
Norman, Oklahoma},
  year      = {2008},
  month     = {Jul},
  review    = {full/conf},
  type      = {bb},
  file      = {http://ilab.usc.edu/publications/doc/Siagian_Itti08gcer.pdf}
}

% NOTE(review): the p-value exponents in this abstract were flattened
% (10-10, 10-5); restored as 1e-10 and 1e-5 -- confirm against the
% submitted abstract.
@inproceedings{Tseng_etal08hfsp,
  author    = {P. Tseng and D. J. Berg and M. Yoshida and I. G. M. Cameron
                  and S. E. Boehnke and R. A. Marino and T. Ikeda and
                  R. Kato and K.  Takaura and L. Itti and T. Isa and
                  D. P. Munoz},
  title     = {Deployment of visual attention and of the eyes differentiate
                  observer populations},
  abstract  = {In our complex visual world, it is critical to pay attention
                  to the right places to detect life-threatening
                  events, to accomplish tasks, and to explore new
                  environments. Our attention is attracted by salient
                  objects and events (bottom-up) and is controlled by
                  the task at hand (top-down). A computational model
                  of bottom-up attentional selection mechanisms (Itti
                  and Koch, 2001) was developed based on the
                  functionality of primary visual cortex (V1), which
                  computes low-level features (color contrast,
                  intensity contrast, orientation, flicker, and
                  motion) from visual scenes, and detects locations
                  where these features significantly differ from their
                  surround. The model computes a saliency map which
                  predicts hot spots that draw people's attention. To
                  investigate the relative contributions of bottom-up
                  and top-down in humans and monkeys, we tracked gaze
                  of different groups of observers, in different but
                  related projects supported by our grant (15 control
                  human children, 12 control human elderly, 6
                  Attention Deficit Hyperactivity Disorder (ADHD)
                  patients, 4 Fetal Alcohol Spectrum Disorder (FASD)
                  patients, 9 Parkinson's Disease (PD) patients, 4
                  normal rhesus monkeys, 3 macaque monkeys with
                  unilateral lesion of V1, and 3 normal macaque
                  monkeys) while they freely viewed videos. Using the
                  saliency model, we quantified how bottom-up
                  attentional selection mechanisms may differ between
                  populations, based on the correlation between their
                  eye-movement traces and the saliency maps, and using
                  support-vector-machine classifiers to compare this
                  correlation between populations.  Leave-one-out was
                  used to train and test the classifiers.  The first
                  experiment differentiated PD patients from control
                  elderly with 95.24\% correctness. The second
                  differentiated ADHD, FASD from control children with
                  97\% correctness. Further investigations comparing
                  humans (young adults) and monkeys showed that the
                  salience of locations which all monkeys looked at
                  simultaneously was significantly higher than for
                  humans (t-test, p<1e-10). Moreover, the saliency
                  model was used to quantify specific eye movement
                  deficits in the monkeys with unilateral V1 lesion,
                  demonstrating significant (t-test, p<1e-5) albeit
                  surprisingly small differences. Our results
                  demonstrate that the balance between bottom-up and
                  top-down mechanisms is greatly influenced by
                  diseases or dysfunction, and can be quantified by
                  the saliency model. This can serve not only to
                  further our basic understanding of vision and
                  attention, but also as a screening or diagnosis tool
                  for clinical applications.},
  booktitle = {Annual Meeting of the Human Frontier Science Program (HFSP),
Berlin, Germany},
  year      = {2008},
  month     = {Jul},
  review    = {abs/wkshp},
  type      = {td;mod;psy;bu;eye;med}
}

@inproceedings{Boehnke_etal08crcns,
  author    = {S. E. Boehnke and D. J. Berg and R. A. Marino and P. F. Baldi
                  and L. Itti and D. P. Munoz},
  title     = {Adaptation, habituation and dishabituation of visual responses
                  in the superior colliculus},
  abstract  = {When stimuli are repeatedly flashed into the receptive field
   of visual neurons in the superior colliculus (SC), the response
   magnitude decreases (e.g., in a cue-target task). This effect could
   be due to 'adaptation' - a lower level mechanism like pupil
   constriction, or 'habituation' - the non-associative learning
   mechanism by which an organism stops responding to an irrelevant
   stimulus but recovers the response (dishabituates) after a change
   in stimulus properties. We sought to characterize the changes in
   responses that could be attributed to adaptation and habituation in
   superficial (SCs, n=18) and intermediate (SCi, n=36) layer neurons
   using a paradigm adopted from the ideas of Sokolov (1963). Two
   monkeys (Macaca mulatta) were rewarded for fixating a central
   fixation point while a series of 7 successive stimuli were flashed
   briefly (55 ms duration; 75-800 ms interstimulus interval (ISI)) in
   the receptive field of the neuron. On 70\% of trials all flashed
   stimuli were identical, while on others, the 4th stimulus was
   either brighter, dimmer or absent (10\% each). If reduced neural
   responsiveness is due to habituation, some recovery of the response
   (dishabituation) should occur to any oddball stimulus. However, if
   the reduced response is due to adaptation, the response should be
   further reduced after the brighter stimulus, but recover after the
   dimmer or absent stimulus. For the typical ISI of 200ms, the
   largest decrease in response magnitude (> 60\% in SCs and ~50\% in
   SCi) was to the second stimulus, and subsequent stimuli resulted in
   only small further reductions. The shorter the ISI the greater
   these reductions, with responses to stimuli 2-7 often being
   obliterated at very short ISIs (<100ms). The onset latency of the
   visual response increased with each stimulus so that on average the
   7th stimulus response initiated nearly 20ms later than the
   1st. These patterns were globally similar in SCs and SCi, but there
   were greater changes to the 3-7th stimuli in many SCi
   neurons. Responses to oddball stimuli in SCs neurons were
   suggestive only of sensory adaptation, while responses in SCi
   neurons showed features of both adaptation and habituation because
   a dishabituation signature in response to brighter or dimmer
   stimuli was present in the late visual response, and in the
   subsequent interstimulus interval. The adaptation responses of
   neurons in the SCs were more homogenous compared to those in the
   SCi, which showed considerable diversity in adaptation and
   habituation response properties.},
  booktitle = {Collaborative Research in Computational Neuroscience Annual
Meeting, Los Angeles, California},
  year      = {2008},
  month     = {Jun},
  review    = {abs/wkshp},
  type      = {td;mod;psy}
}

@inproceedings{Itti08crcns,
  author    = {L. Itti},
  title     = {CRCNS Data Sharing: Eye movements during free-viewing of
natural videos},
  booktitle = {Collaborative Research in Computational Neuroscience Annual
Meeting, Los Angeles, California},
  year      = {2008},
  month     = {Jun},
  review    = {abs/wkshp},
  type      = {td;mod;psy}
}

% NOTE(review): superscripts in this abstract were flattened (r2, 10-9);
% restored as r^2 and 1e-9 -- confirm against the submitted abstract.
@inproceedings{Berg_etal08crcns,
  author    = {D. J. Berg and S. E. Boehnke and R. A. Marino and P. F. Baldi
                  and D. P. Munoz and L. Itti},
  title     = {Monkeys as a Model for Human Bottom-Up Overt Attention},
  abstract  = {Monkeys are widely used as animal models for human visual
                  attention, however, they are usually studied using
                  simplified stimulus conditions in order to have
                  experimental control for neural recording. Here we
                  compared the visual behavior of 5 human and 5 monkey
                  observers to determine the degree that the two
                  species show bottom-up differences in visual
                  behavior. To this end we recorded eye movements of
                  naive observers in a free-viewing task using 115
                  video clips (47,903 frames, approximately 27
                  minutes) ranging in semantic content (natural
                  scenes, monkey relevant, noise etc). Saccade and
                  fixation statistics such as the main sequence,
                  saccadic endpoint distribution and frequency of
                  saccades indicate a strong difference between human
                  and monkey eye movements. To test whether these
                  differences affect overall visual behavior and to
                  establish to what extent human and rhesus monkey eye
                  movements can be predicted by bottom-up stimulus
                  properties we quantified primate gaze shifts using a
                  model of visual attention, Surprise Model (Itti and
                  Baldi '06). We also compared a given subject's gaze
                  to the gaze locations of all other subjects on the
                  same stimulus (Interobserver model), which includes
                  bottom-up and top-down factors. A computational
                  model of visual attention predicted human gaze
                  shifts marginally, but significantly better than
                  monkey gaze shifts (p < 0.001). However, when looking
                  at individual video clips the correlation between
                  measured Surprise at human and monkey saccadic
                  endpoints is high (r^2 = 0.62, p <
                  1e-9). Furthermore, locations that 3 monkeys looked
                  at simultaneously were higher in Surprise than those
                  locations that multiple humans agreed to look (p <
                  0.07). The computational model predicted monkey eye
                  movements better than the monkey interobserver model
                  (p < 0.002), yet the interobserver model predicted
                  human gaze shifts far better than the computational
                  model (p < .001). In whole the results demonstrate
                  that although there are some differences in visual
                  behavior between species, monkeys can be effectively
                  used as a model for stimulus driven attention in
                  humans during free viewing tasks.},
  booktitle = {Collaborative Research in Computational Neuroscience Annual
Meeting, Los Angeles, California},
  year      = {2008},
  month     = {Jun},
  review    = {abs/wkshp},
  type      = {td;mod;psy}
}

@inproceedings{Tseng_etal08crcns,
  author    = {P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
  title     = {Differentiating Patients from Controls Based on Correlation
between Salience and Gaze},
  abstract  = {Several studies have shown that eye movements and certain
                  complex visual functions are influenced by diseases
                  such as Attention Deficit Hyperactivity Disorder
                  (ADHD), Fetal Alcohol Spectrum Disorders (FASD) and
                  Parkinson's Disease (PD). Here we examine how
                  bottom-up (stimulus-driven) attentional selection
                  mechanisms may differ between patient and control
                  populations, and we take advantage of the difference
                  to develop classifiers to differentiate patients
                  from controls. We tracked gaze of five groups of
                  observers (15 control children, aged 7-14; 6 ADHD
                  children, aged 9-15; 4 FASD children, aged 9-15; 12
                  control elderly, aged 66-79; and 9 PD elderly, aged
                  53-68) while they freely viewed MTV-style
                  videos. These stimuli are composed of short (2-4
                  seconds), unrelated clips of natural scenes to
                  reduce top-down (contextual) expectations and
                  emphasize bottom-up influences on gaze allocations
                  at the scene change. We used a saliency model to
                  compute bottom-up saliency maps for every video
                  frame. Saliency maps can be computed from a full set
                  of features (color, intensity, orientation, flicker,
                  motion) or from individual
                  features. Support-vector-machine classifiers were
                  built for each feature contributing the saliency map
                  and for the combination of them. Leave-one-out was
                  used to train and test the classifiers. Two
                  classification experiments were performed: (1)
                  between ADHD, FASD and control children; (2) between
                  PD and control elderly. Saliency maps computed with
                  all features can well differentiate patients and
                  control populations (correctness: experiment 1 -
                  80\%; experiment 2 - 95.24\%). Additionally,
                  saliency maps computed from any one feature
                  performed nearly as well (correctness: experiment 1
                  - 92\% for flicker; experiment 2 - 100\% for color
                  and flicker). This study demonstrates that the
                  bottom-up mechanism is greatly influenced by ADHD,
                  FASD and PD, and the difference can serve as a
                  probable diagnosis/screening tool for clinical
                  applications.},
  booktitle = {Collaborative Research in Computational Neuroscience Annual
Meeting, Los Angeles, California},
  year      = {2008},
  month     = {Jun},
  review    = {abs/wkshp},
  type      = {bu;td;mod;psy;med}
}

@inproceedings{Elazary_Itti08vss,
  author    = {L. Elazary and L. Itti},
  title     = {A Bayesian model of visual search and recognition},
  abstract  = {Visual search and recognition in humans employ a combination
                  of bottom-up (data-driven) and top-down
                  (goal-driven) processes. Although many bottom-up
                  search and recognition models have been developed,
                  the computational and neural basis of top-down
                  biasing in such models has remained elusive. This
                  paper develops a new model of attention guidance
                  with dual emphasis: a single common Bayesian
                  representational framework is used (1) for learning
                  how to bias and guide search towards desired
                  targets, and, (2) for recognizing targets when they
                  are found. At its core, the model learns probability
                  distributions of an object's visual appearance
                  having a range of values along a number of low-level
                  visual feature dimensions, then uses this learned
                  knowledge both to locate and to recognize desired
                  objects. The model is tested on three publicly
                  available datasets, ALOI, COIL and SOIL47,
                  containing photographs of 1,000, 100 and 47 objects
                  taken under many viewpoints and illuminations
                  (117,174 images in total). Model performance for
                  recognition is compared to that of two
                  state-of-the-art object recognition models (SIFT and
                  HMAX). The proposed model performs significantly
                  better and faster, reaching 89\% classification rate
                  (SIFT: 25\%, HMAX: 76\%) when utilizing 1/4 of the
                  images for training and 3/4 for testing, while at
                  the same time being 89 and 279 times faster than
                  SIFT and HMAX, respectively. The proposed model can
                  also be used for top-down guided search, finding a
                  desired object in a 5x5 search array on average
                  within 4 attempts (chance would be 12.5
                  attempts). Our results suggest that the simple
                  Bayesian formalism developed here is capable of
                  delivering robust machine vision performance.  },
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
  year      = {2008},
  month     = {May},
  review    = {abs/conf},
  type      = {bu;sc;td}
}
%file={http://ilab.usc.edu/publications/doc/Elazary_Itti08vss.pdf},

@inproceedings{Berg_Itti08vss,
  author    = {D. J. Berg and L. Itti},
  title     = {Memory, eye position and computed saliency},
  abstract  = {Saliency based image computations have been useful in
                  understanding the mechanisms of guiding overt and
                  covert attentional shifts and predicting eye
                  position in natural and artificial scenes. With this
                  foundation we investigated the relationship between
                  memory, eye position and saliency. In particular, we
                  tested the hypothesis that in natural scenes an
                  object's computed saliency will positively correlate
                  with subjects' memory of the object independent of
                  eye location. To this end we recorded eye movements
                  from fourteen naive subjects while they were shown
                  18 images (subtending 55degx33deg of visual angle)
                  from scenes of shopping environments for two seconds
                  followed by a random mask. Subjects were then asked
                  to recall whether subsequently presented image
                  patches contained items that were present in the
                  scene. We found that the amount of computed saliency
                  of object patches in natural scenes has no
                  significant correlation with subjects' recall rates;
                  however, eye position and fixation time on an object
                  are strong factors in facilitating recall (p
                  [[lt]].05). We also find that saliency can predict
                  eye location three standard deviations above
                  chance. These results indicate that saliency's
                  contribution to memory is mostly through overt (eye
                  movements) and not covert attention.},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
  year      = {2008},
  month     = {May},
  review    = {abs/conf},
  type      = {bu;sc}
}
%file={http://ilab.usc.edu/publications/doc/Berg_Itti08vss.pdf},

@inproceedings{Tseng_etal08vss,
  author    = {P. Tseng and I. G. M. Cameron and D. P. Munoz and L. Itti},
  title     = {Differentiating patients from controls based on correlation
                  between salience and gaze},
  abstract  = {Several studies have shown that eye movements and certain
                  complex visual functions are influenced by diseases
                  such as Parkinson's Disease (PD) and Attention
                  Deficit Hyperactivity Disorder (ADHD). Here we
                  examine how bottom-up (stimulus-driven) attentional
                  selection mechanisms may differ between patient and
                  control populations, and we take advantage of the
                  difference to develop classifiers to differentiate
                  patients from controls. We tracked gaze of four
                  groups of observers (15 control children, aged 7-14;
                  6 ADHD children, aged 9-15; 12 control elderly, aged
                  66-79; and 9 PD elderly, aged 53-68) while they
                  freely viewed MTV-style videos. These stimuli are
                  composed of short (2-4 seconds), unrelated clips of
                  natural scenes to reduce top-down (contextual)
                  expectations and emphasize bottom-up influences on
                  gaze allocations at the scene change. We used a
                  saliency model to compute bottom-up saliency maps
                  for every video frame. Saliency maps can be computed
                  from a full set of features (color, intensity,
                  orientation, flicker, motion) or from individual
                  features. Support-vector-machine classifiers (with
                  Radial-Basis Function Kernel) were built for each
                  feature contributing the saliency map and for the
                  combination of them. Leave-one-out was used to train
                  and test the classifiers. Two classification
                  experiments were performed: (1) between ADHD and
                  control children; (2) between PD and control
                  elderly. Saliency maps computed with all features
                  can well differentiate patients and control
                  populations (correctness: experiment 1 - 100\%;
                  experiment 2 - 95.24\%). Additionally, saliency maps
                  computed from any one feature performed nearly as
                  well (both experiments' results are 0-5\%
                  worse). Moreover, 0-250 ms after scene change is the
                  most discriminative period for the
                  classification. This study demonstrates that the
                  bottom-up mechanism is greatly influenced by PD and
                  ADHD, and the difference can serve as a probable
                  diagnosis tool for clinical applications.  },
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
  year      = {2008},
  month     = {May},
  review    = {abs/conf},
  type      = {bu;td;mod;psy;med}
}
%file={http://ilab.usc.edu/publications/doc/Tseng_etal08vss.pdf},

@inproceedings{Itti_etal08vss,
  author    = {L. Itti and M. Yoshida and D. J. Berg and T. Ikeda and R. Kato
                  and K. Takaura and T. Isa},
  title     = {Saliency-based guidance of gaze in monkeys with unilateral
                  lesion of primary visual cortex},
  abstract  = { We investigate residual visually-guided behavior in monkeys
                  after unilateral ablation of primary visual cortex
                  (area V1), to unravel the specific contributions of
                  V1 to salience computation in the primate brain. We
                  analyze eye movements (108,458 saccades) of six
                  macaque monkeys (three normals, three with
                  unilateral V1 ablation) watching ~54 minutes of
                  television and other natural video stimuli (97,051
                  video frames). A computational model of
                  saliency-based, bottom-up attention quantifies the
                  monkeys' propensity to attend to salient targets in
                  their normal vs. lesioned hemifields. To eliminate
                  stimulus biases, we randomly presented all video
                  clips twice, original and horizontally flipped. All
                  monkeys are attracted towards salient stimuli,
                  significantly above chance (assessed by simulating
                  random saccades), for saccades directed both into
                  normal and lesioned hemifields (t-tests,
                  p[[lt]]0.0001 or better). We compute a saliency
                  deficit score (SDS) comparing the extent to which
                  saccades directed into one hemifield may be more or
                  less saliency-guided than saccades directed into the
                  other hemifield (SDS=0\% would indicate no
                  asymmetry, SDS=100\% would indicate complete
                  blindness in the lesioned hemifield). For the
                  lesioned monkeys, SDS is significantly different
                  from 0\% (t-tests, p[[lt]]0.00001 or better) but
                  surprisingly low (range 9\% - 28\%; average for
                  normal monkeys: 4\%), indicating a significant
                  residual attraction towards salient targets even in
                  the lesioned hemifields. Further, the recent history
                  (up to 500ms before saccade onset) of saliency
                  values around saccade targets reveals significantly
                  increased salience just before saccades into the
                  normal, but not the lesioned, hemifield
                  (Bonferroni-corrected t-tests, p[[lt]]0.05 or
                  better). Taken together, our results suggest that
                  unilateral ablation of V1 does not abolish the
                  natural tendency of monkeys to gaze towards salient
                  targets during natural vision, although it
                  significantly decreases the monkeys' ability to
                  rapidly select targets in the lesioned hemifield
                  that have recently become salient.  },
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
  year      = {2008},
  month     = {May},
  review    = {abs/conf},
  type      = {bu;sc},
  file      = {http://ilab.usc.edu/publications/doc/Itti_etal08vss.pdf}
}

% Peters and Itti, VSS08 conference abstract (review class abs/conf).
% NOTE(review): the abstract is corrupted in mid-sentence at
% "(paired t-test, ppf power spectra": the text between the p-value and
% "1/f power spectra" is missing. Presumably it was lost together with an
% unescaped less-than sign (other entries write it as [[lt]]). Restore the
% missing passage from the published abstract; it is NOT reconstructed here.
@inproceedings{Peters_Itti08vss,
author    = {R. J. Peters and L. Itti},
title     = {The role of Fourier phase information in predicting saliency},
abstract  = { Global amplitude spectra from the discrete Fourier
    transform (DFT) have proven useful in studying
    behavioral and computational aspects of visual
    object and scene recognition. Here, we investigated
    whether Fourier phase (rather than amplitude)
    spectra may be useful for another purpose, namely
    guiding attentional selection. We developed a simple
    model which produces salience maps from phase
    information alone, by (1) downsampling images by one
    or more factors, (2) computing the DFT of each
    downsampled image's luminance, (3) normalizing each
    complex DFT value to unit amplitude while retaining
    its phase, (4) computing the inverse DFT, (5)
    squaring the result, and (6) combining the maps
    resulting from each downsampling factor. Salience
    maps from this model significantly predicted the
    free-viewing gaze patterns of four observers for 337
    images of natural outdoor scenes, fractals, and
    aerial imagery. For fractals and aerial imagery,
    this phase-based model was significantly better
    (paired t-test, ppf power spectra, so forcing a flat
    Fourier amplitude spectrum is similar to scaling the
    amplitude everywhere by f, equivalent to a spatial
    derivative. However, this derivative-like aspect
    cannot completely explain our results, because the
    image category with the most 1/f-like spectrum
    (outdoor scenes) was the one for which the
    phase-only model fared worst. Just as Fourier
    amplitude can form a computational basis for scene
    categorization (Torralba 2003), our results
    establish Fourier phase information as one possible
    computational basis for spatial attentional
    selection.  },
booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
year      = {2008},
month     = {May},
type      = {bu;sc},
review    = {abs/conf}
}
%file={http://ilab.usc.edu/publications/doc/Peters_Itti08vss.pdf},

% Li and Itti, VSS08 conference abstract on saliency-guided H.264 video
% coding (review class abs/conf).
@inproceedings{Li_Itti08vss,
author    = {Z. Li and L. Itti},
title     = {Visual attention guided video compression},
abstract  = { Human visual characteristics show promising future for
    applications to video coding. Here, we propose,
    implement, and test a universal visual attention
    based video coding platform (VAVC). This platform
    includes two main parts: the visual attention module
    and the video coding module. The visual attention
    module is used to generate saliency maps (or other
    maps which can represent human visual
    characteristics) according to the human visual
    system (HVS) while the video coding module is used
    to compress the raw video sequence according to the
    results of the first module. Using this platform, a
    saliency-based video coding algorithm is
    implemented. The bottom-up methods proposed in Itti
    et al. (1998) are adopted to get the saliency
    map. Then we transform the saliency map into the
    quantization map used in the latest video coding
    standard H.264 to guide the residual
    quantization. For the salient regions, we decrease
    the quantization step to reduce the artifacts, and
    for the non-salient regions, we increase the
    quantization step to increase the compression
    ratio. In our experiment, 18 natural video sequences
    are adopted for encoding with different methods
    while 6 subjects to evaluate these encoded
    results. Subjects were asked to subjectively rate on
    a 1-5 scale the perceptual quality of 3 variants of
    the clips: standard H.264, our VAVC (yielding on
    average 17.37\% smaller file sizes), and rate
    controlled H.264 to match the smaller size of the
    VAVC encoded clips. The experiment results show
    that, for 64.9\% samples, the subjective quality of
    VAVC-encoded clips is equal or better than
    traditional H.264-encoded clips. For 87.04\%
    samples, the subjective quality of the proposed VAVC
    method is equal or better than the rate-controlled
    H.264 method for equal file size. Our results
    suggest that exploiting human visual characteristics
    can lead to better video compression without
    degrading perceptual quality.  },
booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
year      = {2008},
month     = {May},
type      = {bu;sc},
review    = {abs/conf}
}
%file={http://ilab.usc.edu/publications/doc/Li_Itti08vss.pdf},

% Mundhenk, Einhaeuser and Itti, VSS08 conference abstract relating RSVP
% task performance to bottom-up Bayesian surprise (review class abs/conf).
% Abstract typos fixed: "course statistics" now reads "coarse statistics"
% like its two earlier uses, and a missing space before a parenthesised
% citation was added.
@inproceedings{Mundhenk_etal08vss,
author    = {T. N. Mundhenk and W. Einhaeuser and L. Itti},
title     = {Natural Image RSVP task performance is predicted by
    measurements of bottom-up Bayesian Surprise
    exhibited by image sequences},
abstract  = {The performance of observers on a Rapid Serial Vision
    Protocol (RSVP) task is causally linked with the
    amount of bottom-up Bayesian Surprise (buBS)
    exhibited by both target and distracter images in
    RSVP sequences. In this paradigm, observers watched
    a sequence of 20 images at 20Hz. One of the images
    in the sequence might contain a picture of an animal
    target at chance. Subjects had to respond as to
    whether or not they spotted the target. Observers'
    performance was compared with the amount of buBS
    images in the sequence exhibited. The buBS
    information metric defined by (Itti and Baldi 2005;
    Itti and Baldi 2006) gives a measure of the amount
    of information gain both within an image (between
    image locations) and between images. Using the
    coarse statistics of buBS we were able to alter the
    performance of observers on an RSVP task by changing
    the order of images within a sequence. Placing
    images of high surprise both before and after the
    target image impairs the ability of observers to
    recall the target (Einhaeuser, Mundhenk et
    al. 2007). Here we show coarse statistics for buBS
    in both color and Gabor orientations is
    significantly different between RSVP sequences
    observers find easy (subjects tend to spot the
    target correctly) compared with ones that observers
    find difficult. In particular, coarse statistics for
    mean buBS are elevated in the flanking images before
    and after the target in difficult RSVP
    sequences. Further, buBS is significantly different
    in some features such as vertical lines as much as
    250ms before the target image with a relaxed period
    100ms before the target. This lends support to the
    two stage model of visual processing (Chun and
    Potter 1995). Additionally, we can use the buBS
    statistics to inform us of the amount of bottom-up
    attention capture intrinsic in images in RSVP
    sequences.  },
booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
year      = {2008},
month     = {May},
type      = {bu;sc},
review    = {abs/conf}
}
%file={http://ilab.usc.edu/publications/doc/Mundhenk_etal08vss.pdf},

% Baluch and Itti, VSS08 conference abstract on training effects in a
% conjunction search task (review class abs/conf).
% The percent sign in "reduced by 20\%" is now escaped like every other
% percent sign in this file; a bare one starts a LaTeX comment if the field
% is ever typeset.
@inproceedings{Baluch_Itti08vss,
author    = {F. Baluch and L. Itti},
title     = {Effects of training on perceptual salience},
abstract  = { Learning on a visual search task involves plasticity at one
    or more levels of the visual cortex. Does this
    plasticity boost the target features and suppress
    distractors in a manner that would make the target
    more perceptually salient? We address this question
    by designing a challenging, attentionally-demanding
    conjunction search task, where each colored Gabor
    patch item is defined by a conjunction of 3 features
    (hue, orientation and spatial frequency). Three
    subjects' eye movements were recorded while they
    searched for a target embedded among distractors in
    1/f noise. Once the target is spotted subjects
    report its location and are given feedback based on
    whether they made the right choice or not. Subjects
    perform three 100-trial search sessions. Each trial
    had unique targets and distractors, so subjects
    gained general task expertise rather than expertise
    with specific stimuli.  Accuracy improved
    significantly (one-way ANOVA p[[lt]]0.005) from
    session to session and subjects achieved on average
    a 15\% boost in accuracy in locating the
    target. Further, the trajectories of subjects' eye
    movements through the three dimensional feature
    space were analyzed and the average Euclidean
    distance to the target, within the feature space,
    decreases from session to session. We also found
    that subjects make first saccades towards items
    closer to the target in feature space from one
    session to the next. Moreover, the average Euclidean
    distance of first saccade target items from the
    search target (in feature space) was reduced by 20\%
    from the first session to the last. These results
    provide evidence for subjects making saccades
    towards items that are more similar to the target
    during the course of the sessions. These saccades
    towards target-like items suggest that these items
    are more perceptually salient and become even more
    so with training.  },
booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
year      = {2008},
month     = {May},
type      = {bu;sc},
file      = {http://ilab.usc.edu/publications/doc/Baluch_Itti08vss.pdf},
review    = {abs/conf}
}

% Siagian and Itti, VSS08 conference abstract comparing gist models on
% scene categorization (review class abs/conf).
% Abstract typos fixed: "attributed their" now reads "attributed to their",
% and "fo other models" now reads "to other models".
% NOTE(review): the bare ampersands in the in-text citations are left
% untouched. LaTeX would need them written as \&, but whether the bibTOhtml
% converter accepts that escaped form is not visible here - confirm before
% changing them.
@inproceedings{Siagian_Itti08vss,
author    = {C. Siagian and L. Itti},
title     = {Comparison of gist models in rapid scene categorization tasks},
abstract  = {The capacity of humans to perform a number of complex visual
    tasks such as scene categorization and object
    detection in as little as 100ms has been attributed
    to their ability to rapidly extract the gist of a
    scene. Existing models of gist utilize various types
    of low-level features, color (Ulrich & Nourbakhsh
    2001), Fourier component profiles (Oliva & Torralba
    2001), textures (Renniger & Malik 2004), steerable
    wavelets (Torralba, et. al. 2003), and a combination
    of these (Siagian & Itti 2007). Some of the methods
    compute feature histograms from the whole image,
    while others encode rough spatial information by
    using a predefined grid system. Here, we
    systematically compare gist models with
    categorization tasks of increasing difficulty. We
    investigate how far these low level features can
    describe complicated real-world scenes. With three
    outdoor test sites - a building complex (26368
    training images, 13965 testing images), a park full
    of trees (66291/26397 images), and a spacious
    open-field area (82747/34711 images) - which provide
    visually distinct challenges, we first ask the
    question of which scene belongs to which site. As a
    baseline for comparison to other models we used the
    classification rate of our combination model (95\%
    success). Then we divide each site into nine
    distinct segments, to test finer classification
    ability (baseline of 85\% success). We finally
    divide the segments into smaller geographical
    regions, making it an even harder to do scene
    classification as the regions become more similar
    visually. This, in turn, forces the competing
    systems to look for detailed attributes to
    exploit. The hypothesis is that each particular
    system (or more importantly, the features they use)
    will be able to distinguish some segments but not
    others.  },
booktitle = {Proc. Vision Science Society Annual Meeting (VSS08)},
year      = {2008},
month     = {May},
type      = {bu;sc},
file      = {http://ilab.usc.edu/publications/doc/Siagian_Itti08vss.pdf},
review    = {abs/conf}
}

% Itti, invited talk at the Cold Spring Harbor Swartz conference, Apr 2008.
% Venue spelling fixed: "adavnces" now reads "advances" and "Banburry" now
% reads "Banbury".
@invited{Itti08csh,
author    = {L. Itti},
title     = {Modeling bottom-up and top-down visual attention in humans and
    monkeys},
booktitle = {Cold Spring Harbor Swartz conference on theoretical and
    experimental advances in visual and auditory attention, Cold Spring
    Harbor Labs Banbury Center, New York},
month     = {Apr},
year      = {2008},
type      = {bu;td;mod}
}

% Elazary and Itti, Journal of Vision 8(3:3), Mar 2008. The "if" field is a
% site-specific annotation holding the journal impact factor text.
@article{Elazary_Itti08jov,
title    = {Interesting objects are visually salient},
author   = {L. Elazary and L. Itti},
abstract = {How do we decide which objects in a visual scene are more
    interesting? While intuition may point toward
    high-level object recognition and cognitive
    processes, here we investigate the contributions of
    a much simpler process, low-level visual
    saliency. We used the LabelMe database (24,863
    photographs with 74,454 manually outlined objects)
    to evaluate how often interesting objects were among
    the few most salient locations predicted by a
    computational model of bottom-up attention. In 43\%
    of all images the model's predicted most salient
    location falls within a labeled region (chance
    21\%). Furthermore, in 76\% of the images (chance
    43\%), one or more of the top three salient
    locations fell on an outlined object, with
    performance leveling off after six predicted
    locations. The bottom-up attention model has neither
    notion of object nor notion of semantic
    relevance. Hence, our results indicate that
    selecting interesting objects in a scene is largely
    constrained by low-level visual properties rather
    than solely determined by higher cognitive
    processes.},
journal  = {Journal of Vision},
volume   = {8},
number   = {3:3},
pages    = {1-15},
year     = {2008},
month    = {Mar},
type     = {td;mod;cv;sc},
file     = {http://ilab.usc.edu/publications/doc/Elazary_Itti08jov.pdf},
if       = {2006 impact factor: 3.753}
}

% Itti, invited talk at the NETI workshop, Austin, Texas, Mar 2008.
% Spelling fixed: workshop name "Taska" now reads "Tasks"; in the abstract,
% "become of topic" now reads "become a topic", "bottom- up" now reads
% "bottom-up", and "objet" now reads "object".
@invited{Itti08neti,
author    = {L. Itti},
title     = {Quantitative analysis of perceptual salience at the point of
    gaze in humans and monkeys},
abstract  = {Visual processing of complex natural environments requires
    animals to combine, in a highly dynamic and adaptive
    manner, sensory signals that originate from the
    environment (bottom-up) with behavioral goals and
    priorities dictated by the task at hand
    (top-down). In the visual domain, bottom-up and
    top-down guidance of attention towards salient or
    behaviorally relevant targets have both been studied
    and modeled extensively. More recently, the
    interaction between bottom-up and top-down control
    of attention has also become a topic of interest. A
    number of neurally-inspired computational models
    have emerged which integrate components for the
    computation of bottom-up salience maps, top-down
    attention biasing, rapid computation of the 'gist'
    or rough context of a scene, object recognition, and
    some higher-level cognitive reasoning functions. I
    will review a number of such efforts, which aim at
    building models that can both process real-world
    inputs in robust and flexible ways, and perform
    cognitive reasoning on the symbols extracted from
    these inputs. I will draw from examples in the
    biological/computer vision fields, including
    algorithms for complex scene understanding, robot
    navigation, and animation of virtual humans.},
booktitle = {Workshop on Natural Environments, Tasks, and Intelligence
    (NETI), Austin, Texas},
month     = {Mar},
year      = {2008},
type      = {bu;td;mod}
}

% Itti, COSYNE data-sharing workshop abstract, Mar 2008 (review class
% abs/wkshp). Venue city fixed: "Salk Lake City" now reads "Salt Lake City".
@inproceedings{Itti08cos,
author    = {L. Itti},
title     = {Eye movements during free viewing of natural videos},
booktitle = {Computational and Systems Neuroscience (COSYNE), workshop
    on data sharing, Salt Lake City, Utah},
abstract  = {There is growing interest in the neuroscience community to
    understand how humans and other animals interact
    with the complexities of the real world. Departing
    from conventional laboratory experiments based on
    simple, well-controlled and contrived stimulus
    response paradigms, natural stimuli require more
    sophisticated data analysis techniques and
    computational models to interpret the data. There is
    a pressing need to develop such techniques and
    models, as many of the most interesting questions
    about human and animal vision, cognition, and
    goal-directed behavior can only be studied with
    complex natural stimuli. We believe that a key
    enabler/catalyst to the development of further
    analysis tools and computational models is to make
    existing data widely available.  I will describe our
    NSF-facilitated free, unrestricted sharing of a body
    of human eye-tracking data traces obtained while
    normal volunteers inspected complex video stimuli
    (TV programs, outdoors videos, video games).},
month     = {Mar},
year      = {2008},
type      = {td;mod;psy},
review    = {abs/wkshp}
}

% Peters and Itti, NIPS*2007 proceedings paper (Vol. 20, pages 1145-1152;
% review class full/conf). The "if" field is a site-specific annotation
% holding the acceptance-rate text.
@inproceedings{Peters_Itti08nips,
author    = {R. J. Peters and L. Itti},
title     = {Congruence between model and human attention
    reveals unique signatures of critical visual events},
abstract  = {Current computational models of bottom-up and top-down
    components of attention are predictive of eye
    movements across a range of stimuli and of simple,
    fixed visual tasks (such as visual search for a
    target among distractors). However, to date there
    exists no computational framework which can reliably
    mimic human gaze behavior in more complex
    environments and tasks, such as driving a vehicle
    through traffic.  Here, we develop a hybrid
    computational/behavioral framework, combining simple
    models for bottom-up salience and top-down
    relevance, and looking for changes in the predictive
    power of these components at different critical
    event times during 4.7 hours (500,000 video frames)
    of observers playing car racing and flight combat
    video games. This approach is motivated by our
    observation that the predictive strengths of the
    salience and relevance models exhibit reliable
    temporal signatures during critical event windows in
    the task sequence --- for example, when the game
    player directly engages an enemy plane in a flight
    combat game, the predictive strength of the salience
    model increases significantly, while that of the
    relevance model decreases significantly. Our new
    framework combines these temporal signatures to
    implement several event detectors. Critically, we
    find that an event detector based on fused
    behavioral and stimulus information (in the form of
    the model's predictive strength) is much stronger
    than detectors based on behavioral information alone
    (eye position) or image information alone (model
    prediction maps). This approach to event detection,
    based on eye tracking combined with computational
    models applied to the visual input, may have useful
    applications as a less-invasive alternative to other
    event detection approaches based on neural
    signatures derived from EEG or fMRI recordings.},
year      = {2008},
month     = {Jun},
publisher = { MIT Press },
address   = { Cambridge, MA },
booktitle = { Advances in Neural Information Processing Systems, Vol.
    20 (NIPS*2007) },
pages     = {1145-1152},
type      = {bu ; cv ; td ; eye ; mod},
file      = {http://ilab.usc.edu/publications/doc/Peters_Itti08nips.pdf},
review    = {full/conf},
if        = {2007 oral acceptance rate: 10.4\%}
}

% Siagian and Itti, ICRA 2008 paper on storing and recalling data for
% vision localization (review class full/conf). The "if" field is a
% site-specific annotation holding the acceptance-rate text.
@inproceedings{Siagian_Itti08icra,
author    = {C. Siagian and L. Itti},
title     = {Storing and Recalling Information for Vision Localization},
abstract  = {In implementing a vision localization system, a crucial
    issue to consider is how to efficiently store and
    recall the necessary information, so that the robot
    is not only able to accurately localize itself, but
    does so in a timely manner. In the presented system,
    we discuss a strategy to minimize the amount of
    stored data by analyzing the strengths and
    weaknesses of several cooperating recognition
    modules and by using them through a prioritization
    scheme which orders the data entries from the most
    likely to match to the least likely. We validate the
    system through a series of experiments in three
    large scale outdoor environments: a building complex
    (126x180ft. area, 3583 testing images), a
    vegetation-filled park (270x360ft.  area, 6006
    testing images), and an open-field area
    (450x585ft. area, 8823 testing images) - each with
    its own set of challenges.  Not only is the system
    able to localize in these environments (on average
    3.46ft., 6.55ft., 12.96ft. of error, respectively),
    it does so while searching through only 7.35\%,
    3.50\%, and 6.12\% of all the stored information,
    respectively.},
year      = {2008},
month     = {May},
booktitle = {IEEE International Conference on Robotics and Automation (ICRA),
    Pasadena, California},
type      = {bu; sc},
file      = {http://ilab.usc.edu/publications/doc/Siagian_Itti08icra.pdf},
review    = {full/conf},
if        = {2008 acceptance rate: 43\%}
}

% Peters and Itti, ACM Transactions on Applied Perception 5(2), 2008.
% The pages field carries an article number ("Article 8"), not a page range.
@article{Peters_Itti08tap,
author   = {R. J. Peters and L. Itti},
title    = {Applying computational tools to predict gaze direction in
    interactive visual environments},
abstract = {Future interactive virtual environments will be
    ``attention-aware,'' capable of predicting, reacting
    to, and ultimately influencing the visual attention
    of their human operators. Before such environments
    can be realized, it is necessary to operationalize
    our understanding of the relevant aspects of visual
    perception, in the form of fully-automated
    computational heuristics that can efficiently
    identify locations that would attract human gaze in
    complex dynamic environments. One promising approach
    to designing such heuristics draws on ideas from
    computational neuroscience. We compared several
    neurobiologically-inspired heuristics with eye
    movement recordings from five observers playing
    video games, and found that heuristics which detect
    outliers from the global distribution of visual
    features were better predictors of human gaze than
    were purely local heuristics. Heuristics sensitive
    to dynamic events performed best overall. Further,
    heuristic prediction power differed more between
    games than between different human observers. While
    other factors clearly also influence eye position,
    our findings suggest that simple neurally-inspired
    algorithmic methods can account for a significant
    portion of human gaze behavior in a naturalistic,
    interactive setting. These algorithms may be useful
    in the implementation of interactive virtual
    environments, both to predict the cognitive state of
    human operators, as well as to effectively endow
    virtual agents in the system with human-like visual
    behavior.},
journal  = {ACM Transactions on Applied Perception},
year     = {2008},
volume   = {5},
number   = {2},
pages    = {Article 8},
type     = {bu ; cv ; td ; eye ; mod},
file     = {http://ilab.usc.edu/publications/doc/Peters_Itti08tap.pdf}
}

% Itti, invited talk at the Okinawa Institute of Science and Technology,
% Jan 2008.
@invited{Itti08oki,
author    = {L. Itti},
title     = {Quantitative analysis of perceptual salience at the point of
    gaze in humans and monkeys},
booktitle = {Okinawa Institute of Science and Technology, Okinawa, Japan},
month     = {Jan},
year      = {2008},
type      = {bu;td;mod}
}

% Siagian and Itti, The Neuromorphic Engineer, Dec 2007.
% The file URL is normalised to match the other file fields: padding spaces
% inside the braces removed and the host written in lowercase (host names
% are case-insensitive, so the link target is unchanged).
@article{Siagian_Itti07nme,
author   = {C. Siagian and L. Itti},
title    = {Biologically inspired mobile-robot self localization},
journal  = {The Neuromorphic Engineer},
abstract = {Using both global and local visual features, 'Beobot' can
    find its own position in the environment.},
pages    = {1-2},
month    = {Dec},
year     = {2007},
type     = {bu;sc},
file     = {http://ilab.usc.edu/publications/doc/Siagian_Itti07nme.pdf}
}
%volume={1},
%number={1},

% Berg, Boehnke, Baldi, Munoz and Itti, SFN'07 conference abstract modeling
% superior colliculus adaptation with Bayesian surprise (review class
% abs/conf).
% Abstract typos fixed: "a cells receptive field" now reads "a cell's
% receptive field", and "Bayes theorem" now reads "Bayes' theorem".
@inproceedings{Berg_etal07sfn,
author    = {D. J. Berg and S. E. Boehnke and P. F. Baldi and D. P. Munoz
    and L. Itti},
title     = {Modeling adaptation responses in the superior colliculus
    using a Bayesian theory of surprise},
abstract  = {A fundamental question in visual neuroscience is the role of
    adaptation and habituation in sensory
    processing. Here we demonstrate that a simple
    mathematical model of Bayesian surprise can explain
    the adaptation responses of visual neurons in the
    Superior Colliculus. We previously proposed a
    Bayesian surprise model (Itti and Baldi, '06) to
    quantify information contained in a piece of data by
    measuring the effect this data has on an observer
    whether the observer be a single neuron or an
    organism. We call this new kind of information
    'surprise'. This is fundamentally different from
    Shannon information (Shannon, '48) which only takes
    into account the probability of the data. Surprise
    transforms the observer's prior beliefs into
    posterior beliefs, according to Bayes'
    theorem. Information can now be measured in a
    natural way by the distance (relative entropy)
    between the prior and posterior distributions of the
    observer over the available space of hypotheses
    (i.e., beliefs about the data). Surprise is
    important in situations where new data changes
    beliefs about sensory information. This mechanism is
    particularly important for phenomena such as
    adaptation and habituation. To test this
    computational theory we modeled recordings from
    neurons in superficial (SCs, n=18) and intermediate
    (SCi, n=36) layers of the Superior Colliculus of two
    awake behaving monkeys (Macaca mulatta). A simple
    paradigm was used where 7 flashes of light (55 ms in
    duration) were repeatedly presented in a cell's
    receptive field. We introduced surprising changes by
    altering the intensity of the 4th flash on rare
    trials (30\%). This oddball stimulus was of brighter
    intensity (10\%), dimmer intensity (10\%), or absent
    (10\%). Additionally, inter-stimulus interval was
    varied from 75-800 ms to assess the time course of
    habituation or adaptation. Firing rates to the
    oddball stimuli in superficial neurons represented
    sensory adaptation, while responses in SCi neurons
    showed both adaptation and habituation (see Boehnke
    et al.). In the control condition, neural firing
    rate decreased significantly after the first
    presentation of the stimulus and then
    stabilized. This can be modeled with a single time
    constant representing the decay of beliefs about the
    information content of the stimulus. We find that
    surprise tracks adaptation effects seen on the peak
    magnitude of the response for SCs in all
    conditions. Thus we can quantify neuronal responses
    in the SCs as surprise units.},
month     = {Nov},
year      = {2007},
booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'07)},
type      = {mod;su;phy},
review    = {abs/conf}
}


@inproceedings{Boehnke_etal07sfn,
  author    = {S. E. Boehnke and D. J. Berg and R. A. Marino and P. F. Baldi
               and L. Itti and D. P. Munoz},
  title     = {Adaptation, habituation and dishabituation of visual responses
               in the superior colliculus},
  booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'07)},
  month     = {Nov},
  year      = {2007},
  type      = {mod;phy;su},
  review    = {abs/conf},
  abstract  = {When stimuli are repeatedly flashed into the receptive field
    of visual neurons in the superior colliculus (SC), the response
    magnitude decreases (e.g., in a cue-target task). This effect could
    be due to 'adaptation' - a lower level mechanism like pupil
    constriction, or 'habituation' - the non-associative learning
    mechanism by which an organism stops responding to an irrelevant
    stimulus but recovers the response (dishabituates) after a change in
    stimulus properties. We sought to characterize the changes in
    responses that could be attributed to adaptation and habituation in
    superficial (SCs, n=18) and intermediate (SCi, n=36) layer neurons
    using a paradigm adopted from the ideas of Sokolov (1963). Two
    monkeys (Macaca mulatta) were rewarded for fixating a central
    fixation point while a series of 7 successive stimuli were flashed
    briefly (55 ms duration; 75-800 ms interstimulus interval (ISI)) in
    the receptive field of the neuron. On 70\% of trials all flashed
    stimuli were identical, while on others, the 4th stimulus was either
    brighter, dimmer or absent (10\% each). If reduced neural
    responsiveness is due to habituation, some recovery of the response
    (dishabituation) should occur to any oddball stimulus. However, if
    the reduced response is due to adaptation, the response should be
    further reduced after the brighter stimulus, but recover after the
    dimmer or absent stimulus. For the typical ISI of 200ms, the largest
    decrease in response magnitude (> 60\% in SCs and ~50\% in SCi) was
    to the second stimulus, and subsequent stimuli resulted in only
    small further reductions. The shorter the ISI the greater these
    reductions, with responses to stimuli 2-7 often being obliterated at
    very short ISIs (<100ms). The onset latency of the visual response
    increased with each stimulus so that on average the 7th stimulus
    response initiated nearly 20ms later than the 1st. These patterns
    were globally similar in SCs and SCi, but there were greater changes
    to the 3-7th stimuli in many SCi neurons. Responses to oddball
    stimuli in SCs neurons were suggestive only of sensory adaptation,
    while responses in SCi neurons showed features of both adaptation
    and habituation because a dishabituation signature in response to
    brighter or dimmer stimuli was present in the late visual response,
    and in the subsequent interstimulus interval. The adaptation
    responses of neurons in the SCs were more homogenous compared to
    those in the SCi, which showed considerable diversity in adaptation
    and habituation response properties. The adaptation features of
    responses to all stimuli, including oddballs, were successfully
    modeled using a Bayesian Surprise model (see Berg et al.).}
}


@inproceedings{White_etal07sfn,
  author    = {B. J. White and R. A. Marino and S. E. Boehnke and L. Itti
               and J. Theeuwes and D. P. Munoz},
  title     = {Interactions between endogenous and exogenous neural activity
               in the superior colliculus},
  booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'07)},
  month     = {Nov},
  year      = {2007},
  type      = {phy;eye},
  review    = {abs/conf},
  abstract  = {The oculomotor capture paradigm (Theeuwes et al. 1999) is an
    ideal tool to explore goal driven target selection
    without the contamination of visual transients on
    the goal related activity. Using this task, we
    explored neural correlates of the interaction
    between stimulus-driven and goal-driven oculomotor
    behavior in the superior colliculus (SC). Two
    monkeys were trained on the task in which one of six
    stimuli, equidistant from fixation, becomes a target
    singleton through an isoluminant color change in the
    remaining five items. Simultaneously, on half the
    trials an additional abrupt-onset distractor
    appeared either near or far from the saccade
    goal. The monkeys were required to make a saccade to
    the odd colored target. Behaviorally, the results
    were not unlike that found in humans. Correctly
    directed saccades showed longer latencies on
    distractor-present trials and shorter latencies when
    the eyes were overtly captured by the distractor. In
    addition, the proportion of errors to the
    abrupt-onset distractor and other non-target items
    was always greatest when the distractor appeared
    near the goal of the saccade. We recorded single
    unit activity in the intermediate SC and found that
    pretarget activity could predict whether the monkey
    would subsequently make an error. In addition, on
    correctly directed saccades where the abrupt-onset
    distractor appeared near the goal of the saccade,
    there was a suppression of the endogenous-related
    activity that was not evident in the remote
    distractor condition. This suppression was also
    correlated with the increase in saccade latency. The
    results are consistent with the hypothesis that
    endogenous and exogenous activity is combined in the
    intermediate SC, and compete through lateral
    interactions to guide oculomotor behavior.}
}

@inproceedings{Itti_etal07sfn,
  author    = {L. Itti and M. Yoshida and D. J. Berg and T. Ikeda
               and R. Kato and K. Takaura and T. Isa},
  title     = {Saliency-based guidance of spontaneous saccades in monkeys
               with unilateral lesion of primary visual cortex},
  booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'07)},
  month     = {Nov},
  year      = {2007},
  type      = {mod;psy;bu;eye},
  review    = {abs/conf},
  abstract  = {Primary visual cortex (area V1) is the entry point of visual
    processing into the primate cortex. Yet, human and
    animal studies of V1 lesions have demonstrated a
    'blindsight' phenomenon, whereby residual
    visually-guided behavior remains even when large
    portions of V1 are absent. However, little is known
    quantitatively of how this residual vision (possibly
    subcortically-mediated) differs from normal
    (cortically-mediated) vision. We analyzed eye
    movements (89,285 saccades) of five macaque monkeys
    (two normals, three with complete unilateral V1
    ablation) watching ~54 minutes of television (97,051
    video frames). A computational model of bottom-up
    attention quantified how salient visual features may
    guide gaze into the normal vs. the lesioned
    hemifield. To eliminate stimulus biases, we randomly
    presented all video clips twice, original and
    horizontally flipped. We quantified the extent to
    which salient stimuli attracted gaze of each monkey
    by computing, for saccades tallied along the eight
    principal directions, a model-based bottom-up
    guidance score (chance level 0.5; ideal upper bound
    1.0; practical inter-observer score previously
    measured as the extent to which three control
    monkeys predict gaze of a fourth monkey ~0.6). For
    the normals as well as lesioned monkeys, we found
    that saccades in all eight principal directions were
    guided towards salient locations, significantly
    above chance (scores 0.585+/-0.003 to 0.649+/-0.004,
    t-tests p<0.00001, 2,021 to 6,039 saccades per
    monkey in each direction). However, although
    lesioned monkeys overall scored lower, there was
    little difference in bottom-up guidance with saccade
    direction (scores 2\% - 4\% lower for saccades
    directed into vs. away from the lesioned
    hemifield). Our results suggest that the extent to
    which monkey saccades are attracted towards salient
    locations during natural vision may be less affected
    by the absence of primary visual cortex than
    previously considered.}
}

@press{Anonymous07ti,
  author  = {A. Anonymous},
  title   = {Soon, robots with human gaze},
  journal = {The Times of India},
  month   = {Oct},
  year    = {2007},
  type    = {bu;cv},
  file    = {http://iLab.usc.edu/publications/doc/Anonymous07ti.pdf}
}

@press{Dume07ns,
  author  = {B. Dume},
  title   = {Virtual human has a roving eye},
  journal = {The New Scientist},
  month   = {Oct},
  year    = {2007},
  type    = {bu;cv},
  file    = {http://iLab.usc.edu/publications/doc/Dume07ns.pdf}
}

@inproceedings{Siagian_Itti07iros,
  author    = {C. Siagian and L. Itti},
  title     = {Biologically-Inspired Robotics Vision Monte-Carlo Localization
               in the Outdoor Environment},
  booktitle = {Proc. IEEE/RSJ International Conference on Intelligent
               Robots and Systems (IROS)},
  month     = {Oct},
  year      = {2007},
  type      = {td;bu;sc;mod},
  review    = {full/conf},
  file      = {http://ilab.usc.edu/publications/doc/Siagian_Itti07iros.pdf},
  if        = {2007 acceptance rate: 52.4\%},
  abstract  = {We present a robot localization system using
    biologically-inspired vision.  Our system models two
    extensively studied human visual capabilities: (1)
    extracting the ``gist'' of a scene to produce a
    coarse localization hypothesis, and (2) refining it
    by locating salient landmark regions in the scene.
    Gist is computed here as a holistic statistical
    signature of the image, yielding abstract scene
    classification and layout.  Saliency is computed as
    a measure of interest at every image location,
    efficiently directing the time-consuming landmark
    identification process towards the most likely
    candidate locations in the image.  The gist and
    salient landmark features are then further processed
    using a Monte-Carlo localization algorithm to allow
    the robot to generate its position.  We test the
    system in three different outdoor environments -
    building complex (126x180ft. area, 3794 testing
    images), vegetation-filled park (270x360ft. area,
    7196 testing images), and open-field park
    (450x585ft. area, 8287 testing images) - each with
    its own challenges. The system is able to localize,
    on average, within 6.0, 10.73, and 32.24 ft.,
    respectively, even with multiple kidnapped-robot
    instances.}
}

@invited{Itti07iros,
  author    = {L. Itti},
  title     = {Neuromorphic vision and attention for mobile robots},
  booktitle = {IEEE/RSJ IROS 2007 Workshop: From sensors to human
               spatial concepts, San Diego, CA},
  month     = {Oct},
  year      = {2007},
  type      = {bu;td;mod},
  abstract  = {In recent years, a number of neurally-inspired computational
    models have emerged which demonstrate unparalleled
    performance, flexibility, and adaptability in coping
    with real-world inputs. In the visual domain, in
    particular, such models are achieving great strides
    in tasks including focusing attention onto the most
    important locations in a scene, recognizing attended
    objects, computing contextual information in the
    form of the ``gist'' of the scene, and
    planning/executing visually-guided motor actions,
    among many other functions. However, these models
    have not yet been able to demonstrate much
    higher-level or cognitive computation ability. On
    the other hand, symbolic models from artificial
    intelligence have reached significant maturity in
    their cognitive reasoning abilities, but the worlds
    in which they can operate have been necessarily
    simplified (e.g., a chess board, a virtual maze). In
    this talk I will present the latest developments in
    our and other laboratories which attempt to bridge
    the gap between these two disciplines, neural
    modeling and artificial intelligence, in developing
    the next generation of robots. I will briefly review
    a number of efforts which aim at building models
    that can both process real-world inputs in robust
    and flexible ways, and perform cognitive reasoning
    on the symbols extracted from these inputs. I will
    draw from examples in the biological/computer vision
    fields, including algorithms for complex scene
    understanding, and for robot navigation.}
}

@press{Anonymous07usc,
  author  = {A. Anonymous},
  title   = {The next generation of innovators},
  journal = {www.usc.edu main site},
  month   = {Oct},
  year    = {2007},
  type    = {cv},
  file    = {http://ilab.usc.edu/publications/doc/Anonymous07usc.pdf}
}

@invited{Itti07ngp,
  author    = {L. Itti},
  title     = {Bottom-up and top-down visual attention in humans and monkeys},
  booktitle = {USC Neuroscience Retreat, Laguna Beach, CA},
  month     = {Sep},
  year      = {2007},
  type      = {bu;td;mod}
}

@invited{Itti07nae,
  author    = {L. Itti},
  title     = {Computational Cognitive Neuroscience and Its Applications},
  booktitle = {National Academy of Engineering, 2007 U.S. Frontiers
               of Engineering Symposium, Redmond, WA},
  month     = {Sep},
  year      = {2007},
  type      = {bu;td;mod}
}

@invited{Itti07ecem,
  author    = {L. Itti},
  title     = {Bottom-up and top-down visual attention in humans and monkeys},
  booktitle = {European Conference on Eye Movements (ECEM), Potsdam, Germany},
  month     = {Aug},
  year      = {2007},
  type      = {bu;td;mod},
  abstract  = {Many tasks require that we direct attention to the most
    ``relevant'' entities in our visual
    environment. While much progress has been made in
    investigating experimentally how humans may operate
    such goal-based attentional selection, very little
    is understood of the general mathematical principles
    and neuro-computational architectures that subserve
    the observed behavior. I will describe recent
    computational work which attacks the problem of
    developing models of visual attentional selection
    that are more flexible and can be strongly modulated
    by the task at hand. I will back the proposed
    architectures up by comparing their predictions to
    behavioral recordings from humans and monkeys. I
    will show examples of applications of these models
    to real-world vision challenges, using complex
    stimuli from television programs or modern immersive
    video games.}
}

@incollection{Itti07sp,
  author    = {L. Itti},
  title     = {Visual Salience},
  booktitle = {Scholarpedia - the free peer-reviewed encyclopedia},
  volume    = {2},
  number    = {9},
  pages     = {3327},
  month     = {Jul},
  year      = {2007},
  type      = {rev;bu;td},
  file      = {http://www.scholarpedia.org/article/Visual_Salience},
  abstract  = {Visual salience (or visual saliency) is the distinct
    subjective perceptual quality which makes some items in the world
    stand out from their neighbors and immediately grab our attention.}
}

@article{Einhaeuser_etal07jov,
  author   = {W. Einhaeuser and T. N. Mundhenk and P. F. Baldi
              and C. Koch and L. Itti},
  title    = {A bottom-up model of spatial attention predicts human
              error patterns in rapid scene recognition},
  journal  = {Journal of Vision},
  volume   = {7},
  number   = {10},
  pages    = {1-13},
  month    = {Jul},
  year     = {2007},
  type     = {td;bu;su;mod;psy},
  file     = {http://ilab.usc.edu/publications/doc/Einhaeuser_etal07jov.pdf},
  if       = {2005 impact factor: 3.469},
  abstract = {Humans demonstrate a peculiar ability to detect complex
    targets in rapidly presented natural scenes. Recent
    studies suggest that (nearly) no focal attention is
    required for overall performance in such
    tasks. Little is known, however, of how detection
    performance varies from trial to trial and which
    stages in the processing hierarchy limit
    performance: bottom-up visual processing
    (attentional selection and/or recognition) or
    top-down factors (e.g., decision-making, memory, or
    alertness fluctuations)? To investigate the relative
    contribution of these factors, eight human observers
    performed an animal detection task in natural scenes
    presented at 20 Hz. Trial-by-trial performance was
    highly consistent across observers, far exceeding
    the prediction of independent errors. This
    consistency demonstrates that performance is not
    primarily limited by idiosyncratic factors but by
    visual processing. Two statistical stimulus
    properties, contrast variation in the target image
    and the information-theoretical measure of
    ``surprise'' in adjacent images, predict performance
    on a trial-by-trial basis. These measures are
    tightly related to spatial attention, demonstrating
    that spatial attention and rapid target detection
    share common mechanisms. To isolate the causal
    contribution of the surprise measure, eight
    additional observers performed the animal detection
    task in sequences that were reordered versions of
    those all subjects had correctly recognized in the
    first experiment. Reordering increased surprise
    before and/or after the target while keeping the
    target and distractors themselves
    unchanged. Surprise enhancement impaired target
    detection in all observers. Consequently, and
    contrary to several previously published findings,
    our results demonstrate that attentional
    limitations, rather than target recognition alone,
    affect the detection of targets in rapidly presented
    visual sequences.}
}

@press{Anonymous07e,
  author  = {A. Anonymous},
  title   = {SeaBee II Swims for USC in This Week's International
             Underwater Robot Contest},
  journal = {www.exduco.net - Graduate schools and programs guide},
  month   = {Jul},
  year    = {2007},
  type    = {cv},
  file    = {http://iLab.usc.edu/publications/doc/Anonymous07e.pdf}
}

@invited{Itti07urg,
  author    = {L. Itti},
  title     = {The role of attention as an enabler},
  booktitle = {DARPA URGENT meeting, San Diego, California},
  month     = {Jul},
  year      = {2007},
  type      = {bu;td;mod}
}

@invited{Itti07nsf,
  author    = {L. Itti},
  title     = {Human and monkey eye movements under natural free viewing},
  booktitle = {First NSF data sharing workshop, University of Maryland, MD},
  month     = {Jun},
  year      = {2007},
  type      = {bu;td;eye;mod}
}

@invited{Itti07crcns,
  author    = {L. Itti},
  title     = {Characterizing Bayesian surprise in humans and monkeys},
  booktitle = {NSF CRCNS annual meeting, University of Maryland, MD},
  month     = {Jun},
  year      = {2007},
  type      = {bu;td;su;mod}
}

@invited{Itti07mun,
  author    = {L. Itti},
  title     = {Integrating low-level visual and high-level cognitive processing},
  booktitle = {International Symposium on Dynamics of Attentional Control,
               Munich, Germany},
  month     = {Jun},
  year      = {2007},
  type      = {bu;td;sc;mod}
}

@inproceedings{Peters_Itti07cvpr,
  author    = {R. J. Peters and L. Itti},
  title     = {Beyond bottom-up: Incorporating task-dependent influences
               into a computational model of spatial attention},
  booktitle = {Proc. IEEE Conference on Computer Vision and Pattern
               Recognition (CVPR)},
  address   = {Minneapolis, MN},
  month     = {Jun},
  year      = {2007},
  type      = {bu;cv;td;eye;mod},
  review    = {full/conf},
  file      = {http://ilab.usc.edu/publications/doc/Peters_Itti07cvpr.pdf},
  if        = {2007 acceptance rate: 28\%},
  abstract  = {A critical function in both machine vision and biological
    vision systems is attentional selection of scene
    regions worthy of further analysis by higher-level
    processes such as object recognition. Here we
    present the first model of spatial attention that
    (1) can be applied to arbitrary static and dynamic
    image sequences with interactive tasks and (2)
    combines a general computational implementation of
    both bottom-up (BU) saliency and dynamic top-down
    (TD) task relevance; the claimed novelty lies in the
    combination of these elements and in the fully
    computational nature of the model. The BU component
    computes a saliency map from 12 low-level
    multi-scale visual features. The TD component
    computes a low-level signature of the entire image,
    and learns to associate different classes of
    signatures with the different gaze patterns recorded
    from human subjects performing a task of
    interest. We measured the ability of this model to
    predict the eye movements of people playing
    contemporary video games. We found that the TD model
    alone predicts where humans look about twice as well
    as does the BU model alone; in addition, a combined
    BU*TD model performs significantly better than
    either individual component. Qualitatively, the
    combined model predicts some easy-to-describe but
    hard-to-compute aspects of attentional selection,
    such as shifting attention leftward when approaching
    a left turn along a racing track. Thus, our study
    demonstrates the advantages of integrating BU
    factors derived from a saliency map and TD factors
    learned from image and task contexts in predicting
    where humans look while performing complex
    visually-guided behavior.}
}

@invited{Itti07bir,
  author    = {L. Itti},
  title     = {Building models that integrate low-level visual and
               high-level cognitive processing},
  booktitle = {International conference: Closing the gap between
               neurophysiology and behaviour: A computational modelling
               approach, Birmingham, U.K.},
  month     = {Jun},
  year      = {2007},
  type      = {bu;td;mod}
}

@invited{Itti07jsnc,
  author    = {L. Itti},
  title     = {Bottom-up and top-down guidance of attention towards
               visually surprising stimuli},
  booktitle = {Joint Symposium on Neural Computation, Pasadena, CA},
  month     = {May},
  year      = {2007},
  type      = {bu;td;mod}
}

@inproceedings{Elazary_Itti07vss,
  author    = {L. Elazary and L. Itti},
  title     = {Interesting objects in natural scenes are more salient},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  month     = {May},
  year      = {2007},
  type      = {mod;bu;psy},
  review    = {abs/conf},
  file      = {http://ilab.usc.edu/publications/doc/Elazary_Itti07vss.pdf},
  abstract  = {How do we decide which objects in a visual scene are more
    interesting? Intuition suggests a complex process of
    recognizing different candidate scene elements in
    turn, evaluating their identity and other attributes
    against behavioral preferences and goals, and
    finally deciding which among the candidates are more
    relevant and interesting. Here we investigate the
    contributions of a much simpler process,
    saliency-based visual attention. We used the
    publicly available LabelMe database of 24,863
    digital photographs in which 74,454 presumably
    interesting objects have been manually outlined. We
    evaluated how often these objects were among the few
    most salient locations by a computational model of
    bottom-up attention. We find that in 43 percent of
    all images the model's first fixation falls within a
    labeled region, twice above chance (21
    percent). Furthermore, within three fixations, the
    saliency map is able to pick a labeled region over
    85 percent of the time, with performance leveling
    off after six fixations. The bottom-up attention
    model has no notion of object nor of semantic
    relevance. Hence, our results indicate that
    selecting interesting objects in a scene is largely
    constrained by low-level visual properties of scene
    elements, rather than solely determined by
    recognition and higher cognitive processes. The
    saliency map is a strong predictor of what humans
    find interesting in complex natural scenes.}
}

@inproceedings{Berg_etal07vss,
  author    = {D. J. Berg and S. E. Boehnke and R. A. Marino and P. F. Baldi
               and D. P. Munoz and L. Itti},
  title     = {The Role of Bottom-Up and Top-Down Influences in Directing
               Primate Gaze Shifts},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  month     = {May},
  year      = {2007},
  type      = {mod;su;eye},
  review    = {abs/conf},
  abstract  = {We investigated differences in the propensity for human and
    monkey eye movements to be guided by bottom-up
    stimulus properties or cognitive factors. We
    recorded eye movements from 4 humans and 4 monkeys
    freely viewing 50 natural video clips (approximately
    20,000 frames). To quantify species differences in
    attentional allocation we used a computational model
    of visual surprise (Itti, Baldi, 2005). We measured
    the amount of surprise at the endpoints of saccadic
    eye movements and compared it to the amount of
    surprise at randomly shuffled eye positions. We
    found that humans and monkeys are attracted to
    surprising locations significantly above chance
    (p<$10^{-10}$) but with no significant species difference
    (p=0.5); however, interobserver agreement is
    significantly higher in humans than monkeys. When
    multiple monkeys simultaneously agreed on a common
    gaze target the location had significantly increased
    surprise (p=.0001), while humans showed slightly
    decreased surprise (p=0.6) for agreed gaze
    locations.  This indicates that monkeys agree on
    gaze targets for bottom-up and humans for cognitive
    factors. This is further supported by the
    observation that those strong attractors of gaze
    corresponded to different locations for humans and
    monkeys. We found that over the course of a clip
    monkeys remain consistently surprise driven, while
    humans showed significant differences over time
    (p<$10^{-10}$). We observed a similar trend in
    interobserver agreement. We conclude that humans and
    monkeys are in general equally driven by surprise,
    but humans tend to agree on locations containing
    scene specific semantic information while monkeys
    agree on locations containing visually surprising
    information. The time course of surprise and
    interobserver predictability for each species
    implies that top-down influences play a greater role
    in modulating the influence of bottom-up attention
    in humans than monkeys. This study indicates that
    monkeys serve as a good model for bottom-up
    attention in humans due to the limited effect of
    top-down influences.}
}

@inproceedings{Navalpakkam_Itti07vss,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Attentional modulation of tuning width, preferred features
               and gains during visual search},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  month     = {May},
  year      = {2007},
  type      = {mod;td;bu},
  review    = {abs/conf},
  abstract  = {Although attention is known to modulate neural activity,
    there has been much debate on whether it modulates
    tuning width, or preferred features, or gains. While
    some studies show evidence for changes in tuning
    width and preferred features without change in
    gains, other studies show evidence for gain
    modulation only. Here, we adopt a computational
    approach and ask what is the best way in which
    attention can modulate neural activity so as to
    maximize visual search performance. Our simulation
    results predict that all forms of modulation occur,
    but their utility varies with task difficulty due to
    target-distractor discriminability.  While
    modulation of tuning width contributes little in
    easy tasks where the target and distractor are very
    different, its contribution increases in difficult
    tasks where the target and distractor are
    similar. The opposite trend is shown for gain
    modulation, whose contribution decreases with
    increasing task difficulty. This suggests that the
    conflicting experimental observations in the field
    may be due to differences in tasks and in their
    difficulty. This calls for new experiments that
    systematically investigate neural modulation as a
    function of task difficulty.  }
}

@inproceedings{White_etal07vss,
  author    = {B. J. White and S. E. Boehnke and R. A. Marino and D. Talsma and
               L. Itti and J. Theeuwes and D. P. Munoz},
  title     = {Competition between exogenous and endogenous signals revealed
                  by saccade latency and saccade curvature in the
                  monkey},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {Natural visual environments contain multiple stimuli
                  competing for our attention, only one of which is
                  selected at a given moment. The processes underlying
                  this competition may be reflected in the time
                  required to generate a saccade to the goal stimulus
                  (Walker et al., 1997), and the nature of the saccade
                  trajectory (McPeek et al. 2003). We trained two
                  monkeys on an oculomotor capture paradigm (Theeuwes
                  et al. 1999) in which one of six homogenous stimuli
                  becomes a target singleton through a color change in
                  the other five items. The monkeys had to make a
                  saccade to the odd colored target. Simultaneously on
                  half the trials an additional distractor suddenly
                  appeared either near or far from the saccade
                  goal. We found that correctly directed saccades
                  showed longer latencies when the distractor was
                  present versus absent. In contrast, latencies were
                  shorter when the eyes were overtly captured by the
                  distractor. In addition, the proportion of capture
                  was always greatest when the distractor appeared
                  near the goal of the saccade. On correct trials,
                  saccades were often curved in the direction of the
                  near distractor. Furthermore, the eyes were also
                  captured by other items more often when the
                  distractor was near the saccade goal suggesting that
                  efforts to avoid the sudden onset occurred at a cost
                  of capture by other non-targets. The results are
                  consistent with the idea that the activity
                  associated with exogenous and endogenous signals
                  combine locally (Trappenberg et al. 2001).},
  type      = {psy;bu;td},
  review    = {abs/conf}
}

@inproceedings{Peters_Itti07vss,
  author    = {R. J. Peters and L. Itti},
  title     = {Integrating low-level and high-level visual influences on eye
                  movement behavior},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {We propose a comprehensive computational framework unifying
                  previous qualitative studies of high-level cognitive
                  influences on eye movements with quantitative
                  studies demonstrating the influence of low-level
                  factors such as saliency. In this framework, a
                  top-level ``governor'' uses high-level task
                  information to determine how best to combine
                  low-level saliency and gist-based task-relevance
                  maps into a single eye-movement priority map.  We
                  recorded the eye movements of six trained subjects
                  playing 18 different sessions of first-person
                  perspective video games (car racing, flight combat,
                  and ``first-person shooter'') and simultaneously
                  recorded the game's video frames, giving about 18
                  hours of recording for 15,000,000 eye movement
                  samples (240Hz) and 1.1TB of video data (640x480
                  pixels at 30Hz). We then computed measures of how
                  well the individual saliency and task-relevance maps
                  predicted observers' eye positions in each frame,
                  and probed for the role of the governor in
                  relationships between high-level task information --
                  such as altimeter and damage meter settings, or the
                  presence/absence of a target -- and the predictive
                  strength of the maps.  One such relationship
                  occurred in the flight combat game. In this game,
                  observers are actively task-driven while tracking
                  enemy planes, ignoring bottom-up saliency in favor
                  of task-relevant items like the radar screen; then,
                  after firing a missile, observers become passively
                  stimulus-driven while awaiting visual confirmation
                  of the missile hit. We confirmed this quantitatively
                  by analyzing the correspondence between saliency and
                  eye position across a window of +/-10s relative to
                  the time of 328 such missile hits. Around -200ms
                  (before the hit), the saliency correspondence begins
                  to rise, reaching a peak at +100ms (after the hit)
                  of 10-fold above the previous baseline, then is
                  suppressed below baseline at +800ms, and rebounds
                  back to baseline at +2000ms. Thus, one mechanism by
                  which high-level cognitive information can influence
                  eye movements is through dynamically weighting
                  competing saliency and task-relevance maps.},
  type      = {mod;bu;td;eye},
  review    = {abs/conf}
}

@inproceedings{Boehnke_etal07vss,
  author    = {S. E. Boehnke and D. J. Berg and P. F. Baldi and L. Itti and D. P. Munoz},
  title     = {Adaptation and habituation of visual responses in the
                  superficial and intermediate layers of the superior
                  colliculus (SC)},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {One neural correlate of visual attention is a decrease in
                  the neural response to a target after prior
                  presentation of an orienting 'cue' (e.g. inhibition
                  of return). This decreased responding could be the
                  result of repeated stimulation of a neuron's
                  receptive field resulting in either 'adaptation' --
                  a lower level mechanism related to neural fatigue,
                  or 'habituation' -- where an organism stops
                  responding to an irrelevant stimulus but recovers
                  the response after a change in stimulus
                  properties. We dissociated adaptation from
                  habituation in superficial (SCs) and intermediate
                  (SCi) layer neurons of the SC, a hub of oculomotor
                  and attentional processing. SCs receives visual
                  input from the retina directly or via V1, while the
                  SCi receives convergent input from visual and motor
                  areas. Monkeys were rewarded for fixating a central
                  point while a series of 7 successive stimuli were
                  briefly flashed (100 ms duration; 100-400 ms
                  interval) in the receptive field of the neuron. On
                  70 percent of trials all flashed stimuli were
                  identical, while on others, the 4th was either
                  brighter, dimmer or absent (10 percent each). If
                  reduced neural response is due to habituation, some
                  recovery of the response (dishabituation) should
                  occur to any oddball stimulus. However, if the
                  reduced response is due to adaptation, the response
                  should be further reduced after the brighter, but
                  recover after the dimmer or absent stimulus. The
                  largest decrease in response (often > than 50
                  percent) was to the second stimulus, and subsequent
                  stimuli resulted in only small further
                  reductions. The shorter the inter-flash interval,
                  the greater these reductions. The pattern was
                  globally similar in SCs and SCi, but there was a
                  greater reduction to the 3-7th stimuli in
                  SCi. Responses to oddball stimuli in SCs neurons
                  were suggestive of adaptation, while responses in
                  SCi neurons showed features of both adaptation and
                  habituation.},
  type      = {mod;su},
  review    = {abs/conf}
}

@inproceedings{Carmi_etal07vss,
  author    = {R. Carmi and P. Tseng and I. G. M. Cameron and D. P. Munoz and
                  L. Itti},
  title     = {The impact of maturation and aging on mechanisms of attentional
                  selection},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {How do mechanisms of attentional selection change as people
                  mature and age?  To investigate this question, we
                  tracked the eyes of 3 groups of human observers
                  (children: 10-13, adults: 20-28, and elderly: 69-73)
                  as they watched MTV-style video clips (30 s each)
                  constructed from unrelated shots of natural scenes
                  (2-4 s each).  It was previously shown that jump
                  cuts - abrupt transitions between shots - lead to
                  stereotypical changes in the balance between
                  bottom-up and top-down influences on attentional
                  selection (http://journalofvision.org/6/9/4).
                  Specifically, the impact of bottom-up influences
                  peaks shortly after jump cuts, followed by monotonic
                  decreases for up to 2.5 s.  Here we investigated the
                  effects of maturation and aging on the balance
                  between bottom-up and top-down influences.  We
                  analyzed the input video clips with a bottom-up
                  computer model of attentional selection, and probed
                  the impact of bottom-up influences by quantifying
                  the accuracy of the model in predicting saccade
                  targets (>40,000 in total).  We found that the
                  overall impact of bottom-up influences increased
                  monotonically as a function of age (>10 percent
                  magnitude difference between adjacent age groups,
                  p<<0.01).  Temporal changes in the impact of
                  bottom-up influences were highly conserved between
                  the children and the adults, but differed
                  substantially in the elderly.  A straight-forward
                  yet counter-intuitive interpretation of the results
                  is that people become more bottom-up driven as they
                  mature and age.  Alternatively, jump cuts may affect
                  attentional mechanisms differently in different
                  ages, leading to more random selections by children
                  and slower utilization of top-down information by
                  the elderly.},
  type      = {mod;bu;td;eye;sc},
  review    = {abs/conf}
}

@inproceedings{Tseng_etal07vss,
  author    = {P. Tseng and R. Carmi and I. G. M. Cameron and D. P. Munoz and
                  L. Itti},
  title     = {The impact of content-independent mechanisms on guiding
                  attention},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {Several eye-tracking experiments have shown that human
                  observers tend to look at the center of photographs
                  and movies more than expected by chance. This
                  so-called ``center bias'' in gaze distributions may
                  be caused by centrally-biased content, such as
                  objects of interest (top-down bias) and salient
                  features (bottom-up bias), or by other
                  content-independent factors, such as experimental
                  setup. Here we quantify the relative contribution of
                  these potential causes.  We shot videos of natural
                  scenes that were either biased (camera following a
                  main actor) or non-biased (camera panning at a fixed
                  speed) in terms of top-down influences. These videos
                  were further classified into bottom-up biased versus
                  non-biased based on a computational model of
                  bottom-up influences, resulting in four bias
                  conditions. We then tracked the eyes of twelve young
                  adults as they freely viewed the videos, and
                  extracted their saccades (12132). The extent of
                  center bias was quantified using a metric that
                  measures the average distance of saccade targets
                  from the center (0: saccade targets uniformly
                  distributed over display; 100: all saccade targets
                  at center).  We found highly significant center-bias
                  remained in the eye-movement patterns in all the
                  four conditions. (1) For both top-down and bottom-up
                  center-bias condition, it scored 45.26+/-0.52 (mean
                  +/- SE); (2) top-down and not bottom-up center-bias:
                  41.61 +/- 0.61; (3) bottom-up and not top-down
                  center-bias: 38.26+/-0.64; (4) neither top-down nor
                  bottom-up center-bias: 38.07+/-0.56.  Our data
                  demonstrates for the first time that a bias exists
                  which is due to neither cognitively interesting
                  items nor bottom-up salient items being concentrated
                  around the center. It also indicate that 84 percent
                  of the maximal observed center-bias can be explained
                  by factors that are content-independent, such as the
                  methodological setup (chin rest, centralized gaze
                  position of subjects when they look straight ahead),
                  and a viewing strategy of looking at the center so
                  as to minimize the amplitude of saccades.},
  type      = {mod;bu;td;eye;sc},
  review    = {abs/conf}
}

@inproceedings{Itti_etal07vss,
  author    = {L. Itti and M. Yoshida and D. J. Berg and T. Ikeda and R. Kato
                  and K. Takaura and T. Isa},
  title     = {Investigation of spontaneous saccades based on the saliency
                  model in monkeys with unilateral lesion of primary
                  visual cortex},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS07)},
  year      = {2007},
  month     = {May},
  abstract  = {Primary visual cortex (area V1) is the entry point of visual
                  processing into the primate cortex. Yet, human and
                  animal studies of V1 lesions have demonstrated a
                  ``blindsight'' phenomenon, whereby residual
                  visually-guided behavior remains even when large
                  portions of V1 are absent.  However, little is known
                  quantitatively of how this residual vision (possibly
                  subcortically-mediated) differs from normal
                  (cortically-mediated) vision. We analyzed eye
                  movements of three macaque monkeys (one normal, two
                  with complete unilateral V1 ablation) watching ~54
                  minutes of television (97,051 video frames). A
                  computational model of bottom-up attention
                  quantified how salient visual features may guide
                  gaze into the normal vs. the lesioned hemifield. To
                  eliminate stimulus biases, we randomly presented all
                  video clips twice, original and horizontally
                  flipped. We quantified the extent to which salient
                  stimuli attracted gaze of each monkey by computing,
                  for saccades tallied along the eight principal
                  directions, a model-based bottom-up guidance score
                  (chance level 0.5; ideal upper bound 1.0; practical
                  inter-observer score previously measured as the
                  extent to which three control monkeys predict gaze
                  of a fourth monkey ~0.6). For the normal as well as
                  both lesioned monkeys, we found that saccades in all
                  eight principal directions were guided towards
                  salient locations, significantly above chance
                  (scores 0.585+/-0.006 to 0.662+/-0.004, t-tests
                  p<0.00003, 909 to 3,537 saccades per monkey in each
                  direction). However, although lesioned monkeys
                  overall scored lower, there was little difference in
                  bottom-up guidance with saccade direction (scores
                  2.2 percent to 3.6 percent lower for saccades
                  directed into vs. away from the lesioned hemifield).
                  Our preliminary results suggest that the extent to
                  which monkey saccades are attracted towards salient
                  locations during natural vision may be less affected
                  by the absence of primary visual cortex than
                  previously considered.},
  type      = {mod;su;eye;bu},
  review    = {abs/conf}
}

@invited{Itti07wcaa,
  author    = {L. Itti},
  title     = {Combined top-down and bottom-up attentional guidance},
  booktitle = {Plenary lecture, ICVS workshop on computational attention
               and applications (WCAA), Bielefeld, Germany},
  year      = {2007},
  month     = {Mar},
  type      = {bu;td;mod}
}

@invited{Itti07up,
  author    = {L. Itti},
  title     = {What makes something visually interesting?},
  booktitle = {University of Paderborn, Germany},
  year      = {2007},
  month     = {Mar},
  type      = {bu;td;mod}
}

@press{Scott07ieb,
  author   = {C. Scott},
  title    = {Technical Article: Don't let Colours Hide the Alarms!},
  journal  = {The Industrial Ethernet Book},
  year     = {2007},
  month    = {Mar},
  abstract = {Failure to identify alarms and their states quickly and
                  accurately has recently been cited as the cause of
                  many, otherwise avoidable, major capital asset
                  losses. Dealing with alarms properly and quickly is
                  a topic of keen interest to the process
                  manufacturing audience. This article explores the
                  way in which use of colour in process displays
                  affects operator response to alarms and events.},
  file     = {http://ilab.usc.edu/publications/doc/Scott07ieb.pdf},
  type     = {bu;cv}
}

@invited{Itti07vaw,
  author    = {L. Itti},
  title     = {Combined bottom-up and top-down control of visual attention
               in humans and monkeys},
  booktitle = {International workshop on visual attention,
               Buenos Aires, Argentina},
  year      = {2007},
  month     = {Mar},
  type      = {bu;td;mod}
}

@inproceedings{Navalpakkam_Itti07cos,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Role of task difficulty in modulation of neural activity},
  booktitle = {Proc. Computational and Systems Neuroscience (Cosyne)},
  year      = {2007},
  month     = {Feb},
  pages     = {I-1},
  abstract  = {How does attentional modulation of a neuron's preferred
                  feature, or changing its tuning width, or its
                  response gain benefit behavioral performance? We
                  investigate this in the context of a visual search
                  task, e.g., search for a vertical bar among
                  horizontal bars (which becomes more difficult as the
                  target orientation approaches horizontal). [...]},
  file      = {http://ilab.usc.edu/publications/doc/Navalpakkam_Itti07cos.pdf},
  type      = {td;mod;psy},
  review    = {abs/conf}
}

@article{Navalpakkam_Itti07n,
  author   = {V. Navalpakkam and L. Itti},
  title    = {Search goal tunes visual features optimally},
  journal  = {Neuron},
  year     = {2007},
  month    = {Feb},
  volume   = {53},
  number   = {4},
  pages    = {605-617},
  note     = {Also see commentary / preview entitled ``Paying Attention to
    Neurons with Discriminating Taste'' by A. Pouget and D. Bavelier,
    Neuron 2007;53(4):473-475.},
  abstract = {How does a visual search goal modulate the activity of
    neurons encoding different visual features (e.g., color, direction of
    motion)? Previous research suggests that goal-driven attention
    enhances the gain of neurons representing the target's visual
    features. Here, we present mathematical and behavioral evidence that
    this strategy is suboptimal and that humans do not deploy it. We
    formally derive the optimal feature gain modulation theory, which
    combines information from both the target and distracting clutter to
    maximize the relative salience of the target. We qualitatively
    validate the theory against existing electrophysiological and
    psychophysical literature. A surprising prediction is that it is
    sometimes optimal to enhance nontarget features. We provide
    experimental evidence toward this through psychophysics experiments on
    human subjects, thus suggesting that humans deploy the optimal gain
    modulation strategy.},
  file     = {http://ilab.usc.edu/publications/doc/Navalpakkam_Itti07n.pdf},
  type     = {bu;td;mod;psy},
  if       = {2005 impact factor: 14.304}
}

@article{Siagian_Itti07pami,
  author   = {C. Siagian and L. Itti},
  title    = {Rapid Biologically-Inspired Scene Classification
              Using Features Shared with Visual Attention},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2007},
  month    = {Feb},
  volume   = {29},
  number   = {2},
  pages    = {300-312},
  abstract = {We describe and validate a simple context-based scene
    recognition algorithm for mobile robotics applications. The system can
    differentiate outdoor scenes from various sites on a college campus
    using a multiscale set of early-visual features, which capture the
    ``gist'' of the scene into a low-dimensional signature vector.
    Distinct from previous approaches, the algorithm presents the
    advantage of being biologically plausible and of having low
    computational complexity, sharing its low-level features with a model
    for visual attention that may operate concurrently on a robot. We
    compare classification accuracy using scenes filmed at three outdoor
    sites on campus (13,965 to 34,711 frames per site). Dividing each site
    into nine segments, we obtain segment classification rates between
    84.21 percent and 88.62 percent.  Combining scenes from all sites
    (75,073 frames in total) yields 86.45 percent correct classification,
    demonstrating generalization and scalability of the approach.},
  file     = {http://ilab.usc.edu/publications/doc/Siagian_Itti07pami.pdf},
  type     = {mod;bu;sc},
  if       = {2005 impact factor: 3.810}
}

% NOTE(review): the key below is spelled "atal", unlike the "_etal" keys used
% by neighbouring entries; it is left unchanged so existing citations resolve.
@inproceedings{Boehnke_atal07cps,
  author    = {S. E. Boehnke and L. Itti and D. P. Munoz},
  title     = {Adaptation and habituation of visual responses in the superficial and
               intermediate layers of the superior colliculus},
  booktitle = {Proc. Canadian Physiological Society Annual Meeting,
               Mount Saint Anne, Canada},
  year      = {2007},
  month     = {Jan},
  abstract  = { One neural correlate of visual attention is a decrease in
    the neural response to a target after prior presentation of an
    orienting 'cue' (e.g. inhibition of return). This decreased responding
    could be the result of repeated stimulation of a neuron's receptive
    field resulting in either 'adaptation' -- a lower level mechanism
    related to neural fatigue, or 'habituation' -- where an organism stops
    responding to an irrelevant stimulus but recovers the response after a
    change in stimulus properties. We dissociated adaptation from
    habituation in superficial (SCs) and intermediate (SCi) layer neurons
    of the SC, a hub of oculomotor and attentional processing. SCs
    receives visual input from the retina directly or via V1, while the
    SCi receives convergent input from visual and motor areas. Monkeys
    were rewarded for fixating a central point while a series of 7
    successive stimuli were briefly flashed (100 ms duration; 100-400 ms
    interval) in the receptive field of the neuron. On 70 percent of
    trials all flashed stimuli were identical, while on others, the 4th
    was either brighter, dimmer or absent (10 percent each). If reduced
    neural response is due to habituation, some recovery of the response
    (dishabituation) should occur to any oddball stimulus. However, if the
    reduced response is due to adaptation, the response should be further
    reduced after the brighter, but recover after the dimmer or absent
    stimulus. The largest decrease in response (often > than 50 percent)
    was to the second stimulus, and subsequent stimuli resulted in smaller
    further reductions. The shorter the inter-flash interval, the greater
    these reductions. The pattern was globally similar in SCs and SCi, but
    there was a greater reduction to the 3-7th stimuli in SCi. Responses
    to oddball stimuli in SCs neurons were suggestive of adaptation, while
    responses in SCi neurons showed features of both adaptation and
    habituation.},
  type      = {mod;bu;phy},
  review    = {abs/conf}
}

@article{Carmi_Itti06vr,
  author   = {R. Carmi and L. Itti},
  title    = {Visual Causes versus Correlates of Attentional Selection in
              Dynamic Scenes},
  journal  = {Vision Research},
  year     = {2006},
  month    = {Dec},
  volume   = {46},
  number   = {26},
  pages    = {4333-4345},
  abstract = {What are the visual causes, rather than mere correlates, of
    attentional selection and how do they compare to each other during
    natural vision? To address these questions, we first strung together
    semantically unrelated dynamic scenes into MTV-style video clips, and
    performed eye tracking experiments with human observers. We then
    quantified predictions of saccade target selection based on 7
    bottom-up models, including intensity variance, orientation contrast,
    intensity contrast, color contrast, flicker contrast, motion contrast,
    and integrated saliency. On average, all tested models predicted
    saccade target selection well above chance. Dynamic models were
    particularly predictive of a subset of saccades that were initiated
    immediately after scene onsets, and led to minimal interobserver
    variability. In comparison, static models showed mixed results in
    these circumstances, with intensity variance and orientation contrast
    achieving particularly weak prediction accuracy (lower than their own
    average, and approximately 4 times lower than dynamic models). These
    results indicate that dynamic visual cues play a dominant causal role
    in attracting attention. In comparison, some static visual correlates
    of attentional selection play a weaker causal role, while other static
    correlates are not causal at all, and may instead reflect top-down
    causes.},
  file     = {http://ilab.usc.edu/publications/doc/Carmi_Itti06vr.pdf},
  type     = {bu;sc;eye},
  if       = {2005 impact factor: 2.027}
}

@article{Navalpakkam_Itti06jov,
  author   = {V. Navalpakkam and L. Itti},
  title    = {Top-down attention selection is fine-grained},
  journal  = {Journal of Vision},
  year     = {2006},
  month    = {Oct},
  volume   = {6},
  number   = {11},
  pages    = {1180-1193},
  abstract = {Although much is known about the sources and modulatory
    effects of top-down attentional signals, the information capacity of
    these signals is less known. Here, we investigate the granularity of
    top-down attentional signals. Previous theories in psychophysics have
    provided conflicting evidence on whether top-down guidance is coarse
    grained (i.e., one gain control term per feature dimension) or fine
    grained (i.e., multiple gain control terms per dimension). We resolve
    the conflict by designing new experiments that disentangle top-down
    from bottom-up contributions, thereby avoiding confounds existing in
    previous studies. The results of our eye-tracking experiments show
    that subjects can selectively saccade to items belonging to the
    relevant feature interval compared with irrelevant intervals within a
    dimension. This suggests that top-down signals can specify not only
    the relevant feature dimension but also the relevant feature interval
    within a dimension. We conclude that top-down signals are fine grained
    and can specify multiple gain control terms per dimension.},
  file     = {http://ilab.usc.edu/publications/doc/Navalpakkam_Itti06jov.pdf},
  type     = {td;bu;mod;psy},
  if       = {2005 impact factor: 3.469}
}

@incollection{Itti_Baldi06cvnms,
  author    = {L. Itti and P. F. Baldi},
  title     = {Modeling what attracts human gaze over dynamic natural scenes},
  booktitle = {Computational Vision in Neural and Machine Systems},
  editor    = {L. Harris and M. Jenkin},
  publisher = {Cambridge University Press},
  address   = {Cambridge, U.K.},
  year      = {2006},
  abstract  = {Attention in biological and artificial systems rapidly
    selects important information within massive sensory inputs, a process
    key to survival.  When there is little time for detailed sensory
    analysis, finding important information must rely on heuristic
    computations. To characterize these computations, we propose a general
    Bayesian definition of important information we call surprise.
    Surprise quantifies how data affects a natural or artificial observer,
    by measuring the difference between prior and posterior beliefs of the
    observer.  We find that surprise outperforms five other metrics
    previously proposed in the literature in predicting recorded gaze
    shifts of four humans watching 25 minutes of video stimuli (over
    45,000 distinct video frames), including television broadcast and
    video games. The Bayesian theory of surprise presented in this chapter
    is general and applicable to domains beyond visual attention, across
    different modalities, datatypes, tasks, and abstraction levels.},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti06ness,
  author    = {L. Itti},
  title     = {Modeling bottom-up and top-down guidance of visual attention
               over natural scenes},
  booktitle = {Caltech NESS Annual Retreat, Oxnard, CA},
  year      = {2006},
  month     = {Oct},
  type      = {bu;td;mod}
}

@press{Anonymous06cs,
  author  = {A. Anonymous},
  title   = {Beobots},
  journal = {USC Computer Science main site rolling featured research},
  year    = {2006},
  month   = {Sep},
  file    = {http://ilab.usc.edu/publications/doc/Anonymous06cs.pdf},
  type    = {bu;cv;bb}
}

@incollection{Itti_Arbib06arl,
  author    = {L. Itti and M. A. Arbib},
  title     = {Attention and the Minimal Subscene},
  booktitle = {Action to Language via the Mirror Neuron System},
  editor    = {M. A. Arbib},
  publisher = {Cambridge University Press},
  address   = {Cambridge, U.K.},
  year      = {2006},
  pages     = {289-346},
  abstract  = {We describe a computational framework that explores the
    interaction between focal visual attention, the recognition of objects
    and actions, and related use of language. The framework is motivated
    by recent brain imaging studies showing activation of Broca's area not
    only in language tasks but also in tasks related to imitation and
    action recognition. We introduce the notion of ``minimal subscene'' as a
    middle ground representation, in which an agent is linked to objects
    or others via some action, and we examine how this notion links to
    low-level visual perception, on the one end, and to sentences which
    describe such a subscene or raise questions about the scene, on the
    other.},
  file      = {http://ilab.usc.edu/publications/doc/Itti_Arbib06arl.pdf},
  type      = {bu;td;mod;sc;eye}
}

@article{Carmi_Itti06jov,
  author   = {R. Carmi and L. Itti},
  title    = {The Role of Memory in Guiding Attention during Natural Vision},
  journal  = {Journal of Vision},
  year     = {2006},
  month    = {Aug},
  volume   = {6},
  number   = {9},
  pages    = {898-914},
  abstract = {What is the time frame in which perceptual memory guides
    attention? Current estimates range from a few hundred milliseconds to
    several seconds, minutes, or even days. Here we answer this question
    by establishing the time course of attentional selection in realistic
    viewing conditions. First, we transformed continuous video clips into
    MTV-style video clips by stringing together continuous clip segments
    using abrupt transitions (jump cuts). We then asked participants to
    visually explore either continuous or MTV-style clips, and recorded
    their saccades as objective behavioral indicators of attentional
    selections. The utilization of perceptual memory was estimated across
    viewing conditions and over time by quantifying the agreement between
    human attentional selections and predictions made by a
    neurally-grounded computational model. In the critical condition, jump
    cuts led to sharp declines in the impact of perceptual memory on
    attentional selection, followed by monotonic increases in memory
    utilization across 7 consecutive saccades and 2.5 seconds. These
    results demonstrate that perceptual memory traces play an important
    role in guiding attention across several saccades during natural
    vision. We propose novel hypotheses and experiments using hybrid
    natural-artificial stimuli to further elucidate neurocomputational
    mechanisms of attentional selection.},
  file     = {http://ilab.usc.edu/publications/doc/Carmi_Itti06jov.pdf},
  type     = {mod;bu;td;eye},
  if       = {2005 impact factor: 3.469}
}

@invited{Itti06grc,
author = {L. Itti},
title = {Bottom-up and top-down influences on visual attention during
understanding of dynamic visual scenes},
booktitle = {Gordon Research Conference on Sensory Coding and the
Natural Environment, Big Sky, Montana},
month = {Aug},
year = {2006},
type = {bu;td;mod}
}

@invited{Itti06humc,
author = {L. Itti},
title = {A Neurocomputational Model of ``Surprise'' in the Human Brain},
booktitle = {Neurology Grand Rounds, Harbor-UCLA Medical Center,
Torrance, California},
month = {Aug},
year = {2006},
type = {bu;td;mod}
}

@article{Itti06vc,
author={L. Itti},
title={Quantitative Modeling of Perceptual Salience at Human Eye Position},
journal={Visual Cognition},
year={2006},
month={Aug-Dec},
abstract={We investigate the extent to which a simple model of
bottom-up attention and salience may be embedded within a broader
computational framework, and compared with human eye movement data. In
this study, we focus on quantifying whether increased realism of the
simulation framework significantly affects the outcome of quantitative
measures of how well the model may predict where in video clips humans
may direct their gaze. To this end, we compare three variants of the
model, tested with 15 video clips of natural scenes, also shown to
three human observers. We measure model-predicted salience at the
locations gazed to by the human observers, compared to random
locations. The first variant simply processes the raw video clips, the
second adds a gaze-contingent foveation filter, and the third further
attempts to realistically simulate dynamic human vision by embedding
the video frames within a larger background, and shifting them to eye
position. Our main finding is that increasing simulation realism
highly significantly improves the predictive ability of the model.
This study hence suggests that attempting to better emulate the
details of how a visual stimulus may actually be captured by a
constantly rotating retina during active vision has a significant
impact onto quantitative outcomes of comparisons between model and
human behavior.},
volume={14},
number={4-8},
pages={959-984},
keywords={Visual attention ; eye movements ; saliency ; bottom-up},
type={mod;bu;eye},
file={http://ilab.usc.edu/publications/doc/Itti06vc.pdf},
if = {2004 impact factor: 1.588}
}

@inproceedings{Berg_etal06hfsp,
author = {D. J. Berg and S. E. Boehnke and R. A. Marino and P. F. Baldi and D. P. Munoz
and L. Itti},
title = {Characterizing Surprise in Humans and Monkeys},
booktitle = {HFSP 6th Annual Meeting, Paris, France},
month = {Jul},
year = {2006},
abstract = {We investigate the role of visual surprise in guiding eye
movements in humans and rhesus monkeys under free viewing conditions,
for a variety of natural stimuli. Surprise differs from other models
of bottom-up visual attention in that it quantifies how data affects
an observer, by measuring the difference between posterior and prior
beliefs of the observer. We recorded eye movements from naive
observers, 4 humans and 3 monkeys, while they watched 115 video clips
(47,903 frames, 27 minutes) resulting in 6,775 saccades for humans and
10,406 for monkeys. Clips ranged in semantic content, including video
of natural, non-natural, building-city, indoor, and sporting-outdoor
scenes both with and without main actors. A surprise model of
bottom-up visual attention then predicted in real-time how surprising
every location was in the display. The distribution of surprise at the
endpoint (target) locations of human or monkey saccadic eye movements
was then compared to the distribution of surprise at random locations
using a standard information theoretic technique, Kullback-Leibler
distance. Considering all clips together 59 percent and 56 percent of
gaze shifts were directed towards locations more surprising than
average for humans and monkeys, however, agreement with the model
varied greatly across clip type (ranging from 35-77 percent). Humans
and monkeys showed a similar pattern of agreement with the model
across image type, with a significant difference only in
sporting-outdoor clips. This data suggests that under free viewing
humans and monkeys are employing similar bottom-up attentional
mechanisms.},
type = {mod;su;eye},
review = {abs/conf}
}

@inproceedings{Itti_etal06icme,
author = {L. Itti and N. Dhavale and F. Pighin},
title = {Photorealistic Attention-Based Gaze Animation},
booktitle = {Proc. IEEE International Conference on Multimedia and Expo},
pages = {1-4},
month = {Jul},
year = {2006},
abstract = {We apply a neurobiological model of visual attention and
gaze control to the automatic animation of a photorealistic virtual
human head. The attention model simulates biological visual processing
along the occipito-parietal pathway of the primate brain. The gaze
control model is derived from motion capture of human subjects, using
high-speed video-based eye and head tracking apparatus. Given an
arbitrary video clip, the model predicts visual locations most likely
to attract an observer's attention, and simulates the dynamics of eye
and head movements towards these locations.  Tested on 85 video clips
including synthetic stimuli, video games, TV news, sports, and outdoor
scenes, the model demonstrates a strong ability at saccading towards
and tracking salient targets.  The resulting autonomous virtual human
animation is of photorealistic quality.},
type = {mod;bu},
review = {full/conf},
file = {http://ilab.usc.edu/publications/doc/Itti_etal06icme.pdf}
}

@inproceedings{Navalpakkam_Itti06cvpr,
author={V. Navalpakkam and L. Itti},
title={An Integrated Model of Top-down and Bottom-up Attention for
Optimal Object Detection},
abstract={Integration of goal-driven, top-down attention and
image-driven, bottom-up attention is crucial for visual search. For
instance, in robot navigation, it is important to detect goal-relevant
targets like road signs and landmarks, and to simultaneously notice
unexpected visual events like sudden obstacles and accidents. Yet,
previous research has mostly focused on models that are purely
top-down or bottom-up. Here, we propose a new model that combines
both. The bottom-up component computes the visual salience of scene
locations in different feature maps extracted at multiple spatial
scales. The top-down component uses accumulated statistical knowledge
of the visual features of the desired search target and background
clutter, to optimally tune the bottom-up maps so as to maximize target
detection speed. The results of testing on 600 artificial search
arrays and 300 natural scenes show that the model's predictions are
consistent with a large body of available literature on human
psychophysics of visual search. The promising results suggest that our
model may provide good approximation to how humans combine bottom-up
and top-down cues such as to optimize visual search behavior.},
booktitle={Proc. IEEE Conference on Computer Vision and Pattern
Recognition (CVPR)},
address={New York, NY},
month={Jun},
year={2006},
pages={2049-2056},
type={bu;cv;td},
file={http://ilab.usc.edu/publications/doc/Navalpakkam_Itti06cvpr.pdf},
if = {2006 acceptance rate: 28\%},
review={full/conf}
}

@invited{Itti06mto,
author = {L. Itti},
title = {Modeling Human Processes of Complex Scene Understanding},
booktitle = {DARPA/MTO Complex Systems Architecture Workshop, Arlington, VA},
month = {Jun},
year = {2006},
type = {bu;td;mod}
}

@invited{Itti06crcns,
author = {L. Itti},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {NSF CRCNS Annual Meeting, Arlington, VA},
month = {Jun},
year = {2006},
type = {bu;td;mod;su;eye}
}

@article{Bonaiuto_Itti06ivc,
author={J. Bonaiuto and L. Itti},
title={The Use of Attention and Spatial Information for Rapid Facial Recognition in Video},
abstract={Bottom-up visual attention is the process by which primates
quickly select regions of an image likely to contain behaviorally
relevant objects. In artificial systems, restricting the task of
object recognition to these regions allows faster recognition and
unsupervised learning of multiple objects in cluttered scenes. A
problem with this approach is that often objects that are
superficially dissimilar to the target are given the same
consideration in recognition as similar objects. Additionally, in
video, objects recognized in previous frames at locations distant to
the current fixation point often are given the same consideration in
recognition as objects previously recognized at proximal locations.
Here we investigate the value of rapidly pruning the facial
recognition search space, first using similarity in the
already-computed low-level features that guide attention to prioritize
matching against an object database, and, second, using spatial
proximity information derived from previous video frames.  By
comparing the performance of Lowe's recognition algorithm with Itti \&
Koch's bottom-up attention model with and without search space
pruning, we demonstrate that this approach significantly accelerates
facial recognition in video footage.},
journal={Image and Vision Computing},
volume={24},
number={6},
pages={557-563},
month={Jun},
year={2006},
file={http://ilab.usc.edu/publications/doc/Bonaiuto_Itti06ivc.pdf},
type={bu;cv},
if = {2004 impact factor: 1.159}
}

@inproceedings{Carmi_Itti06emcoc,
author = {R. Carmi and L. Itti},
title = {The Role of Spatial Memory in Guiding Attention During Natural Vision},
booktitle = {Proc. Eye Tracking, Cognition and Communication Workshop of the 
Second Biennial Conference on Cognitive Science},
address = {St. Petersburg, Russia},
month = {Jun},
year = {2006},
abstract = {Paying attention to the right thing at the right time
underlies the ability of humans and other animals to learn, perceive,
and interact with their environment. A central unresolved question is
the time frame in which spatial memory guides attention, with current
estimates ranging from a single fixation to seconds, minutes, or even
days. Here we answer this question by revealing the time course of
attentional selection during natural vision. We asked human
participants to visually explore either continuous or scene-shuffled
video clips, and quantified the impact of memory-free influences on
overt attentional selections (saccades) based on a computational
saliency model. Overall, scene shuffling resulted in no significant
differences in the impact of memory-free influences compared to
continuous viewing. However, abrupt scene transitions (jump cuts) led
to sharp peaks in the impact of memory-free influences, which then
declined progressively across 7 fixations for up to 2.5 seconds. These
results indicate that visual exploration of dynamic scenes critically
depends on spatial memory traces that persist across several fixations
for up to a couple of seconds.},
type = {bu;td;mod;eye},
review = {abs/wkshp},
file = {http://ilab.usc.edu/publications/doc/Carmi_Itti06emcoc.pdf}
}

@invited{Itti_Baldi06ucla2,
author = {L. Itti and P. F. Baldi},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {Brain Mapping Seminar, University of California, Los Angeles, CA},
month = {May},
year = {2006},
type = {bu;td;mod;su;eye}
}

@invited{Itti06jsnc,
author = {L. Itti},
title = {Optimal Feature Biasing During Visual Search},
booktitle = {Joint Symposium on Neural Computation, La Jolla, CA},
month = {May},
year = {2006},
type = {bu;td;mod}
}

@inproceedings{Berg_etal06vss,
author = {D. J. Berg and S. E. Boehnke and R. A. Marino and P. F. Baldi and D. P. Munoz
and L. Itti},
title = {Characterizing Surprise in Humans and Monkeys},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS06)},
month = {May},
year = {2006},
abstract = {We investigate the role of visual surprise in guiding eye
movements in humans and rhesus monkeys under free viewing conditions,
for a variety of natural stimuli. Surprise differs from other models
of bottom-up visual attention in that it quantifies how data affects
an observer, by measuring the difference between posterior and prior
beliefs of the observer. We recorded eye movements from naive
observers, 4 humans and 3 monkeys, while they watched 115 video clips
(47,903 frames, 27 minutes) resulting in 6,775 saccades for humans and
10,406 for monkeys. Clips ranged in semantic content, including video
of natural, non-natural, building-city, indoor, and sporting-outdoor
scenes both with and without main actors. A surprise model of
bottom-up visual attention then predicted in real-time how surprising
every location was in the display. The distribution of surprise at the
endpoint (target) locations of human or monkey saccadic eye movements
was then compared to the distribution of surprise at random locations
using a standard information theoretic technique, Kullback-Leibler
distance. Considering all clips together 59 percent and 56 percent of
gaze shifts were directed towards locations more surprising than
average for humans and monkeys, however, agreement with the model
varied greatly across clip type (ranging from 35-77 percent). Humans
and monkeys showed a similar pattern of agreement with the model
across image type, with a significant difference only in
sporting-outdoor clips. This data suggests that under free viewing
humans and monkeys are employing similar bottom-up attentional
mechanisms.},
type = {mod;su;eye},
review = {abs/conf}
}

@inproceedings{Navalpakkam_Itti06vss,
author={V. Navalpakkam and L. Itti},
title={Optimal feature gain modulation during visual search},
abstract={Despite substantial neurobiological and behavioral evidence
that knowledge modulates feature processing and facilitates visual
search, currently, there is no mathematical theory to capture such
top-down influences. We propose an optimal theory of how prior
statistical knowledge of target and distractor features modulates the
response gains of neurons encoding low-level visual features, such
that search speed is maximized. Through numerical simulations, we show
that this theory successfully explains many reported behavioral and
electrophysiological observations including top-down effects such as
the role of priming, role of uncertainty, target enhancement and
distractor suppression, as well as bottom-up effects such as pop-out,
role of target-distractor discriminability, distractor heterogeneity,
linear separability and others. Further, the theory makes surprising
predictions whereby finding a target may sometimes require suppression
of target features, or enhancement of non-target features. We validate
these counter-intuitive predictions through new psychophysics
experiments. Four naive subjects performed a difficult search for 55
degree oriented target among 50 degree distractors. The gains thus set
up were tested by randomly inserting probe trials, in which we briefly
flashed (200ms) four items representing the distractor (50 degree),
the target (55 degree), relevant as predicted by the theory (60
degree), and steep (80 degree) cues. Although subjects searched for a
55 degree target, as predicted by the theory, there were significantly
higher number of reports on the 60 degree item (paired t-test with
p<0.05). These results provide direct experimental evidence that
humans may deploy optimal feature gain modulation strategies.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS06)},
year={2006},
month={May},
type={mod;td;psy},
review={abs/conf},
file={http://ilab.usc.edu/publications/doc/Navalpakkam_Itti06vss.pdf},
note={Recipient of VSS student travel fellowship}
}

@inproceedings{Carmi_Itti06vss,
author = {R. Carmi and L. Itti},
title = {From Eye-tracking Data to Information: Lessons from Dynamic Scenes},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS06)},
month = {May},
year = {2006},
abstract = {A common simplifying assumption for dealing with vast
amounts of raw eye-tracking data is to focus on spatial rather than
temporal analyses. This assumption is supported by studies with still
images, which showed that spatial rather than temporal correlations
provide the only source of information in eye-tracking data. Here we
establish the extent to which this assumption is violated during
inspection of dynamic scenes.  We collected 50 video clips depicting a
heterogeneous collection of natural scenes. These clips were cut into
clip segments, which were re-assembled into 50 scene-shuffled clips
(MTV-style). Human observers inspected either continuous or
scene-shuffled clips, and inter-observer agreement in gaze position
was quantified across conditions and over time.  On average, the
instantaneous eye-positions of 4 human observers were clustered within
a rectangle covering 8.51 percent and 6.04 percent of the display area
in the continuous and scene-shuffled conditions, respectively. These
values increased to 11.48 percent (p<0.01) and 9.36 percent (p<0.01)
when eye-positions were sampled from the same eye traces in random
order. The average cluster area increased further to 35.88 percent
(p<0.01) when 4 eye-positions were chosen at random from a uniform
distribution of spatial locations. Moreover, preserving time
information led to previously unreported patterns of inter-observer
agreement.  These results demonstrate that increasing stimulus
dynamics triggers eye-movement patterns that diverge increasingly from
previous accounts based on still images. The limited scalability of
conclusions based on still images is likely to be further accentuated
by future enhancements in the realism of laboratory stimuli, such as
larger field of view and reduced central bias.},
type = {mod;bu;td;eye},
review = {abs/conf}
}

@inproceedings{Peters_Itti06vss,
author = {R. J. Peters and L. Itti},
title = {A computational model of task-dependent influences on eye position},
booktitle = {Proc. Vision Science Society Annual Meeting (VSS06)},
month = {May},
year = {2006},
abstract = {Computational models of bottom-up attention can perform
significantly above chance at predicting eye positions of observers
passively viewing static or dynamic images. Nevertheless, much of eye
movement behavior (50 percent or more) is unexplained by purely
bottom-up models, and is typically attributed to top-down,
inter-observer, task-dependent, or random effects. Other studies have
qualitatively described such high-level effects in naturalistic
interactive visual tasks (e.g., while driving, how often do people
fixate other cars, or the road, or road signs); yet the underlying
neurocomputational mechanisms remain unknown. Here, we introduce a
simple computational model of task-related eye position influences in
interactive tasks with dynamic stimuli. This model extracts from each
frame a low-dimensional feature signature (``gist''), compares that
with a database of eye position training frames, and produces an eye
position prediction map. Finally, we combine the task-related and
bottom-up maps, and compare the combined maps with observers' actual
eye positions across 216,000 frames from 24 five-minute
videogame-playing sessions. For analysis, each map was rescaled to
have zero mean and unit standard deviation; the average predicted
value at human eye position locations was 0.61 +/- 0.1 in the purely
bottom-up maps, and 2.42 +/- 0.07 in the combined maps (a random model
gives an average value of 0). Thus, this straightforward model of
task-dependent effects offers some of the strongest purely
computational general-purpose eye movement predictions to date, going
significantly beyond what is explained by purely bottom-up effects;
yet it relies only on simple visual features, without requiring any
high-level semantic scene description.},
type = {mod;bu;td;eye},
review = {abs/conf}
}

@inproceedings{Lu_Itti06vss,
author={J. Lu and L. Itti},
title={Feature-based attention is not object-based},
abstract={Feature-based attention was revealed as global enhancement
of attended visual features throughout the visual cortex. Object-based
attention was shown as better performance when concurrently
discriminating two features of same object compared to features of
different objects. We used fMRI to investigate whether feature-based
attention is object-based, i.e., is cortical enhancement of attended
features influenced by the objectness the stimulus features appear?
The stimuli were two fields of random dots presented bilaterally to
central fixation cross. Subjects performed luminance discrimination
using two-interval forced-choice paradigm on one side and ignored the
stimulus on the other side. The ignored stimulus was always red dots
and the attended stimulus was overlapped red and green dots. Subjects
performed luminance discrimination on either red dots or on green
dots. We compared visual cortical enhancement of the ignored stimulus
when subjects attended on the other side to either identical (red) or
different (green) stimulus in two conditions: either the dots stimuli
on both sides appeared to belong to same object (both fields displayed
in the same grey box appearing on top of a textured background, with
cast shadows effects around the box), or as two separate objects (each
field displayed in a separate box with the same background and
shadows). Results showed both in single-object condition and in
two-object condition the two subjects consistently had significant
enhancement of the ignored stimulus in early visual areas (V1 to
V4). Hence it indicated feature-based attentional enhancement exists
even between two stimuli which belong to two different objects,
suggesting a very early mechanism.},
booktitle={Proc. Vision Science Society Annual Meeting (VSS06)},
year={2006},
month={May},
type={psy;td;fmri},
review={abs/conf},
note={Oral presentation}
}

@invited{Itti06vss,
author = {L. Itti},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {Vision Science Society Symposium on Integrating Bottom-Up and Top-Down Attention, Sarasota, FL},
month = {May},
year = {2006},
type = {bu;td;mod;su;eye}
}

@invited{Itti06avs,
author = {L. Itti},
title = {Identifying Regions of Interest and Surprise Theory Realization},
booktitle = {DARPA Workshop on Artificial Vision Systems, Boston, MA},
month = {Apr},
year = {2006},
type = {bu;td;mod;su;eye}
}

@invited{Itti_Baldi06usc,
author = {L. Itti and P. F. Baldi},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {USC Annual Vision Symposium, Los Angeles, CA},
month = {Apr},
year = {2006},
type = {bu;td;mod;su;eye}
}

@invited{Itti_Baldi06ucla,
author = {L. Itti and P. F. Baldi},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {Keck Vision Seminar, University of California, Los Angeles, CA},
month = {Apr},
year = {2006},
type = {bu;td;mod;su;eye}
}

@invited{Itti_Baldi06isa,
author = {L. Itti and P. F. Baldi},
title = {Bayesian Surprise Attracts Human and Monkey Attention},
booktitle = {National Institute for Physiological Science, Okazaki, Japan},
month = {Apr},
year = {2006},
type = {bu;td;mod;su;eye}
}

@article{Itti_etal06jcem,
author = {E. Itti and I. T. Gaw Gonzalo and A. Pawlikowska-Haddal and K. B.
Boone and A. Mlikotic and L. Itti and F. S. Mishkin and R. S. Swerdloff},
title = {The Structural Brain Correlates of Cognitive Deficits in Adults
With Klinefelter's Syndrome},
journal = {Journal of Clinical Endocrinology and Metabolism},
volume = {91},
number = {4},
pages = {1423-1427},
month = {Apr},
year = {2006},
abstract = {Context: Adults with Klinefelter's syndrome (KS) are known to
present disturbances of language skills and delayed learning abilities.
Objectives: To assess brain morphometry in KS and to correlate eventual
volumetric changes with performance on neuropsychological tests. Patients:
18 KS adults and 20 age-matched controls. Methods: All participants
underwent prospectively double spin echo brain magnetic resonance imaging
(MRI) and neuropsychological testing of verbal and nonverbal domains. On
the axial stack of MRI slices, regional brain volumes were measured either
by automated segmentation (full brain, total cerebrospinal fluid,
ventricular volume) or manual drawing with help of a neuroanatomy atlas
(frontal, temporal and parietal lobes, gray matter component of the lobes,
cerebellar hemispheres, hippocampal complexes). Results: KS patients
performed significantly lower than controls on language-related tasks
exploring verbal processing speed and verbal executive function. They were
diagnosed with significant enlargement of ventricular volume and bilateral
reduction of cerebellar hemispheres. Furthermore, after separation of
participants according to handedness and after correction of regional brain
volumes for atrophy, a significant reduction of left temporal lobe volume
was found in KS, compared with controls. Ventricular volume was inversely
correlated with cognitive function, while left temporal lobe volume was
positively correlated with language-related tasks. Conclusion: This study
hypothesizes that supernumerary X-chromosome and/or congenital hypogonadism
provoke structural alterations in the subcortical pathways involved in
language processing, thus providing a neurobiological substrate for
cognitive deficits in KS.},
type = {med;mip},
file = {http://ilab.usc.edu/publications/doc/Itti_etal06jcem.pdf},
if = {2004 impact factor: 5.778}
}

@inproceedings{Peters_Itti06etra,
author = {R. J. Peters and L. Itti},
title = {Computational mechanisms for gaze direction in interactive
visual environments},
booktitle = {Proc. ACM Eye Tracking Research and Applications},
pages = {27-32},
month = {Mar},
year = {2006},
abstract = {Next-generation immersive virtual environments and video
games will require virtual agents with human-like visual attention and
gaze behaviors. A critical step is to devise efficient visual
processing heuristics to select locations that would attract human
gaze in complex dynamic environments. One promising approach to
designing such heuristics draws on ideas from computational
neuroscience. We compared several such heuristics with eye movement
recordings from five observers playing video games, and found that
heuristics which detect outliers from the global distribution of
visual features were better predictors of human gaze than were purely
local heuristics. Heuristics sensitive to dynamic events performed
best overall. Further, heuristic prediction power differed more
between games than between different human observers. Our findings
suggest simple neurally-inspired algorithmic methods to predict where
humans look while playing video games.},
type = {mod;bu;td;eye},
review = {full/conf},
file = {http://ilab.usc.edu/publications/doc/Peters_Itti06etra.pdf}
}

@inproceedings{Carmi_Itti06etra,
title={Causal Saliency Effects During Natural Vision},
author={R. Carmi and L. Itti},
abstract={Salient stimuli, such as color or motion contrasts, attract
human attention, thus providing a fast heuristic for focusing limited
neural resources on behaviorally relevant sensory inputs. Here we
address the following questions: What types of saliency attract
attention and how do they compare to each other during natural vision?
We asked human participants to inspect scene-shuffled video clips,
tracked their instantaneous eye-position, and quantified how well a
battery of computational saliency models predicted overt attentional
selections (saccades). Saliency effects were measured as a function of
total viewing time, proximity to abrupt scene transitions (jump cuts),
and inter-participant consistency. All saliency models predicted
overall attentional selection well above chance, with dynamic models
being equally predictive to each other, and up to 3.6 times more
predictive than static models. Among static models, color contrast was
up to 2.1 times more predictive than intensity variance. These results
establish the superiority of dynamic over static saliency in
attracting attention during natural vision, while also indicating a
special role for color. We propose that purely bottom-up or purely
top-down saccades are rare in real world environments. Instead,
attentional selections are typically determined by dynamic
interactions between bottom-up and top-down influences, which are
sometimes cooperative and sometimes competitive.},
year={2006},
month={Mar},
pages={11-18},
booktitle={Proc. ACM Eye Tracking Research and Applications},
type={mod;bu;td;eye;psy},
review={full/conf},
file={http://ilab.usc.edu/publications/doc/Carmi_Itti06etra.pdf},
note={Recipient of Best Paper Award}
}

@inproceedings{Carmi_Itti06cos,
author = {R. Carmi and L. Itti},
title = {The Role of Memory in Guiding Attention During Natural Vision},
booktitle = {Proc. Computational and Systems Neuroscience (Cosyne)},
pages = {105},
month = {Mar},
year = {2006},
abstract = {What is the time frame in which perceptual memory guides
attention? Current estimates range from a few hundred milliseconds to
several seconds, minutes, or even days. Here we answer this question
during natural vision by revealing the time course of attentional
selection. First, we generated MTV-style video clips from continuous
clips by using jump cuts to connect semantically unrelated clip
segments. We then asked participants to visually explore either
continuous or MTV-style clips, tracked their eyes, and extracted rapid
gaze shifts as objective behavioral indicators of attentional
selections. The utilization of perceptual memory was estimated across
viewing conditions and over time by quantifying the agreement between
human attentional selections and predictions made by a
neurally-grounded computational model. In the critical condition, jump
cuts led to sharp declines in the impact of perceptual memory on
attentional selection, which then increased monotonically for up to
2.5 seconds. Our study demonstrates that previous accounts of memory
utilization in simplified laboratory conditions have repeatedly led to
misleading conclusions. We propose novel hypotheses and experiments
with hybrid natural-artificial stimuli to further elucidate
neurocomputational mechanisms of attentional selection.},
type = {td;mod;psy},
review = {abs/conf}
}

@inproceedings{Navalpakkam_Itti06cos,
  author    = {V. Navalpakkam and L. Itti},
  title     = {A theory of optimal feature selection during visual search},
  booktitle = {Proc. Computational and Systems Neuroscience (Cosyne)},
  year      = {2006},
  month     = {Mar},
  pages     = {9},
  type      = {td;mod;psy},
  review    = {abs/conf},
  note      = {Oral presentation},
  abstract  = {How does the human visual system select relevant locations
and visual features (e.g., color, orientation) in order to quickly
detect desired targets in distracting backgrounds? Although recent
evidence suggests that humans can select relevant locations optimally,
it is not yet known whether they can select visual features optimally.
Several heuristics for feature selection have been proposed in the
past, such as promoting the target's features in early visual areas
like V1 and V2. But the correct choice of features depends on both the
desired target as well as distractors in the background. Here, we
propose the first formal theory of how prior statistical knowledge of
target and distractor features modulates the response gains of neurons
encoding features, such that search speed is maximized. Through
numerical simulations, we show that this theory successfully explains
many reported behavioral and electrophysiological observations
including top-down effects such as the role of priming, the role of
uncertainty, target enhancement and distractor suppression, as well as
bottom-up effects such as pop-out, the role of target-distractor
discriminability, distractor heterogeneity, linear separability and
others. Contrary to most common heuristics which suggest promotion of
target features in order to detect the target, the optimal theory
makes surprising predictions that target features may sometimes be
suppressed, or non-target features may be enhanced. We validate these
counter-intuitive predictions through new psychophysics
experiments. Four naive subjects performed a difficult search for a
target bar tilted 55 degrees off vertical among distractor bars tilted
50 degrees. The gains thus set up were tested by randomly inserting
probe trials, in which we briefly flashed (200ms) four items
representing the distractor (50 degree), the target (55 degree),
relevant as predicted by the theory (60 degree), and steep (80 degree)
cues. As always, the task was to search for the target and report
it. Although subjects searched for a 55 degree target, as predicted by
the theory, there were significantly higher number of reports on the
60 degree item (paired t-test with p < 0.05). These results provide
direct experimental evidence that humans may select visual features
optimally. This study bears implications for further research in
understanding top-down attention during visual search. For instance,
previous research in physiology focused on feature gain modulation
during attention to a target feature, and largely ignored the role of
the distracting background features. In contrast, our research
suggests that the distractors play a crucial role in determining
feature gains, and may even lead to suppression of target features or
enhancement of non-target features. Investigating the modulatory
effects during visual search call for new experiments in physiology,
brain imaging and behavior.}
}

@invited{Itti_Baldi06imsc,
  author    = {L. Itti and P. F. Baldi},
  title     = {Bayesian Surprise Attracts Human and Monkey Attention},
  booktitle = {USC IMSC Student Council Event, Los Angeles, CA},
  year      = {2006},
  month     = {Feb},
  type      = {bu;td;mod;su;eye}
}

@invited{Itti_Baldi06ini,
  author    = {L. Itti and P. F. Baldi},
  title     = {Bayesian Surprise Attracts Human and Monkey Attention},
  booktitle = {Important Scientific Questions for Computational
Neuroscience and Neuroinformatics Symposium, ETH-Zurich, Switzerland},
  year      = {2006},
  month     = {Feb},
  type      = {bu;td;mod;su;eye}
}

@inproceedings{Navalpakkam_Itti06hvei,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Bottom-up and top-down influences on visual scanpaths},
  booktitle = {Proc. SPIE Human Vision and Electronic Imaging XI 
(HVEI06), San Jose, CA},
  editor    = {B. Rogowitz and T. N. Pappas and S. Daly},
  volume    = {6057},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  year      = {2006},
  month     = {Jan},
  type      = {mod;bu;td},
  review    = {abs/conf},
  file      = {http://ilab.usc.edu/publications/doc/Navalpakkam_Itti06hvei.pdf},
  abstract  = {Visual attention to salient and relevant scene regions is
crucial for an animal's survival in the natural world. It is guided by
a complex interplay of at least two factors - image-driven, bottom-up
salience and knowledge-driven, top-down guidance. For instance, a ripe
red fruit among green leaves captures visual attention due to its
bottom-up salience, while a non-salient camouflaged predator is
detected through top-down guidance to known predator locations and
features. Although both bottom-up and top-down factors are important
for guiding visual attention, most existing models and theories are
either purely top-down or bottom-up. Here, we present a combined model
of bottom-up and top-down visual attention.}
}

@inproceedings{Navalpakkam_Itti06nips,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Optimal cue selection strategy},
  booktitle = { Advances in Neural Information Processing Systems, Vol.
                        18 (NIPS*2005) },
  publisher = { MIT Press },
  address   = { Cambridge, MA },
  year      = {2006},
  pages     = {987-994},
  type      = { mod;bu;td;psy },
  review    = {full/conf},
  file      = {http://ilab.usc.edu/publications/doc/Navalpakkam_Itti06nips.pdf},
  if        = {2005 acceptance rate: 24\%},
  abstract  = {Survival in the natural world demands the selection of
relevant visual cues to rapidly and reliably guide attention towards
prey and predators in cluttered environments. We investigate whether
our visual system selects cues that guide search in an optimal
manner. We formally obtain the optimal cue selection strategy by
maximizing the signal to noise ratio (SNR) between a search target and
surrounding distractors. This optimal strategy successfully accounts
for several phenomena in visual search behavior, including the effect
of target-distractor discriminability, uncertainty in target's
features, distractor heterogeneity, and linear
separability. Furthermore, the theory generates a new prediction,
which we verify through psychophysical experiments with human
subjects. Our results provide direct experimental evidence that humans
select visual cues so as to maximize SNR between the targets and
surrounding clutter.}
}

@inproceedings{Itti_Baldi06nips,
  author    = {L. Itti and P. F. Baldi},
  title     = {Bayesian Surprise Attracts Human Attention},
  booktitle = { Advances in Neural Information Processing Systems, Vol.
                        18 (NIPS*2005) },
  publisher = { MIT Press },
  address   = { Cambridge, MA },
  year      = {2006},
  pages     = {547-554},
  type      = { su;mod;bu;td;eye },
  review    = {full/conf},
  file      = {http://ilab.usc.edu/publications/doc/Itti_Baldi06nips.pdf},
  if        = {2005 acceptance rate: 24\%},
  abstract  = {The concept of surprise is central to sensory processing,
adaptation, learning, and attention.  Yet, no widely-accepted
mathematical theory currently exists to quantitatively characterize
surprise elicited by a stimulus or event, for observers that range
from single neurons to complex natural or engineered systems.  We
describe a formal Bayesian definition of surprise that is the only
consistent formulation under minimal axiomatic assumptions. Surprise
quantifies how data affects a natural or artificial observer, by
measuring the difference between posterior and prior beliefs of the
observer.  Using this framework we measure the extent to which humans
direct their gaze towards surprising items while watching television
and video games.  We find that subjects are strongly attracted towards
surprising locations, with 72 percent of all human gaze shifts
directed towards locations more surprising than the average, a figure
which rises to 84 percent when considering only gaze targets
simultaneously selected by all subjects. The resulting theory of
surprise is applicable across different spatio-temporal scales,
modalities, and levels of abstraction.}
}

@invited{Itti_Baldi05nantes,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Ecole Polytechnique de l'Universite de Nantes, France},
  year      = {2005},
  month     = {Oct},
  type      = {bu;td;mod;su;eye}
}

@inproceedings{Mundhenk_etal05spie,
  author    = {T. N. Mundhenk and J. Everist and C. Landauer and L. Itti and
K. Bellman},
  title     = {Distributed biologically-based real-time tracking in the
absence of prior target information},
  booktitle = {Proc. SPIE International Conference on Intelligent Robots
and Computer Vision XXIII: Algorithms, Techniques, and Active Vision},
  editor    = {D. P. Casasent and E. L. Hall and J. Roning},
  volume    = {6006},
  pages     = {142-153},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  year      = {2005},
  month     = {Oct},
  type      = {mod;cv},
  review    = {abs/conf},
  abstract  = {We are developing a distributed system for the tracking of
people and objects in complex scenes and environments using
biologically based algorithms. An important component of such a system
is its ability to track targets from multiple cameras at multiple
viewpoints. As such, our system must be able to extract and analyze
the features of targets in a manner that is sufficiently invariant of
viewpoints, so that they can share information about targets, for
purposes such as tracking. Since biological organisms are able to
describe targets to one another from very different visual
perspectives, by discovering the mechanisms by which they understand
objects, it is hoped such abilities can be imparted on a system of
distributed agents with many camera viewpoints. Our current
methodology draws from work on saliency and center surround
competition among visual components that allows for real time location
of targets without the need for prior information about the targets
visual features. For instance, gestalt principles of color
opponencies, continuity and motion form a basis to locate targets in a
logical manner. From this, targets can be located and tracked
relatively reliably for short periods. Features can then be extracted
from salient targets allowing for a signature to be stored which
describes the basic visual features of a target. This signature can
then be used to share target information with other cameras, at other
viewpoints, or may be used to create the prior information needed for
other types of trackers. Here we discuss such a system, which, without
the need for prior target feature information, extracts salient
features from a scene, binds them and uses the bound features as a set
for understanding trackable objects.}
}

@inproceedings{Baldi_Itti05icnnb,
  author    = {P. F. Baldi and L. Itti},
  title     = {Attention: Bits versus Wows},
  booktitle = {Proc. IEEE International Conference on Neural Networks and Brain,
Beijing, China},
  editor    = {M. Zhao and Z. Shi},
  volume    = {1},
  pages     = {PL56-PL61},
  year      = {2005},
  month     = {Oct},
  type      = {bu;td;mod;su;eye},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Baldi_Itti05icnnb.pdf },
  abstract  = {The concept of surprise is central to sensory processing,
adaptation and learning, attention, and decision making. Yet, no
widely-accepted mathematical theory currently exists to quantitatively
characterize surprise elicited by a stimulus or event, for observers
that range from single neurons to complex natural or engineered
systems. We describe a formal Bayesian definition of surprise that is
the only consistent formulation under minimal axiomatic
assumptions. Surprise quantifies how data affects a natural or
artificial observer, by measuring the difference between posterior and
prior beliefs of the observer. Using this framework we measure the
extent to which humans direct their gaze towards surprising items
while watching television and video games. Humans are strongly
attracted to locations of high Bayesian surprise, with 72 percent of
all human gaze shifts directed towards locations more surprising than
the average, a figure which rises to 84 percent when considering only
gaze targets simultaneously selected by all subjects. The resulting
theory of surprise is applicable across different spatio-temporal
scales, modalities, and levels of abstraction.}
}

@invited{Itti_Baldi05bip,
  author    = {L. Itti and P. F. Baldi},
  title     = {Bayesian surprise attracts human attention},
  booktitle = {International Workshop on Bioinspired Information
Processing: Cognitive modeling and gaze-based communication, Luebeck,
Germany},
  year      = {2005},
  month     = {Sep},
  type      = {mod;bu;cv;su;eye}
}

@press{Grebb05,
  author   = {M. Grebb},
  title    = {Bot Builders Scramble for Cash},
  journal  = {Wired (online)},
  year     = {2005},
  month    = {Sep},
  type     = {bu;bb},
  file     = {http://www.wired.com/news/technology/0,1282,68910,00.html},
  abstract = {With the exception of military and space applications, the
United States is falling behind Europe and Asia in robotics research,
according to an international study by the World Technology Evaluation
Center.  WTEC members gathered Friday at the National Science
Foundation to announce the study results, which indicate that, unlike
many other developed countries, the United States lacks a coordinated
strategy to cultivate robotics development.}
}

@article{Mundhenk_Itti05bc,
  author   = {T. N. Mundhenk and L. Itti},
  title    = {Computational modeling and exploration of contour integration
for visual saliency},
  journal  = {Biological Cybernetics},
  volume   = {93},
  number   = {3},
  pages    = {188-212},
  year     = {2005},
  month    = {Sep},
  type     = {bu;mod},
  file     = {http://ilab.usc.edu/publications/doc/Mundhenk_Itti05bc.pdf},
  if       = {2003 impact factor: 1.933},
  abstract = {We propose a computational model of contour integration for
visual saliency. The model uses biologically plausible devices to
simulate how the representations of elements aligned collinearly along
a contour in an image are enhanced. Our model adds such devices as a
dopamine-like fast plasticity, local GABAergic inhibition and
multi-scale processing of images. The fast plasticity addresses the
problem of how neurons in visual cortex seem to be able to influence
neurons they are not directly connected to, for instance as observed
in contour closure effect. Local GABAergic inhibition is used to
control gain in the system without using global mechanisms, which may
be non-plausible given the limited reach of axonal arbors in visual
cortex.  The model is then used to explore not only its validity in
real and artificial images, but to discover some of the mechanisms
involved in processing of complex visual features such as junctions
and end-stops as well as contours. We present evidence for the
validity of our model in several phases, starting with local
enhancement of only a few collinear elements. We then test our model
on more complex contour integration images with a large number of
Gabor elements. Sections of the model are also extracted and used to
discover how the model might relate contour integration neurons to
neurons that process end-stops and junctions. Finally, we present
results from real world images. Results from the model suggest that it
is a good current approximation of contour integration in human
vision. As well, it suggests that contour integration mechanisms may
be strongly related to mechanisms for detecting end-stops and junction
points. Additionally, a contour integration mechanism may be involved
in finding features for objects such as faces. This suggests that
visual cortex may be more information efficient and that neural
regions may have multiple roles.}
}

@article{Lu_Itti05jov,
  author   = {J. Lu and L. Itti},
  title    = {Perceptual Consequences of Feature-Based Attention},
  journal  = {Journal of Vision},
  volume   = {5},
  number   = {7},
  pages    = {622-631},
  year     = {2005},
  month    = {Aug},
  type     = {td;psy},
  file     = { http://iLab.usc.edu/publications/doc/Lu_Itti05jov.pdf },
  if       = {2005 impact factor: 3.469},
  abstract = {Attention modulates visual processing along at least two
dimensions: A spatial dimension which enhances the representation of
stimuli within the focus of attention, and a feature dimension thought
to enhance attended visual features, for example upward motion,
throughout the visual field.  We investigate the consequences of
feature-based attention onto visual perception, using dual-task human
psychophysics and two distant drifting Gabor stimuli to systematically
explore 64 combinations of visual features (orientations and drift
speeds) and tasks (discriminating orientation or drift speed).  The
resulting single, consistent dataset suggests a functional model,
which predicts a maximum-rule by which only the dominant product of
feature enhancement and feature benefit by feature relevance may
benefit perception.}
}

@article{Peters_etal05vr,
  author   = {R. J. Peters and A. Iyer and L. Itti and C. Koch},
  title    = {Components of bottom-up gaze allocation in natural images},
  journal  = {Vision Research},
  volume   = {45},
  number   = {18},
  pages    = {2397-2416},
  year     = {2005},
  month    = {Aug},
  type     = {bu;td;eye;mod},
  file     = { http://iLab.usc.edu/publications/doc/Peters_etal05vr.pdf },
  if       = {2003 impact factor: 1.958},
  abstract = {Recent research (Parkhurst et al., Vision Research 2002)
showed that a model of bottom-up visual attention can account in part
for the spatial locations fixated by humans while free-viewing complex
natural and artificial scenes. That study used a definition of
salience based on local detectors with coarse global surround
inhibition. Here, we use a similar framework to investigate the roles
of several types of nonlinear interactions known to exist in visual
cortex, and of eccentricity-dependent processing. For each of these,
we added a component to the salience model, including richer
interactions among orientation-tuned units, both at spatial short
range (for clutter reduction) and long range (for contour
facilitation), and a detailed model of eccentricity-dependent changes
in visual processing. Subjects free-viewed naturalistic and artificial
images while their eye movements were recorded, and the resulting
fixation locations were compared with the models' predicted salience
maps. We found that the proposed interactions indeed play a
significant role in the spatiotemporal deployment of attention in
natural scenes; about half of the observed inter-subject variance can
be explained by these different models. This suggests that attentional
guidance does not depend solely on local visual features, but must
also include the effects of interactions among features. As models of
these interactions become more accurate in predicting
behaviorally-relevant salient locations, they become useful to a range
of applications in computer vision and human-machine interface
design.}
}

@article{Itti05vc,
  author   = {L. Itti},
  title    = {Quantifying the Contribution of Low-Level Saliency to 
Human Eye Movements in Dynamic Scenes},
  journal  = {Visual Cognition},
  volume   = {12},
  number   = {6},
  pages    = {1093-1123},
  year     = {2005},
  month    = {Aug},
  type     = {bu ; mod ; td ; eye},
  file     = { http://iLab.usc.edu/publications/doc/Itti05vc.pdf },
  if       = {2003 impact factor: 1.588},
  keywords = {Visual attention ; eye movements ; saliency ;
bottom-up ; top-down},
  abstract = {We investigated the contribution of low-level saliency to
human eye movements in complex dynamic scenes. Eye movements were
recorded while naive observers viewed a heterogeneous collection of 50
video clips (46,489 frames; 4-6 subjects per clip), yielding 11,916
saccades of amplitude 2deg or more. A model of bottom-up visual
attention computed instantaneous saliency at the instant each saccade
started and at its future endpoint location.  Median model-predicted
saliency was 45 percent the maximum saliency, a significant factor 2.03
greater than expected by chance.  Motion and temporal change were
stronger predictors of human saccades than color, intensity or
orientation features, with the best predictor being the sum of all
features.  There was no significant correlation between
model-predicted saliency and duration of fixation.  A majority of
saccades were directed to a minority of locations reliably marked as
salient by the model, suggesting that bottom-up saliency may provide a
set of candidate saccade target locations, with the final choice of
which location to fixate more strongly determined top-down.}
}

@press{Acevski05,
  author  = {N. Acevski},
  title   = {USC Robotics Club Strives to Expand},
  journal = {The Daily Trojan},
  volume  = {CXLVII},
  number  = {7},
  pages   = {1},
  year    = {2005},
  month   = {Aug},
  type    = {bu;bb},
  note    = {Front cover, August 30, 2005}
}

@press{Gaona05sdut,
  author   = {E. Gaona},
  title    = {Below and beyond -- Students show off underwater crafts at
S.D. competition},
  journal  = {San Diego Union Tribune},
  year     = {2005},
  month    = {Aug},
  type     = {cv;bb},
  file     = {http://www.signonsandiego.com/uniontrib/20050808/news_1m8vehicle.html},
  note     = {Full-page article, one of San Diego's major newspapers},
  abstract = {Underwater robots sputtered and stalled in murky water, but
the 21 teams in yesterday's Autonomous Underwater Vehicle Competition
advanced by leagues.  ``It is truly the cream of the crop competing
here, and even if a small number of these people go into the field
then we're increasing the quality of the equipment,'' said Daryl
Davidson, executive director of the Virginia-based Association for
Unmanned Vehicle Systems International.  ``The competition gets
smarter every year.''}
}

@inproceedings{Carmi_Itti05assc,
  author    = {R. Carmi and L. Itti},
  title     = {Attention deployment in intermittently predictable environments
- from amnesia to memory and back},
  booktitle = {Proc. ninth annual meeting of the Association for the
Scientific Study of Consciousness (ASSC9), Pasadena, CA},
  year      = {2005},
  month     = {Jun},
  type      = {bu;mod;sc;eye},
  review    = {abs/conf},
  file      = { http://iLab.usc.edu/publications/doc/Carmi_Itti05assc.pdf },
  abstract  = {Paying attention to the right thing at the right time
underlies the ability of humans and other animals to learn, perceive,
and interact with their environment. What is the role of memory in
guiding attention? According to the world as an outside memory theory,
humans exploit the stability of the world to access external
information on demand, leading to conscious perceptions that are
seemingly rich and continuous without requiring detailed and
persistent internal representations. An alternative theory postulates
that attention deployment relies on detailed memory traces of relevant
inputs, which are functional for approximately one second. Here we
resolve this apparent discrepancy by showing that the impact of memory
on attention deployment depends on the availability of semantically
persistent context. We asked human observers to visually explore
MTV-style video clips, in which unpredictable scene changes occur
every 1-3 seconds, and quantified the ongoing ability of a memory-free
model of attention deployment to predict rapid gaze shifts
(saccades). Scene changes triggered memory-free influences on
attention deployment that overwhelmed previous influences within less
than 250 ms. These initial sharp increases in the impact of
memory-free influences were followed by gradual decreases, reflecting
slower increases in competing memory-dependent influences, and final
increases to an average level, demonstrating that the overall impact
of scene changes on attention deployment subsides within 2.5
seconds. Our study shows that the human attention system adapts
rapidly to changing environments, but is strongly modulated by
memory-dependent influences when semantically persistent context is
available.  }
}

@inproceedings{Itti_Arbib05assc,
  author    = {L. Itti and M. A. Arbib},
  title     = {Visual Salience Facilitates Entry into Conscious Scene
Representation},
  booktitle = {Proc. ninth annual meeting of the Association for the
Scientific Study of Consciousness (ASSC9), Pasadena, CA},
  year      = {2005},
  month     = {Jun},
  type      = {bu;mod;sc;eye},
  review    = {abs/conf},
  abstract  = {Are we more likely to consciously register, remember, and report
elements in a visual scene which are more conspicuous or salient?
Focal visual attention is known to gate low-level visual information
into higher-level processing, short-term memory, and consciousness.
However, being merely attracted to something salient and attending to
it does not guarantee that it will be retained in the conscious mental
representation of a scene. Here we provide preliminary experimental
evidence that, in dynamic natural scenes, out of all objects, actors,
and actions which are attended to, those which are verbally reported
are also more bottom-up salient than those which are not.  Using an
eye-tracker, we recorded gaze of one human participant watching twelve
30-seconds television clips, together with his online verbal
descriptions of the scenes depicted in the clips. Eye movement traces
were segmented into periods of fixation and saccadic gaze shifts. We
manually isolated saccades towards each entity that had been reported
verbally.  Using a computational model of bottom-up visual salience,
we computed dynamic salience maps for all clips.  We compare the
distribution of instantaneous salience at human saccade targets to
that at random targets using the Kullback-Leibler (KL) distance; KL
scores above zero indicate that visual salience attracted gaze more
than expected by chance.  Our findings are three-fold: first, visual
salience significantly attracted gaze overall, as scene locations
saccaded to by the observer were reliably more salient than expected
by chance (KL=0.194+/-0.019, n=992 saccades, t-test, p<10^-27).
Second, restricting the analysis only to human saccades directed
towards objects, actors, or actions mentioned in the verbal report
yielded an even higher score (KL=0.372+/-0.055, n=319, p<10^-13),
indicating that more salient scene elements were more likely to be
reported. Third, restricting the analysis to only the first saccade
onto each of the 88 different reported scene elements yielded an even
higher score (KL=0.546+/-0.120, n=88, p<0.0008), suggesting that
instantaneous salience of a scene element when first gazed at may
significantly influence whether it will be reported. In sum, our study
suggests that, out of all the targets of human gaze over complex
dynamic scenes, those which emerge as the central elements in the
conscious representation of the scene are more bottom-up salient,
supporting a role for bottom-up salience in facilitating entry into
conscious mental scene representations.}
}

@inproceedings{Bonaiuto_Itti05wapcv,
  author    = {J. Bonaiuto and L. Itti},
  title     = {Combining attention and recognition for rapid scene analysis},
  booktitle = {Proc. IEEE-CVPR Workshop on Attention and Performance
in Computer Vision (WAPCV'05), San Diego, California},
  pages     = {1-6},
  year      = {2005},
  month     = {Jun},
  type      = {bu;mod;cv},
  review    = {full/wkshp},
  file      = { http://iLab.usc.edu/publications/doc/Bonaiuto_Itti05wapcv.pdf},
  abstract  = {Bottom-up visual attention allows primates to quickly select
regions of an image that contain salient objects. In artificial
systems, restricting the task of object recognition to these regions
allows faster recognition and unsupervised learning of multiple
objects in cluttered scenes. A problem is that objects superficially
dissimilar to the target are given the same consideration in
recognition as similar objects. Here we investigate rapid pruning of
the recognition search space using the already-computed low-level
features that guide attention. Itti and Koch's bottom-up visual
attention algorithm selects salient locations based on low-level
features such as contrast, orientation, color, and intensity. Lowe's
SIFT recognition algorithm then extracts a signature of the attended
object, for comparison with the object database. The database search
is prioritized for objects which better match the low-level features
used to guide attention to the current candidate for recognition. The
SIFT signatures of prioritized database objects are then checked for
match against the attended candidate. By comparing performance of
Lowe's recognition algorithm and Itti and Koch's bottom-up attention
model with or without search space pruning, we demonstrate that our
pruning approach improves the speed of object recognition in complex
natural scenes.}
}

@inproceedings{Siagian_Itti05wapcv,
  author    = {C. Siagian and L. Itti},
  title     = {Gist: A Mobile Robotics Application of Context-Based Vision in
Outdoor Environment},
  booktitle = {Proc. IEEE-CVPR Workshop on Attention and Performance
in Computer Vision (WAPCV'05), San Diego, California},
  pages     = {1-7},
  year      = {2005},
  month     = {Jun},
  type      = {bu;mod;cv},
  review    = {full/wkshp},
  file      = { http://iLab.usc.edu/publications/doc/Siagian_Itti05wapcv.pdf },
  abstract  = {We present context-based scene recognition for mobile
robotics applications. Our classifier is able to differentiate outdoor
scenes without temporal and spatial filtering relatively well from a
variety of locations at a college campus using a set of features that
together capture the ``gist'' of the scene. We discuss and perform
experiments on the accuracy and scalability of the current
features. We compare the classification accuracy of a set of scenes
from 1551 frames filmed outdoors along a path and dividing them to
four and twelve different legs.  We obtained a classification rate of
67.96 percent and 48.61 percent, respectively. We also tested the
scalability of the features by comparing the classification results
from the previous scenes with four legs with a longer path with eleven
legs. We obtained a classification rate of 55.08 percent. In the end
we also put forth some ideas to improve upon the theoretical strength
of the gist features.}
}

@invited{Itti_Baldi05york,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Computational Vision in Neural and Machine Systems,
CVR Vision Conference, York University, Canada},
  year      = {2005},
  month     = {Jun},
  type      = {mod;bu;cv;su;eye}
}

@inproceedings{Itti_Baldi05cvpr,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Principled Approach to Detecting Surprising Events in Video},
  booktitle = {Proc. IEEE Conference on Computer Vision and Pattern
Recognition (CVPR)},
  address   = {San Diego, CA},
  pages     = {631-637},
  year      = {2005},
  month     = {Jun},
  type      = {bu ; cv ; eye ; su},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Itti_Baldi05cvpr.pdf },
  if        = {2005 acceptance rate: 28\%},
  abstract  = {Primates demonstrate unparalleled ability at rapidly orienting
towards important events in complex dynamic environments. During rapid
guidance of attention and gaze towards potential objects of interest
or potential threats, however, often there is no time for detailed
visual analysis. Thus, heuristic computations are necessary to locate
the most interesting events in quasi real-time. We present a new
theory of sensory surprise, which provides a principled and computable
shortcut to important information. We develop a model that computes
instantaneous low-level surprise at every location in video
streams. The algorithm significantly correlates with eye movements of
two human observers watching complex video clips, including television
programs (17,936 frames, 2,152 saccadic gaze shifts). The resulting
system allows more sophisticated and time-consuming image analysis to
be efficiently focused onto the most surprising subsets of the
incoming data.}
}

@invited{Itti_Baldi05uci,
  author    = {L. Itti and P. F. Baldi},
  title     = {Bayesian Surprise Attracts Human Attention},
  booktitle = {University of California, Irvine, IMBS Seminar},
  month     = {May},
  year      = {2005},
  type      = {mod;bu;cv;su;eye},
  abstract  = {The concept of surprise is central to sensory processing,
adaptation and learning, attention, and decision making. Yet, no
widely-accepted mathematical theory currently exists to quantitatively
characterize surprise elicited by a stimulus or event, for observers
that range from single neurons to complex natural or engineered
systems.  We describe a formal Bayesian definition of surprise that is
the only consistent formulation under minimal axiomatic
assumptions. Surprise quantifies how data affects a natural or
artificial observer, by measuring the difference between posterior and
prior beliefs of the observer.  Using this framework we measure the
extent to which humans look towards surprising things while watching
television and video games.  We find that surprise is the strongest
known attractor of human attention, with 72 percent of all human gaze
shifts directed towards locations more surprising than on average, a
figure which rises to 84 percent when considering only gaze targets
simultaneously selected by four humans. The resulting theory of
surprise is applicable across different modalities, datatypes, tasks,
and abstraction levels.}
}

@inproceedings{Carmi_Itti05jsnc,
  author    = {R. Carmi and L. Itti},
  title     = {Attention deployment in novel environments - from amnesia to
memory and back},
  booktitle = {Proc. 12th Joint Symposium on Neural Computation (JSNC'05),
Los Angeles, California},
  month     = {May},
  year      = {2005},
  type      = {td;psy;eye},
  review    = {abs/conf},
  abstract  = {Paying attention to the right thing at the right time
underlies the ability of humans and other animals to learn, perceive,
and interact with their environment. What is the role of memory in
guiding attention? According to ``the world as an outside memory''
theory, humans exploit the stability of the world to access external
information on demand, leading to conscious perceptions that are
seemingly rich and continuous without requiring detailed and
persistent internal representations.  An alternative theory postulates
that attention deployment relies on detailed memory traces (lasting
about 1s in real world conditions) of recent attention targets. Here
we resolve this apparent discrepancy by showing that the impact of
memory on attention deployment depends on the availability of
semantically persistent context. We asked human observers to visually
explore MTV-style video clips, in which unpredictable scene changes
occur every 1-3 seconds, and quantified the ongoing ability of a
memory-free model of attention deployment to predict rapid gaze shifts
(saccades).  Scene changes triggered memory-free influences on
attention deployment that overwhelmed previous influences within less
than 250 ms. These initial sharp increases in the impact of
memory-free influences were followed by gradual decreases, reflecting
slower increases in competing memory-dependent influences, and final
increases to an average level, demonstrating that the overall impact
of scene changes on attention deployment subsides within 2.5
seconds. Our results indicate that when semantically persistent
context is available, recently triggered memories play an important
role in guiding attention to specific locations in the visual field,
well beyond the time span of visual persistence.}
}

@inproceedings{Itti_Baldi05vss,
  author    = {L. Itti and P. F. Baldi},
  title     = {A surprise theory of attention},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS05)},
  month     = {May},
  year      = {2005},
  type      = {mod;bu;cv;su;eye},
  review    = {abs/conf},
  note      = {Oral presentation},
  abstract  = {[ORAL] Attention in biological and artificial systems
rapidly selects important information from within massive sensory
inputs, a process key to survival. When time lacks for detailed
sensory analysis, finding important information must rely on heuristic
or approximate computations. To quantitatively characterize these
computations, we propose a Bayesian definition of important
information we call surprise. Surprise quantifies how data affects a
natural or artificial observer, by measuring the difference between
prior and posterior beliefs of the observer. We argue that surprise
subsumes and extends previous often ad-hoc notions of stimulus
saliency and novelty, casting them into a single theoretical framework
derived from first principles. To test this, we measure the extent to
which ten image-based metrics that highlight different facets of
important information may predict gaze recordings of four human
observers watching 50 complex videoclip stimuli, including television
broadcast and video games (about 30 minutes in total). At the target
location of each of the 10,192 saccadic gaze shifts recorded, compared
to at random locations, we evaluate intrinsic visual properties of the
video clips using the ten computational metrics, including a surprise
metric. Extending previous findings, but for dynamic scenes, we find
that humans preferentially gaze towards locations where local entropy,
contrast, information, color, intensity and orientation responses are
higher than expected by chance (sign tests, p<1.0E-100 or
better). Furthermore, metrics computing dynamic image features like
flicker, motion, saliency and surprise correlate even better with
human eye movements. Out of all metrics, surprise significantly stands
out as best-scoring (t-tests, p<1.0E-100 or better). Our data shows
that guiding attention towards intrinsically surprising stimuli is an
efficient shortcut to important information.}
}

@inproceedings{Carmi_Itti05vss,
  author    = {R. Carmi and L. Itti},
  title     = {Why do we fail to perceive jump-cuts in motion pictures?},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS05)},
  month     = {May},
  year      = {2005},
  type      = {mod;bu;eye},
  review    = {abs/conf},
  abstract  = {Motivation: For more than a century, motion pictures have
been extremely successful in attracting people's attention, yet their
psychology is poorly understood or even addressed by the scientific
community. One of the more puzzling practices in motion pictures is
the frequent use of jump-cuts (abrupt transitions between adjacent
scene shots), which are the staple of music television
(MTV). Attention research suggests that our seemingly continuous and
detailed perception of the real world is the product of highly
incomplete internal representations that depend on selective on-demand
sampling of continuous environmental inputs. If input continuity is so
important, why do jump-cuts often go unnoticed?  Methods: In order to
examine the effects of jump-cuts on attentional allocation, we first
constructed MTV-style clips, which featured persistent context for 1-3
seconds, from a diverse collection of continuous clips that depict
photography-based and computer-generated dynamic scenes. We then
performed two eye-tracking experiments with separate groups of
subjects, each inspecting either continuous or MTV-style clips. In
order to measure the persistence of attention-guiding representations
(AGRs), we quantified changes in their inputs and outputs using either
local intensity contrast or saliency as probes for the ongoing impact
of bottom-up influences on saccade target selection.  Results:
Jump-cuts update AGRs within less than 250 ms. AGRs persist for less
than 2 seconds even during inspection of continuous clips.
Conclusions: We propose that perceptual continuity is often
unperturbed across jump-cuts, despite physical discontinuities, thanks
to the briefness and sparseness of mental representations, combined
with the ingenuity of moviemakers in manipulating these
representations. Our results indicate that integrating computational
attention research with the art and technology of moviemaking is
technically feasible, and can advance the understanding and practice
of both fields.}
}

@inproceedings{Peters_etal05vss,
  author    = {R. J. Peters and A. Iyer and C. Koch and L. Itti},
  title     = {Components of Bottom-Up Gaze Allocation in Natural Scenes},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS05)},
  month     = {May},
  year      = {2005},
  type      = {mod;bu;eye},
  review    = {abs/conf},
  abstract  = {A model of bottom-up visual attention (``baseline salience
model'', based on local detectors with coarse global surround
inhibition) has been shown (Parkhurst et al., 2002) to account in part
for the spatial locations fixated by people while free-viewing complex
natural and artificial scenes. Here, we tested the additional roles in
bottom-up gaze allocation played by several visual cortical
mechanisms. In each case, we added a component to the salience model:
non-linear interactions among orientation-tuned units both at short
spatial ranges (for clutter reduction) and long ranges (for contour
facilitation), and a detailed model of eccentricity-dependent changes
in visual processing. Subjects free-viewed naturalistic and artificial
images while their eye movements were recorded, and we used a metric
called the Normalized Scanpath Salience (NSS) to compare the resulting
fixation locations with the different models' predicted salience
maps. NSS values indicate, on average, how many standard deviations
above or below the mean salience was the model-predicted salience at
human-fixated locations. Thus the minimum NSS value (when the model
and human behavior are unrelated) is 0; the theoretical maximum NSS
value is given by the ability of one observer's fixations to be
predicted by the remaining observers' fixations, which in practice
fell in the range 1.1--1.3 for different image categories. The
baseline salience model predicted fixations at 39--57 percent of the
maximum NSS level. Adding short-range orientation interactions
increased this range to 50--65 percent, contour facilitation further
increased it to 53--74 percent, and eccentricity-dependent processing
increased it to 84--95 percent. Thus the proposed cortical
interactions indeed appear to play a significant role in the
spatiotemporal deployment of attention in natural scenes. This
suggests that bottom-up attentional guidance does not depend solely on
local visual features, but must also include the effects of non-local
interactions.}
}

@inproceedings{Lu_etal05vss,
  author    = {J. Lu and R. Yakupov and C. Lozar and L. Chang and T. Ernst
and L. Itti},
  title     = {Feature-based Attention Is Also Object-based},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS05)},
  month     = {May},
  year      = {2005},
  type      = {mod;td;psy},
  review    = {abs/conf},
  abstract  = {Feature-based attention has been revealed as a global
enhancement of attended visual features throughout the visual
cortex. Object-based attention was shown as better performance when
concurrently discriminating two features of the same object compared
to two features of different objects. We used high field strength (4T)
functional MRI to investigate whether feature-based attention is also
object-based, i.e., does cortical enhancement of attended features
result from subjects treating two features as a single same object?
The stimuli were two drifting Gabor patches presented bilaterally to
the central fixation cross. Subjects performed orientation
discrimination using a two-interval forced-choice paradigm on one side
and ignored the stimulus on the other side. The ignored stimulus was
always vertical (V) and drifting slowly (S) and the attended stimulus
either was identical (VS) or horizontal (H) and drifting faster (F). We
compared visual cortical enhancement of the ignored stimulus when
subjects attended on the other side to either the identical (VS) or a
different (HF) stimulus in two conditions: either the Gabor stimuli on
both sides appeared to belong to a same object (with both Gabors
simply displayed on a normal gray background), or as two separate
objects (each Gabor displayed in a grey box appearing on top of a
textured background, with cast shadows effects around the
boxes). Results showed that in the single-object condition all three
subjects consistently had significant enhancement of the ignored
stimulus (SPM T-values T1=4.43; T2=4.24; T3=3.93 with
P(uncorrect)<0.001, TR=3s, Voxel size=3x3x3mm^3) in area MT+, which
confirmed previous observations of feature-based attentional
modulation. But this enhancement disappeared (no significant
enhancement) when stimuli appeared as two different objects. These
results indicate that feature-based attentional modulation is also
object-based, i.e., only occurs between features that belong to a
same object.}
}

@inproceedings{Navalpakkam_Itti05vss,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Attention can be guided to the relevant feature category},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS05)},
  month     = {May},
  year      = {2005},
  type      = {bu;td;eye},
  review    = {abs/conf},
  abstract  = {When the visual system is challenged by distractor
heterogeneity and target-distractor similarity, can search be sped up
by guiding attention to the relevant feature category, i.e., one that
selectively promotes the target, and inhibits the distractors?
Previous studies measuring reaction time suggest that for small
feature differences between the target and distractors, search is
inefficient when the target is flanked by distractors in feature space
(D'Zmura, Vis Res 1991;31(6):951-966). This has been widely
demonstrated in size (Treisman and Gelade, Cog Psy 1980;12:97-136),
orientation (Wolfe et al., J Exp Psy 1992;18(1):34-49) and color
(Bauer et al., Vis Res 1996;36(10):1439-1465). A widely accepted
inference from these studies is that attention cannot be guided to a
category, otherwise search for the medium target would be
efficient. But this inference need not be true. We verified through
eye tracking methods that despite inefficient search for the medium
target (in size and color), a significantly higher number of fixations
landed on items in the medium category than less or high
categories. Our results suggest that indeed attention can be guided to
a category. To reconcile previous results with our data, we propose a
new computational mechanism which suggests that feature dimensions are
encoded in cortex by broadly tuned ``categorical'' channels and that
top-down influence can selectively boost the relevant category. Hence,
inefficient search for the medium target occurs due to increased
overlap between target and distractor categories, so that boosting the
relevant medium category may falsely activate some distractors that
belong to overlapping categories.}
}

@invited{Itti_Baldi05ucla,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Early Attention},
  booktitle = {UCLA Computer Vision Group, Westwood, California},
  month     = {Apr},
  year      = {2005},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti_Baldi05imsc,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {USC-IMSC Student Council Speaker Series},
  month     = {Apr},
  year      = {2005},
  type      = {mod;bu;cv;su;eye},
  abstract  = {The concept of surprise is central to sensory processing,
adaptation and learning, attention, and decision making. Yet, no
widely-accepted mathematical theory currently exists to quantitatively
characterize surprise elicited by a stimulus or event, for observers
that range from single neurons to complex natural or engineered
systems.  We describe a formal Bayesian definition of surprise that is
the only consistent formulation under minimal axiomatic
assumptions. Surprise quantifies how data affects a natural or
artificial observer, by measuring the difference between posterior and
prior beliefs of the observer.  Using this framework we measure the
extent to which humans look towards surprising things while watching
television and video games.  We find that surprise is the strongest
known attractor of human attention, with 72 percent of all human gaze
shifts directed towards locations more surprising than on average, a
figure which rises to 84 percent when considering only gaze targets
simultaneously selected by four humans. The resulting theory of
surprise is applicable across different modalities, datatypes, tasks,
and abstraction levels.}
}

@invited{Itti_Baldi05cns,
  author    = {L. Itti and P. F. Baldi},
  title     = {The salience map: a local surprise detector?},
  booktitle = {Cognitive Neuroscience Annual Meeting,
Workshop on the Saliency Map, New York, N.Y.},
  month     = {Apr},
  year      = {2005},
  type      = {mod;bu;cv;su;eye},
  abstract  = {To investigate the extent to which salience maps may
highlight regions in the visual field with high information content,
we propose a subjective definition of information we call surprise, to
quantify how data affects a natural or artificial observer, by
measuring the difference between prior and posterior beliefs of that
observer.  We argue that surprise is better suited to studying
subjective aspects of brain function and behavior than Shannon
information, particularly sensory processing and novelty detection.
Thus, we build a computational model of early vision and attention,
which topographically computes visual surprise at every location in a
salience map. It outperforms Shannon information and other models in
predicting gaze of four humans watching 50 complex videoclips. This
suggests that visual locations which appear as salient to an observer
may do so because they are surprising more than because they are
informative.}
}

@invited{Itti_Baldi05negi,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Neuroscience-Enabled Geospatial Intelligence Workshop,
Washington, D.C.},
  month     = {Mar},
  year      = {2005},
  type      = {mod;bu;cv;su;eye}
}

@article{Ackerman_Itti05tro,
  author   = {C. Ackerman and L. Itti},
  title    = {Robot Steering With Spectral Image Information},
  journal  = {IEEE Transactions on Robotics},
  volume   = {21},
  number   = {2},
  pages    = {247-251},
  month    = {Apr},
  year     = {2005},
  keywords = {autonomous robot ; Fourier transform ; vision ; path following ;
navigation ; gist of a scene},
  type     = { bb ; cv ; sc},
  if       = {2003 impact factor: 2.103},
  file     = {http://ilab.usc.edu/publications/doc/Ackerman_Itti05tro.pdf},
  abstract = {We introduce a method for rapidly classifying visual scenes,
globally along a small number of navigationally relevant dimensions:
depth of scene, presence of obstacles, path vs. non-path, and
orientation of path. We show that the algorithm reliably classifies
scenes in terms of these high-level features, based on global or
coarsely localized spectral analysis analogous to early-stage
biological vision. We use this analysis to implement a real-time
visual navigational system on a mobile robot, trained online by a
human operator. We demonstrate successful training and subsequent
autonomous path following for two different out-door environments, a
running track and a concrete trail. Our success with this technique
suggests a general applicability to autonomous robot navigation in a
variety of environments.}
}

@invited{Itti_Baldi05google,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Google Inc., Mountain View, California},
  month     = {Mar},
  year      = {2005},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti_Baldi05ss,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Second Sight Inc., Sylmar, California},
  month     = {Mar},
  year      = {2005},
  type      = {mod;bu;cv;su;eye}
}

@article{Arbib_Mundhenk05np,
  author   = {M. A. Arbib and T. N. Mundhenk},
  title    = {Schizophrenia and the mirror system: an essay},
  journal  = {Neuropsychologia},
  volume   = {43},
  number   = {2},
  pages    = {268-280},
  month    = {Feb},
  year     = {2005},
  address  = {Computer Science, Neuroscience, and the USC Brain Project,
University of Southern California, Los Angeles, CA 90089-2520, USA.},
  type     = { mod ; sc},
  if       = {2003 impact factor: 2.695},
  file     = { http://iLab.usc.edu/publications/doc/Arbib_Mundhenk05np.pdf },
  abstract = {We analyze how data on the mirror system for grasping in macaque and
human ground the mirror system hypothesis for the evolution of the
language-ready human brain, and then focus on this putative relation
between hand movements and speech to contribute to the understanding of how
it may be that a schizophrenic patient generates an action (whether manual
or verbal) but does not attribute the generation of that action to himself.
We make a crucial discussion between self-monitoring and attribution of
agency. We suggest that verbal hallucinations occur when an utterance
progresses through verbal creation pathways and returns as a vocalization
observed, only to be dismissed as external since no record of its being
created has been kept. Schizophrenic patients on this theory then
confabulate the agent.}
}

@book{Itti_etal05noa,
  title     = {Neurobiology of Attention},
  editor    = {L. Itti and G. Rees and J. K. Tsotsos},
  publisher = {Elsevier},
  address   = {San Diego, CA},
  pages     = {1-744},
  month     = {Jan},
  year      = {2005},
  type      = {bu;td;psy;mod;fmri;cv;sc;eye},
  note      = {The first encyclopedic volume on attention research, with 111 chapters
from over 160 experts in the field.},
  file      = {http://ilab.usc.edu/publications/doc/NeurobiologyOfAttention/},
  abstract  = {A key property of neural processing in many animals is the
capability to focus resources, by selectively directing attention
towards the most important sensory inputs of the moment.  Attention
research has shown rapid growth over the past two decades, as new
techniques have become available to study higher brain function in
humans, non-human primates, and other mammals.  Neurobiology of
Attention is the first encyclopedic volume to summarize the latest
developments in attention research. An authoritative collection of 111
concise articles organized into thematic sections provides both broad
coverage and access to focused, up-to-date research findings.  The
volume presents a state-of-the-art multidisciplinary perspective on
psychological, physiological and computational approaches to
understanding the neurobiology of attention. Ideal for students, as a
reference handbook, or for rapid browsing, the book has a wide appeal
to anybody interested in attention research.}
}

@incollection{Navalpakkam_etal05noa,
  author    = {V. Navalpakkam and M. A. Arbib and L. Itti},
  title     = {Attention and Scene Understanding},
  booktitle = {Neurobiology of Attention},
  editor    = {L. Itti and G. Rees and J. K. Tsotsos},
  publisher = {Elsevier},
  address   = {San Diego, CA},
  pages     = {197-203},
  month     = {Jan},
  year      = {2005},
  type      = {bu;td;psy;mod;sc},
  file      = { http://iLab.usc.edu/publications/doc/Navalpakkam_etal05noa.pdf },
  abstract  = {This paper presents a simplified, introductory view of how
visual attention may contribute to and integrate within the broader
framework of visual scene understanding.  Several key components are
identified which cooperate with attention during the analysis of
complex dynamic visual inputs, namely rapid computation of scene gist
and layout, localized object recognition and tracking at attended
locations, working memory that holds a representation of currently
relevant targets, and long-term memory of known world entities and
their inter-relationships. Evidence from neurobiology and
psychophysics is provided to support the proposed architecture.}
}

@incollection{Itti05noa,
  author    = {L. Itti},
  title     = {Models of Bottom-Up Attention and Saliency},
  booktitle = {Neurobiology of Attention},
  editor    = {L. Itti and G. Rees and J. K. Tsotsos},
  publisher = {Elsevier},
  address   = {San Diego, CA},
  pages     = {576-582},
  month     = {Jan},
  year      = {2005},
  type      = {bu;psy;mod},
  file      = { http://iLab.usc.edu/publications/doc/Itti05noa.pdf },
  abstract  = {Visually conspicuous, or so-called salient, stimuli often
have the capability of attracting focal visual attention towards their
locations. Several computational architectures subserving this
bottom-up, stimulus-driven, spatiotemporal deployment of attention are
reviewed in this article. The resulting computational models have
applications not only to the prediction of visual search
psychophysics, but also, in the domain of machine vision, to the rapid
selection of regions of interest in complex, cluttered visual
environments. We describe an unusual such application, to the objective
evaluation of advertising designs.}
}

@incollection{Tsotsos_etal05noa,
  author    = {J. K. Tsotsos and L. Itti and G. Rees},
  title     = {A Brief and Selective History of Attention},
  booktitle = {Neurobiology of Attention},
  editor    = {L. Itti and G. Rees and J. K. Tsotsos},
  publisher = {Elsevier},
  address   = {San Diego, CA},
  pages     = {xxiii-xxxii},
  month     = {Jan},
  year      = {2005},
  type      = {bu;td;psy;mod;sc}
}

@article{Navalpakkam_Itti05vr,
  author   = {V. Navalpakkam and L. Itti},
  title    = {Modeling the influence of task on attention},
  journal  = {Vision Research},
  volume   = {45},
  number   = {2},
  pages    = {205-231},
  month    = {Jan},
  year     = {2005},
  keywords = {Attention ; top-down ; bottom-up ; object detection ;
recognition ; task-relevance ; scene analysis},
  type     = {bu ; td ; mod ; sc},
  if       = {2003 impact factor: 1.958},
  file     = { http://iLab.usc.edu/publications/doc/Navalpakkam_Itti05vr.pdf },
  abstract = {We propose a computational model for the task-specific
guidance of visual attention in real-world scenes.  Our model
emphasizes four aspects that are important in biological vision:
determining task-relevance of an entity, biasing attention for the
low-level visual features of desired targets, recognizing these
targets using the same low-level features, and incrementally building
a visual map of task-relevance at every scene location.  Given a task
definition in the form of keywords, the model first determines and
stores the task-relevant entities in working memory, using prior
knowledge stored in long-term memory.  It attempts to detect the most
relevant entity by biasing its visual attention system with the
entity's learned low-level features.  It attends to the most salient
location in the scene, and attempts to recognize the attended object
through hierarchical matching against object representations stored in
long-term memory.  It updates its working memory with the
task-relevance of the recognized entity and updates a topographic
task-relevance map with the location and relevance of the recognized
entity.  The model is tested on three types of tasks: single-target
detection in 343 natural and synthetic images, where biasing for the
target accelerates target detection over two-fold on average;
sequential multiple-target detection in 28 natural images, where
biasing, recognition, working memory and long term memory contribute
to rapidly finding all targets; and learning a map of likely locations
of cars from a video clip filmed while driving on a highway. The
model's performance on search for single features and feature
conjunctions is consistent with existing psychophysical data.  These
results of our biologically-motivated architecture suggest that the
model may provide a reasonable approximation to many brain processes
involved in complex task-driven visual behaviors.}
}

@inproceedings{Edgington_etal04aved,
  author    = {D. R. Edgington and I. Kerkez and D. Oliver and L. Kuhnz and
D. Cline and D. Walther and L. Itti},
  title     = {Detecting Benthic Megafauna in Underwater Video},
  booktitle = {Proc. 2004 AGU Fall Meeting (AVED)},
  volume    = {85},
  number    = {47},
  pages     = {OS43B/0551},
  address   = {San Francisco, California},
  month     = {Dec},
  year      = {2004},
  type      = {bu;mod;cv},
  review    = {abs/conf},
  abstract  = {Remotely operated vehicles (ROVs) have revolutionized
oceanographic research, supplementing traditional technologies of
acoustics and trawling as tools which assess animal diversity,
distribution and abundance. Video equipment deployed on ROVs enable
quantitative video transects (QVTs) to be recorded from ocean
habitats, providing high-resolution imagery on the scale of individual
organisms and their associated habitat.  Currently, the manual method
employed by trained scientists analyzing QVTs is labor-intensive and
costly, limiting the amount of data analyzed from ROV dives.  An
automated system for detecting organisms and identifying objects
visible in video would address these concerns.  Automated event
detection (scene segmentation) is a step towards an automated
analytical system for QVTs.  In the work presented here, video frames
are processed with a neuromorphic selective-attention algorithm. The
candidate locations identified by the attention selection module are
subject to a number of parameters. These parameters, combined with
successful tracking over several frames, determine whether detected
events are deemed ``interesting'' or ``boring''. ``Interesting''
events are marked in the video frames for subsequent identification
and processing.  As reported previously for mid-water QVTs, the system
agrees with professional annotations 80 percent of the time. Poor
contrast of small translucent animals in conjunction with the presence
of debris (``marine snow'') complicates automated event detection.
While the visual characteristics of the seafloor (benthic) habitat are
very different from the mid-water environment, the system yields a 92
percent correlation of detected animals on the seafloor compared with
professional annotations. We present data detailing the comparison
between a) automated detection and b) professional detection and
classification, and we outline plans for future development of
automated analysis.}
}


@invited{Itti_Baldi04epfl,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Ecole Polytechnique Federale de Lausanne, Lausanne, Switzerland},
  month     = {Dec},
  year      = {2004},
  type      = {mod;bu;cv;su;eye}
}

@inproceedings{Mundhenk_etal04spie,
  author    = {T. N. Mundhenk and C. Landauer and K. Bellman and
M. A. Arbib and L. Itti},
  title     = {Teaching the computer subjective notions of feature
connectedness in a visual scene for real time vision},
  booktitle = {Proc. SPIE Conference on Intelligent Robots and Computer
Vision XXII: Algorithms, Techniques, and Active Vision},
  editor    = {D. P. Casasent and E. L. Hall and J. Roning},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  volume    = {5608},
  pages     = {136-147},
  month     = {Oct},
  year      = {2004},
  type      = {bu;bb;mod;cv},
  review    = {abs/conf},
  file      = { http://iLab.usc.edu/publications/doc/Mundhenk_etal04spie.pdf },
  abstract  = {We discuss a tool kit for usage in scene understanding
where prior information about targets is not necessarily
understood. As such, we give it a notion of connectivity such that it
can classify features in an image for the purpose of tracking and
identification. The tool VFAT (Visual Feature Analysis Tool) is
designed to work in real time in an intelligent multi agent room. It
is built around a modular design and includes several fast vision
processes. The first components discussed are for feature selection
using visual saliency and Monte Carlo selection. Then features that
have been selected from an image are mixed into useful and more
complex features. All the features are then reduced in dimension and
contrasted using a combination of Independent Component Analysis and
Principal Component Analysis (ICA/PCA). Once this has been done, we
classify features using a custom non-parametric classifier
(NPclassify) that does not require hard parameters such as class size
or number of classes so that VFAT can create classes without stringent
priors about class structure. These classes are then generalized using
Gaussian regions which allows easier storage of class properties and
computation of probability for class matching.  To speed up the
creation of Gaussian regions we use a system of rotations instead of
the traditional Pseudo-inverse method. In addition to discussing the
structure of VFAT we discuss training of the current system which is
relatively easy to perform. ICA/PCA is trained by giving VFAT a large
number of random images. The ICA/PCA matrix is computed by features
extracted by VFAT. The non-parametric classifier NPclassify is trained
by presenting it with images of objects and having it decide how many
objects it thinks it sees. The difference between what it sees and
what it is supposed to see in terms of the number of objects is used
as the error term and allows VFAT to learn to classify based upon the
experimenter's subjective idea of good classification.}
}

@invited{Itti_Baldi04cns,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprise Theory of Attention},
  booktitle = {Caltech Computations and Neural Systems Anniversary,
Pasadena, California},
  month     = {Nov},
  year      = {2004},
  type      = {mod;bu;cv;su;eye}
}

@inproceedings{Itti_Baldi04aipr,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprising Theory of Attention},
  booktitle = {Proc. IEEE Workshop on Applied Imagery and Pattern
Recognition (AIPR)},
  month     = {Oct},
  year      = {2004},
  abstract  = {The concept of information is central to science, technology and
biological function. Shannon's theory of information, although eminently
successful for the development of modern computer and telecommunication
technologies, does not capture subjective and semantic aspects of
information that are not related to transmission but rather to observer
expectations. Here we propose a subjective definition of information we
call surprise, to quantify how data affects a (natural or artificial)
observer, by measuring the difference between prior and posterior
distributions of observer belief over families of models for the data.
Surprise requires averaging over the space of models, contrasting with
Shannon entropy which averages over data.  We argue that biological
sensory neurons signal quantities closer to surprise than Shannon
information. To test this, we build a biologically-plausible computational
model of early vision and bottom-up attention, which topographically
computes low-level visual surprise. The model outperforms Shannon
information-based models in predicting eye movement recordings of four
human observers watching 50 complex video stimuli, including television
broadcast.  The resulting surprise theory of attention and subjective
information foraging is applicable across different modalities, datatypes
and abstraction levels.},
  type      = {bu;mod;cv;su},
  review    = {abs/wkshp}
}

@inproceedings{Natarajan_etal04spie,
  author    = {P. Natarajan and T. N. Mundhenk and K. Bellman and L. Itti
and M. A. Arbib},
  title     = {Camera localization methods for intelligent room systems using
RF techniques},
  booktitle = {Proc. SPIE International Conference on Intelligent Robots
and Computer Vision XXII: Algorithms, Techniques, and Active Vision},
  editor    = {D. P. Casasent and E. L. Hall and J. Roning},
  volume    = {5608},
  pages     = {177-187},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  month     = {Oct},
  year      = {2004},
  abstract  = {One of the important components of a multi sensor
intelligent room, which can observe, track and react to its occupants,
is a multi camera system. This system involves the development of
algorithms that enable a set of cameras to communicate and cooperate
with each other effectively so that they can monitor the events
happening in the room. To achieve this, the cameras typically must
first build a map of their relative locations. In this paper, we
discuss a novel RF based technique for estimating distances between
cameras. The algorithm proposed for RF can estimate distances with
relatively good accuracy even in the presence of random noise.},
  type      = {mod;cv},
  review    = {abs/conf}
}

@inproceedings{Navalpakkam_Itti04sfn,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Modeling the influence of target and distractor knowledge on
visual search},
  booktitle = {Proc. Society for Neuroscience Annual Meeting (SFN'04)},
  month     = {Oct},
  year      = {2004},
  abstract  = {Previous research on visual search suggests that knowledge
of the target leads to a multiplicative increase in the activity of
neurons that encode target features (i.e., gain factor > 1). We
investigate how the added knowledge of the distractors influences the
optimal choice of gain factors on target and non-target features, such
that the target salience increases relative to the distractors.
Assuming Poisson firing for neurons tuned to various visual features,
we wish to decide the gain factor on each feature type that will
maximize target detection and minimize distractor detection.  If a
distractor has the same amount of a feature as the target, the
distributions of responses from neurons tuned to that feature fully
overlap, and target and distractors will be detected equally. If a
distractor has a greater amount of a feature than the target, then the
distribution of distractor responses is shifted towards higher values,
and a negative gain factor will optimize detection. Further, if the
distractor has a new feature missing from the target, a negative gain
factor on the new feature will optimize detection.  To verify these
hypotheses, we designed search arrays with one target and equal
numbers of distractors of type SAME (same amount of feature as in
target), MORE (greater amount of feature), and NEW (new feature
missing from target). We ran 7 subjects for 600 trials each for 5 days
and recorded their eye movements. Measurement of the number of
fixations on each type of distractors revealed significantly more
fixations on SAME distractors than MORE (p < 10^-10) and NEW (p <
10^-8).  Our results suggest that subjects use a negative gain factor
on all features present less in target than distractors. This
prediction of negative gain at the behavioral level suggests the
presence of neurons encoding the absence of features, whose activity
is negatively correlated with those detecting the presence of
features, leading to interesting predictions for search asymmetry.},
  type      = {mod;psy;bu;sc},
  review    = {abs/conf}
}

@invited{Itti_Baldi04hrl,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprising Theory of Attention},
  booktitle = {Hughes Research Laboratories, Malibu, California},
  month     = {Sep},
  year      = {2004},
  abstract  = {The concept of information is central to science,
technology, and biology.  Shannon's information theory, although
successful for developing computer and telecommunication technologies,
does not capture subjective and semantic aspects of information not
related to its transmission but rather to expectations of
observers. We propose a subjective definition of information we call
surprise, to quantify how data affects a natural or artificial
observer, by measuring the difference between prior and posterior
beliefs of that observer.  We argue that surprise is better suited to
studying subjective aspects of brain function and behavior,
particularly sensory processing and novelty detection.  Thus, we build
a computational model of early vision and attention, which
topographically computes visual surprise. It outperforms Shannon
information and other models in predicting gaze of four humans
watching 50 complex videoclips.  The resulting surprise theory of
attention and subjective information foraging is applicable across
different modalities, datatypes, tasks, and abstraction levels.},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti_Baldi04rap,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprising Theory of Attention},
  booktitle = {USC Research Activities Presentation Day,
Los Angeles, California},
  month     = {Aug},
  year      = {2004},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti_Baldi04onr,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprising Theory of Attention},
  booktitle = {Office of Naval Research Workshop, Reno, Nevada},
  month     = {Aug},
  year      = {2004},
  type      = {mod;bu;cv;su;eye}
}

@invited{Itti04sab,
  author    = {L. Itti},
  title     = {Automatic Eye and Head Animation for Animats},
  booktitle = {From Animals to Animats 8, Proceedings of the Eighth
International Conference on the Simulation of Autonomous Behavior,
Santa Monica, California},
  month     = {Jul},
  year      = {2004},
  abstract  = {We propose a computational model for the automatic animation
of the eyes and head of virtual or physical avatars. Given any input
in the form of video streams, the model finds the most salient
(interesting) locations in the agent's visual environment and directs
its gaze towards them. The computation of visual salience at the basis
of the model relies on a neurobiological model of visual processing
along the occipito-parietal stream in the primate brain. The relative
contributions of eyes and head towards a given gaze shift are then
computed from a gaze decomposition model derived from behavioral
recordings in Rhesus monkeys. Finally, the dynamics of eye and head
movements are calibrated against behavioral recordings from human
subjects. The model autonomously gazes towards locations also gazed to
by human observers watching the same video inputs, in a highly
significant manner.},
  type      = {cv ; bu ; mod},
  note      = {Plenary lecture}
}

@inproceedings{Siagian_Itti04fpiv,
  author    = {C. Siagian and L. Itti},
  title     = {Biologically-Inspired Face Detection: Non-Brute-Force-Search Approach},
  booktitle = {First IEEE-CVPR International Workshop on Face Processing in Video},
  pages     = {62-69},
  month     = {Jun},
  year      = {2004},
  abstract  = {We present a biologically-inspired face detection
system. The system applies notions such as saliency, gist, and gaze to
localize a face without performing blind spatial search. The saliency
model consists of highly parallel low-level computations that operate
in domains such as intensity, orientation, and color. It is used to
direct attention to a set of conspicuous locations in an image as
starting points. The gist model, computed in parallel with the
saliency model, estimates holistic image characteristics such as
dominant contours and magnitude in high and low spatial frequency
bands. We are limiting its use to predicting the likely head size
based on the entire scene. Also, instead of identifying face as a
single entity, this system performs detection by parts and uses
spatial configuration constraints to be robust against occlusion and
perspective.},
  type      = {cv ; bu ; sc},
  file      = { http://iLab.usc.edu/publications/doc/Siagian_Itti04fpiv.pdf },
  review    = {full/wkshp}
}

@invited{Itti04fpiv,
  author    = {L. Itti},
  title     = {Biological Models of Vision and Attention for Face Detection in
Natural Scenes},
  booktitle = {First IEEE-CVPR International Workshop on Face Processing
in Video, Washington, D.C.},
  month     = {Jun},
  year      = {2004},
  type      = {cv ; bu ; mod},
  note      = {Plenary lecture}
}

@article{Itti04tip,
  author   = {L. Itti},
  title    = {Automatic Foveation for Video Compression Using a
  Neurobiological Model of Visual Attention},
  journal  = {IEEE Transactions on Image Processing},
  volume   = {13},
  number   = {10},
  pages    = {1304-1318},
  month    = {Oct},
  year     = {2004},
  abstract = {We evaluate the applicability of a biologically-motivated
algorithm to select visually-salient regions of interest in video
streams for multiply-foveated video compression.  Regions are selected
based on a nonlinear integration of low-level visual cues, mimicking
processing in primate occipital and posterior parietal cortex. A
dynamic foveation filter then blurs every frame, increasingly with
distance from salient locations.  Sixty-three variants of the
algorithm (varying number and shape of virtual foveas, maximum blur,
and saliency competition) are evaluated against an outdoor video
scene, using MPEG-1 and constant-quality MPEG-4 (DivX) encoding.
Additional compression ratios of 1.1 to 8.5 are achieved by foveation.
Two variants of the algorithm are validated against eye fixations
recorded from 4-6 human observers on a heterogeneous collection of 50
video clips (over 45,000 frames in total).  Significantly higher
overlap than expected by chance is found between human and algorithmic
foveations.  With both variants, foveated clips are on average
approximately half the size of unfoveated clips, for both MPEG-1 and
MPEG-4. These results suggest a general-purpose usefulness of the
algorithm in improving compression ratios of unconstrained video.},
  keywords = {Visual attention ; video compression ; saliency ; bottom-up ; eye
movements ; foveated},
  type     = { bu ; mod ; cv ; eye },
  file     = { http://iLab.usc.edu/publications/doc/Itti04tip.pdf },
  if       = {2003 impact factor: 2.642}
}

% NOTE(review): this key lacks the _Baldi part used by the sibling talks
% (Itti_Baldi04hrl, Itti_Baldi04rap, Itti_Baldi04onr) although P. F. Baldi is
% listed as co-author; key kept unchanged so existing citations still resolve.
@invited{Itti04tno,
  author    = {L. Itti and P. F. Baldi},
  title     = {A Surprising Theory of Attention},
  booktitle = {T.N.O. Human Factors Research Institute, Utrecht, The Netherlands},
  month     = {Jun},
  year      = {2004},
  type      = {mod;bu;cv;su;eye}
}

@inproceedings{Navalpakkam_Itti04jsnc,
  author    = {V. Navalpakkam and L. Itti},
  title     = {A mathematical framework for the design and analysis of feature biasing strategies},
  booktitle = {Proc. 11th Joint Symposium on Neural Computation (JSNC'04),
Los Angeles, California},
  month     = {May},
  year      = {2004},
  abstract  = {Aim: Given a target and a set of distractors, we wish to
find a desirable feature biasing strategy that will render the target
most salient and suppress interference from the distractors. That is,
we wish to find how bottom-up features such as color, intensity,
orientation can be biased in a top-down manner so as to enable quick
detection of the target amidst distractors. Desirable feature biasing
strategy: If a feature is present in the target and absent in the
distractor, promoting it can further boost the target's
salience. Since it already leads to a pop-out that is optimal
performance, biasing is not required. If a feature is present in both
the target and the distractor in an equal amount, then biasing the
feature cannot yield a performance gain. To investigate this, we
designed a SAME type distractor (see figure 2). Whereas, if the
feature is present in a less amount in the target as compared to the
distractor, then suppressing this feature can boost the target's
salience relative to the distractor. To investigate this, we designed
a MORE type distractor that contained the target's feature in a
greater amount, and another extreme case - a NEW type distractor that
contained a new feature absent in the target. Design and analysis of
experiment: To test whether humans use the desirable strategy, we
designed search arrays containing the target and all 3 types of
distractors and measured the relative number of fixations on each type
of distractor (see figure 1 for the list of target and distractors and
their difference, and figure 3 for sample search arrays). We tested 7
subjects on 600 trials over a period of 5 days each. Paired t tests
over the combined data of all subjects and also for individual
subjects supported our hypothesis that humans fixate more on SAME type
distractors than MORE (p value = 7.6471e-11) and NEW type distractors
(p value = 1.4125e-09), indicating that they used the desirable
strategy of suppressing all features as each feature was present in a
lesser amount in the target than the distractors. Conclusion: We have
provided a mathematical framework for the analysis and design of
experiments to test various feature biasing strategies, to determine a
desirable strategy and to test whether humans use that strategy.},
  type      = {td;psy;eye},
  file      = { http://iLab.usc.edu/publications/doc/Navalpakkam_Itti04jsnc.pdf },
  review    = {abs/conf}
}

% NOTE(review): the title spells "topdown" and "bottom up" while the abstract
% uses "top-down" and "bottom-up"; title left as recorded - verify against
% the published abstract before changing it.
@inproceedings{Carmi_Itti04jsnc,
  author    = {R. Carmi and L. Itti},
  title     = {Disentangling topdown from bottom up influences on attentional allocation in dynamic scenes},
  booktitle = {Proc. 11th Joint Symposium on Neural Computation (JSNC'04),
Los Angeles, California},
  month     = {May},
  year      = {2004},
  abstract  = {Motivation: Attentional allocation is determined by the
interplay between bottom-up and top-down influences. Here we try to
quantify the relative contributions of different influences on
attentional allocation in dynamic scenes, as well as examine how they
change over time. Methods: In order to manipulate the availability of
top-down influences on attentional allocation, heterogeneous video
clips were cut into clippets (M=2s), which were scrambled and
re-assembled into MTV-style clips. Two groups of 8 subjects each were
instructed to ``follow the main actors and actions.'' One group viewed
the original stimuli while the other group viewed the MTV-style
clips. Eye positions were recorded using an ISCAN eye-tracker (240Hz,
yielding a total of more than a million samples for each group), and
segmented into saccades, blinks, and fixation/smooth pursuit
periods. A saliency-based model of attention capture (Itti & Koch
2000) was used to probe the relative contribution of bottom-up
influences on attentional allocation based on a novel performance
metric - Chance-Adjusted Saliency Accumometric (CASA). CASA values
were computed based on the weighted sum of differences between
normalized saliency at human vs. random saccade targets. Results:
Total CASA based on the full saliency model was 6 percent higher in
the MTV group compared to the original group. In both original and MTV
groups, CASA based on either motion or flicker features alone was 95
percent of the CASA based on the full saliency model. CASA based on
either color, intensity, or orientation features alone was 66 percent
of the full model CASA. Generally, CASA values for earlier saccades
after stimulus onset (clip or clippet start) were higher than for
later saccades, but tapered off and fluctuated around a fairly high
value after the first several saccades. Conclusions: The 6 percent
CASA difference between the original and MTV groups shows that
eliminating visual context beyond the first 2s of viewing barely
increased the overall relative weight of bottom-up influences on
attentional allocation. Our results imply that the relative weight of
top-down influences on attentional allocation in dynamic scenes does
not increase with viewing time (beyond the first 2s). We also found
that either motion or flicker are 150 percent stronger than either
color, intensity, or orientation as bottom-up attractors of
attention.},
  type      = {bu;td;psy;eye},
  review    = {abs/conf}
}

@inproceedings{Carmi_Itti04vss,
  author    = {R. Carmi and L. Itti},
  title     = {Bottom-up and top-down influences on attentional allocation in
natural dynamic scenes},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS04)},
  pages     = {20},
  month     = {May},
  year      = {2004},
  abstract  = {Motivation: Attentional allocation is often studied by
isolating a small subset of bottom-up or top-down influences in
highly-controlled environments. Other studies provide phenomenological
descriptions of overt attention in highly-complex environments (e.g.,
Land & McLeod 2000, Nat. Neuro.). The present study attempts to find
the middle ground by investigating the temporal interplay between
bottom-up and top-down influences on attentional allocation in dynamic
natural scenes. Methods: A set of heterogeneous video clips was cut
into clippets (M=2s), which were scrambled and reassembled into
MTV-style clips. Subjects were instructed to ``follow the main actors
and actions.'' Eye positions were recorded using an eye-tracker,
yielding a total of 1.35 million samples, which were segmented into
saccades, blinks, and fixation/smooth pursuit periods. Saccade target
selection was compared to predictions made by a computational model of
saliency-based attention capture (Itti & Koch 2000,
Vis. Res.). Results were compared to those of a twin experiment that
employed the same methodology, but used the unscrambled clips as
stimuli. Results: The ratio average/max saliency calculated by the
model at human saccade targets was 0.47 compared with 0.25 at random
saccade targets. The null hypothesis that human and random samples
were drawn from the same distribution was rejected by sign and
Wilcoxon tests (p<10E-10 for both). Identical results were obtained
for both the scrambled and unscrambled stimuli. Conclusions:
Eliminating visual context effects beyond the first 2s by temporally
scrambling dynamic natural scenes did not increase the relative weight
of bottom-up influences on attentional allocation. A possible
interpretation of our results is that the relative weight of top-down
influences on attentional allocation in natural viewing conditions
does not change with presentation time (at least beyond the first
2s).},
  type      = {bu ; eye},
  review    = {abs/conf}
}

@inproceedings{Navalpakkam_etal04vss,
  author    = {V. Navalpakkam and J. Rebesco and L. Itti},
  title     = {Modeling the influence of knowledge of the target and distractors on
visual search},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS04)},
  pages     = {178},
  month     = {May},
  year      = {2004},
  abstract  = {We investigate how bottom-up features such as color,
intensity, and orientation at different spatial scales may be biased
in a top-down manner, so as to promote the detection of a known target
and suppress the interference from known distractors. Using eye
tracking during visual search, we probed the extent to which our
visual system promotes the target's features or suppresses the
distractors' features. Three subjects searched for a known target (an
upright 'L') among three known distractor types - SAME, MORE and
NEW. SAME distractors had the same amount of features as the target,
i.e., the target was rotated clockwise by 90 or 180 degrees. MORE
distractors had a greater amount of the target's features. NEW
distractors had a new feature that was absent in the target, in
addition to the target's features. Each subject performed 240 trials
containing 25 items (one target and eight items of each distractor
type) appearing at random locations. Analysis of the captured eye
movement data consistently showed a higher number of fixations on the
SAME distractors than on the MORE and NEW, for all subjects. This
suggests that the horizontal and the vertical features were suppressed
mildly, followed by strong suppression of the diagonal feature,
resulting in maximum suppression of NEW distractors, and least
suppression of SAME distractors. Thus, the exact amount of promotion
or suppression of a feature seems to be related to the difference in
its response to the target and the distractor, i.e., if a feature
responds stronger to the target than the distractor, it is promoted;
else, if it responds equally to both, it is not promoted; and if it
responds stronger to the distractor than the target, it is
suppressed. In conclusion, this study suggests a computational
mechanism for feature biasing based on knowledge of the target and
distractors, which can yield useful predictions in single unit
recordings, and psychophysics experiments.},
  type      = {bu ; td ; eye},
  review    = {abs/conf}
}

% NOTE(review): pages is 20 here and also in Carmi_Itti04vss from the same
% meeting; possibly a copy-paste slip - verify against the VSS04 abstract book.
@inproceedings{Lu_Itti04vss,
  author    = {J. Lu and L. Itti},
  title     = {Perceptual consequences of feature-based attention},
  booktitle = {Proc. Vision Science Society Annual Meeting (VSS04)},
  pages     = {20},
  month     = {May},
  year      = {2004},
  abstract  = {Feature-based attention has been shown to enhance the
representation of attended visual attributes throughout the visual
field. We investigate the consequences of feature-based attention onto
visual perception, using dual-task human psychophysics. Subjects
simultaneously performed two pattern discriminations on drifting
Gabors presented bilaterally to fixation: the first task presumably
triggered a first set of feature-based attention effects; the second
task triggered a second set and, of interest here, possibly benefited
from the first set. Stimuli were horizontal (H) or vertical (V),
drifting slowly (S) or faster (F). Tasks were orientation (O) or drift
speed (D) discriminations, using a dual two-interval forced-choice
paradigm. We measured dual-task discrimination thresholds (75 percent
correct) with a dual staircase procedure for the 64 combinations:
primary side, left or right; tasks, O or D; stimuli, HS, VS, HF or
VF. We define stimulus orientation as relevant to the orientation
discrimination task while drift speed is irrelevant, and conversely,
then focus on how primary tasks modulated secondary thresholds. Under
same-task conditions, feature-based attention benefited secondary
thresholds when stimuli shared task-relevant features, but did not
benefit further from additionally sharing task-irrelevant features;
however, when no task-relevant feature was shared, a small benefit was
observed from sharing a task-irrelevant feature. When tasks differed,
secondary thresholds were improved equally as soon as any relevant or
irrelevant feature was shared, compared to when nothing was
shared. Our systematic study suggests a model with three components:
engaging attention onto a stimulus enhances task-relevant features
strongly and task-irrelevant features weakly; performance at a pattern
discrimination benefits strongly from task-relevant features and
weakly from irrelevant features; feature enhancement and feature
benefit combine multiplicatively, with the final benefit dictated by
the largest of those products.},
  type      = {psy ; td},
  review    = {abs/conf}
}

% NOTE(review): the abstract sentence "Even if you fooled a virtual human's
% features" reads as garbled; text kept as recorded - check the source article.
@press{Patch04trn,
  author   = {K. Patch},
  title    = {Model keeps virtual eyes right},
  journal  = {Technology Research News},
  month    = {Mar},
  year     = {2004},
  abstract = {You can always tell by the eyes. Even if you fooled a
virtual human's features, his gaze still makes it apparent he's not
for real. Where a person looks turns out to be an issue of instinct
and reflex rather than intelligence. A computer model aims to give
virtual people the visual wiring of real primates.},
  type     = {cv;eye}
}

@article{Itti04nme,
  author   = {L. Itti},
  title    = {The iLab Neuromorphic Vision C++ Toolkit: Free tools for the
next generation of vision algorithms},
  journal  = {The Neuromorphic Engineer},
  volume   = {1},
  number   = {1},
  pages    = {10},
  month    = {Mar},
  year     = {2004},
  abstract = {Because of its truly interdisciplinary nature benefiting
from the latest advances in experimental and computational
neuroscience, electrical engineering, control theory, and signal and
image processing, neuromorphic engineering is a very complex
field. This has been one of the leading motivations for the
development of a Neuromorphic Vision Toolkit at the iLab of the
University of Southern California to provide a set of basic tools that
can assist newcomers in the field with the development of new models
and systems. More generally, the iLab Neuromorphic Vision C++ Toolkit
project aims at developing the next generation of vision algorithms,
the architecture of which will closely mimic the neurobiology of the
primate brain rather than being specifically developed for a given set
of environmental conditions or tasks. To this end, it provides a
software foundation that is specifically geared towards the
development of neuromorphic models and systems.},
  type     = {cv},
  file     = { http://iLab.usc.edu/publications/doc/Itti04nme.pdf }
}

% NOTE(review): no pages field is recorded here, unlike the sibling entry
% Itti04hvei from the same SPIE volume - TODO add once confirmed.
@inproceedings{Mundhenk_etal04hvei,
  author    = {T. N. Mundhenk and V. Navalpakkam and H. Makaliwe and
S. Vasudevan and L. Itti},
  title     = {Biologically inspired feature based categorization of objects},
  booktitle = {Proc. SPIE Human Vision and Electronic Imaging IX
(HVEI04), San Jose, CA},
  editor    = {B. Rogowitz and T. N. Pappas},
  volume    = {5292},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  month     = {Jan},
  year      = {2004},
  abstract  = {We have developed a method for clustering features into
objects by taking those features which include intensity, orientations
and colors from the most salient points in an image as determined by
our biologically motivated saliency program.  We can train a program
to cluster these features by only supplying as training input the
number of objects that should appear in an image. We do this by
clustering from a technique that involves linking nodes in a minimum
spanning tree by not only distance, but by a density metric as
well. We can then form classes over objects or object segmentation in
a novel validation set by training over a set of seven soft and hard
parameters. We discuss as well the uses of such a flexible method in
landmark based navigation since a robot using such a method may have a
better ability to generalize over the features and objects.},
  type      = {mod;bu;cv},
  file      = { http://iLab.usc.edu/publications/doc/Mundhenk_etal04hvei.pdf },
  review    = {abs/conf}
}

@inproceedings{Itti04hvei,
  author    = { L. Itti },
  title     = { Automatic Attention-Based Prioritization of Unconstrained
Video for Compression},
  booktitle = { Proc. SPIE Human Vision and Electronic Imaging IX
(HVEI04), San Jose, CA },
  editor    = {B. Rogowitz and T. N. Pappas},
  volume    = {5292},
  pages     = {272-283},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  month     = {Jan},
  year      = {2004},
  abstract  = {We apply a biologically-motivated algorithm that selects
visually-salient regions of interest in video streams to
multiply-foveated video compression.  Regions of high encoding
priority are selected based on nonlinear integration of low-level
visual cues, mimicking processing in primate occipital and posterior
parietal cortex.  A dynamic foveation filter then blurs (foveates)
every frame, increasingly with distance from high-priority regions.
Sixty-three variants of the algorithm with different parameter
settings are evaluated against an outdoor video scene, using MPEG-1
and MPEG-4, yielding compression ratios of 1.1 to 8.5.  Two variants
(one with continuously-variable blur proportional to saliency at every
pixel, and the other with blur proportional to distance from three
independent foveation centers) are validated against eye fixations
from 4-6 human observers on 50 video clips (synthetic stimuli, video
games, outdoors day and night home video, television newscast, sports,
talk-shows, etc).  Significant overlap is found between human and
algorithmic foveations on every clip with one variant, and on 48 out
of 50 clips with the other. Substantial compressed file size
reductions by a factor 0.5 on average are obtained for foveated
compared to unfoveated clips. These results suggest a general-purpose
usefulness of the algorithm in improving compression ratios of
unconstrained video.},
  type      = { mod;bu;cv },
  file      = { http://iLab.usc.edu/publications/doc/Itti04hvei.pdf },
  review    = {abs/conf}
}

@press{Kincade03lfw,
  author  = {K. Kincade},
  title   = {For the modern-day robot, seeing is nearing believing},
  journal = {Laser Focus World},
  month   = {Nov},
  year    = {2003},
  type    = {cv;bb}
}


@invited{Itti03cnse,
  author    = {L. Itti},
  title     = {From Attention to Scene Understanding?},
  booktitle = {Caltech Center for Neuromorphic Systems Engineering Industry Day,
Pasadena, California},
  month     = {Nov},
  year      = {2003},
  type      = {mod;bu;cv}
}


% NOTE(review): keywords and note are empty placeholders, and address carries
% an author affiliation rather than a publisher city; all three are kept as
% recorded - confirm whether the HTML generator relies on them.
@article{Itti_etal03an,
  author   = {E. Itti and I. T. Gaw Gonzalo and K. B. Boone and D. H. Geschwind
and N. Berman and A. Pawlikowska-Haddal and L. Itti and F. S. Mishkin and
R. S. Swerdloff},
  title    = {Functional neuroimaging provides evidence of anomalous cerebral
laterality in adults with Klinefelter's syndrome},
  journal  = {Annals of Neurology},
  volume   = {54},
  number   = {5},
  pages    = {669-673},
  month    = {Nov},
  year     = {2003},
  abstract = {This study aimed to characterize cerebral perfusion in men with
Klinefelter's syndrome, known to present specific deficits in language,
using (99m)Tc- hexamethylpropylene-amine-oxime scintigraphy and Talairach
normalization. While a perfusion asymmetry toward the left hemisphere was
found in controls, perfusion was mostly symmetrical in Klinefelter patients
in the upper temporal and lower parietal areas. Scores on verbal tests were
inversely correlated with perfusion changes, providing neurobiological
substrate of anomalous cerebral laterality.},
  keywords = {},
  address  = {Department of Nuclear Medicine, Harbor-UCLA Medical Center, Research
and Education Institute, Torrance, CA.},
  note     = {},
  type     = { med },
  file     = { http://iLab.usc.edu/publications/doc/Itti_etal03an.pdf },
  if       = {2002 impact factor: 8.603}
}


@inproceedings{Navalpakkam_Itti03sfn,
  author    = { V. Navalpakkam and L. Itti },
  title     = { Towards a Unified Model for Attention and Recognition},
  abstract  = {Primate vision recruits at least three main components, namely,
    bottom-up attention, top-down attention and object recognition. The
    first component is used to direct gaze towards visually salient
    locations in a scene, while the second is used to direct gaze towards
    the task-relevant targets, and the third is used to recognize
    objects. Most modelling efforts treat these problems independently,
    and consequently, they use different sets of low level features and
    distinct computational strategies. Our concern against such approaches
    is related to the constraints on the available resources: how can one
    visual system, such as ours, accommodate multiple low level visual
    subsystems? This motivates us to investigate whether and how
    bottom-up attention, top-down attention and object recognition may
    share resources and be related. Towards this end, we designed,
    implemented and tested our UNARE model for UNified Attention and
    REcognition. Our model uses the same low level features to find
    visually salient objects, to learn object representations, to detect
    task-relevant target objects, and, to recognize them. We achieve
    top-down attention by biasing the bottom-up attentional system with
    the learned target representation. To recognize the object at the
    current attended location, we match its features against the learned
    representations. We tested our model on 343 images that ranged from
    artificial images of geometrical objects to natural images containing
    objects such as soda cans, and various signs in diverse
    backgrounds. On an average, our model significantly accelerated
    attention towards the target and detected it 1.8-16.4 times faster
    than the bottom-up attention model even in scenes with poor resolution
    or significant noise or clutter and complex backgrounds. There were
    few false positives (0-10 percent) and false negatives (0-21 percent) in the
    recognition. Overall, the performance of our model was remarkable
    given its simplicity, and seems to suggest that bottom-up attention,
    top-down attention and object recognition may share resources
    extensively and be intimately related.},
  month     = { Nov },
  year      = {2003},
  booktitle = { Proc. Society for Neuroscience Annual
    Meeting (SFN'03)},
  type      = { mod;psy;bu;sc },
  review    = {abs/conf}
}

@inproceedings{Chen_etal03sfn,
  author    = { V. J. Chen and S. G. Erberich and L. Itti
    and M. A. Arbib},
  title     = { Perception of Consequences of Actions: An fMRI Study},
  booktitle = { Proc. Society for Neuroscience Annual
    Meeting (SFN'03)},
  month     = { Nov },
  year      = {2003},
  abstract  = {Causal relationships among events are pervasive in the
    environment. The detection and perception of causal relationships is
    fundamental for successful interaction with the world around
    us. Moreover, keeping track of the events caused by actions performed
    by oneself, as differentiated from external causes, is critical for
    the maintenance of a consistent perspective. Previous fMRI studies
    designed to investigate the perception of mechanical causality (e.g.,
    a ball collides with another and causes it to move) found elevated
    activation in motion areas, superior temporal sulcus, and left
    parietal areas (Blakemore et al, 2001). No influence of attention was
    found. Results were interpreted as supporting the view that the
    perception of elementary mechanical causality events is automatically
    processed by early low-level mechanisms not influenced by top-down
    processes (Michotte, 1946; Schlottman and Shanks, 1992). However,
    planning of motor activities crucially involves expectation of
    consequences, whether these are consequences of one's own or other's
    actions. This fMRI study focuses on identifying regions of the brain
    that process causality as seen in consequences of actions, looking for
    commonalities and differences across modalities. We use a visual
    display in which changes can either be consequent (causal) upon an
    action by the subject (self-cause) or by an unknown external factor
    (external-cause), in contrast to conditions in which changes of the
    visual stimuli are not perceived as consequence of actions
    (non-causal). Contrasting the causal conditions against the non-causal
    conditions, results show significant difference in activities in the
    cerebellum, Brodmann's areas, and the frontal lobe, but not in early
    sensory, including motion, areas.},
  file      = { http://iLab.usc.edu/publications/doc/Chen_etal03sfn.pdf },
  type      = { psy;fmri },
  review    = {abs/conf}
}

@inproceedings{Peters_etal03sfn,
  author    = { R. J. Peters and T. N. Mundhenk and
    L. Itti and C. Koch},
  title     = { Contour-facilitation in a model of bottom-up
    attention },
  abstract  = {Previously we showed that interactions among overlapping
    orientation-tuned units could improve a bottom-up attention model in
    predicting human eye movement targets. We have now extended this work
    to address the question of how elongated contours affect saliency in
    natural scenes. We used a model of contour-facilitation based on
    putative long-range excitatory and inhibitory interactions among
    orientation-tuned units in early visual cortex. Each unit tends to
    excite other units that are nearly collinear, and inhibit those that
    are nearly parallel. We tested the model on artificial images such as
    arrays of Gabor patches with embedded implicit contours ('snakes'), as
    well as natural images such as outdoor photos and overhead satellite
    photos. Our results agree with previous psychophysical measurements of
    human observers' sensitivity to implicit contours such as Gabor
    snakes; we found that a basic bottom-up saliency model was completely
    blind to such contours, while an enhanced saliency model with
    contour-facilitation module could consistently identify the embedded
    contour (left figure) as the most salient element in the image (right
    figure). Preliminary eyetracking results suggest that observers are
    less sensitive to high spatial-frequency contours in natural scenes.},
  month     = { Nov },
  year      = {2003},
  booktitle = { Proc. Society for Neuroscience Annual
    Meeting (SFN'03)},
  type      = { mod;psy;bu;eye },
  review    = {abs/conf}
}

@invited{Itti03nima,
  author    = {L. Itti},
  title     = {Learning Higher-Order Perceptual Saliency of Monochromatic Images:
    Psychophysics and Computational Modeling},
  booktitle = {NIMA Neuroscience-Enabled Computer Vision Workshop,
    Washington, D.C.},
  year      = {2003},
  month     = {Oct},
  type      = {mod;bu;psy}
}

@invited{Itti03cns,
  author    = {L. Itti},
  title     = {Contribution of Low-Level Saliency to Human Eye Movements},
  booktitle = {Caltech Computation and Neural Systems Seminar,
    Pasadena, California},
  year      = {2003},
  month     = {Oct},
  type      = {mod;bu;psy}
}

@inproceedings{Chen_etal03fvm,
  author    = {V. J. Chen and J. F. Barraza and L. Itti},
  title     = {Perception of contours defined by integrative motion mechanisms},
  abstract  = {We studied the performance of human subjects for velocity
    discrimination in six varieties of motion stimuli. Motion was defined
    by dynamic modulation of luminance, contrast, or color of randomly
    placed, sparse small dots on a homogeneous background. Importantly, in
    each and every display used in this study, no dots ever moved relative
    to the background. For each of the modulation schemes, two varieties of
    displays are generated: one with static dots and the other with
    dynamic dots (popping up and disappearing at different locations in
    each frame of the movie, but never translated), resulting in six
    varieties of displays. Average luminance stayed constant across
    regions in the contrast and color modulation displays. Two classes of
    velocity discrimination thresholds were found. The higher threshold
    was approximately ten times that of the lower one. Interestingly, the
    higher threshold was consistently associated with the absence of the
    perception of well-defined contours in the motion stimuli. Moreover,
    keeping the average luminance constant throughout the stimuli did not
    necessarily produce high discrimination thresholds. Contours, real or
    subjective, appeared to be critical for the estimate of motion
    parameters such as velocity. To further investigate the link between
    the perception of contour and the perception of motion, vernier acuity
    tests were performed on each of the six varieties of displays. Results
    further supported the existence of motion-based mechanisms that
    integrate spatio-temporally coherent, but dispersed local changes over
    large regions that ultimately produce the perception of contours and
    surfaces.},
  booktitle = {Proc. Fall Vision Meeting},
  month     = {Oct},
  year      = {2003},
  type      = {psy},
  review    = {abs/conf}
}

@inproceedings{Mundhenk_etal03spie1,
  author    = {T. N. Mundhenk and C. Ackerman and D. Chung and N. Dhavale and
    B. Hudson and R. Hirata and E. Pichon and Z. Shi and A. Tsui and L.
    Itti},
  title     = {Low-cost high-performance mobile robot design utilizing
    off-the-shelf parts and the Beowulf concept: the Beobot project},
  booktitle = {Proc. SPIE Conference on Intelligent Robots and
    Computer Vision XXI: Algorithms, Techniques, and Active Vision},
  pages     = {293-303},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  month     = {Oct},
  year      = {2003},
  abstract  = {Utilizing off the shelf low cost parts, we have constructed
    a robot that is small, light, powerful and relatively inexpensive
    (less than USD 3900). The system is constructed around the Beowulf
    concept of linking multiple discrete computing units into a single
    cooperative system. The goal of this project is to demonstrate a new
    robotics platform with sufficient computing resources to run
    biologically-inspired vision algorithms in real-time. This is
    accomplished by connecting two dual-CPU embedded PC motherboards using
    fast gigabit Ethernet. The motherboards contain integrated Firewire,
    USB and serial connections to handle camera, servomotor, GPS and other
    miscellaneous inputs/outputs. Computing systems are mounted on a
    servomechanism-controlled off-the-shelf Off Road RC car. Using the
    high performance characteristics of the car, the robot can attain
    relatively high speeds outdoors. The robot is used as a test platform
    for biologically-inspired as well as traditional robotic algorithms,
    in outdoor navigation and exploration activities. Leader following
    using multi blob tracking and segmentation, and navigation using
    statistical information and decision inference from image spectral
    information are discussed. The design of the robot is open-source and
    is constructed in a manner that enhances ease of replication. This is
    done to facilitate construction and development of mobile robots at
    research institutions where large financial resources may not be
    readily available as well as to put robots into the hands of hobbyists
    and help lead to the next stage in the evolution of robotics, a home
    hobby robot with potential real world applications.},
  type      = { bu;bb;mod;cv },
  file      = { http://iLab.usc.edu/publications/doc/Mundhenk_etal03spie1.pdf },
  review    = {abs/conf}
}

@inproceedings{Mundhenk_etal03spie2,
  author    = {T. N. Mundhenk and N. Dhavale and S. Marmol and E. Calleja
    and V. Navalpakkam and K. Bellman and C. Landauer and M. A. Arbib
    and L. Itti},
  title     = {Utilization and viability of biologically-inspired
    algorithms in a dynamic multi-agent camera surveillance system},
  booktitle = {Proc. SPIE Conference on Intelligent Robots and
    Computer Vision XXI: Algorithms, Techniques, and Active Vision},
  abstract  = {In view of the growing complexity of computational tasks and
    their design, we propose that certain interactive systems may be
    better designed by utilizing computational strategies based on the
    study of the human brain. Compared with current engineering paradigms,
    brain theory offers the promise of improved self-organization and
    adaptation to the current environment, freeing the programmer from
    having to address those issues in a procedural manner when designing
    and implementing large-scale complex systems. To advance this
    hypothesis, we discuss a multi-agent surveillance system where 12 agent
    CPUs each with its own camera, compete and cooperate to monitor a
    large room. To cope with the overload of image data streaming from 12
    cameras, we take inspiration from the primate's visual system, which
    allows the animal to operate a real-time selection of the few most
    conspicuous locations in visual input. This is accomplished by having
    each camera agent utilize the bottom-up, saliency-based visual
    attention algorithm of Itti and Koch (Vision Research
    2000;40(10-12):1489-1506) to scan the scene for objects of
    interest. Real time operation is achieved using a distributed version
    that runs on a 16-CPU Beowulf cluster composed of the agent
    computers. The algorithm guides cameras to track and monitor salient
    objects based on maps of color, orientation, intensity, and motion. To
    spread camera view points or create cooperation in monitoring highly
    salient targets, camera agents bias each other by increasing or
    decreasing the weight of different feature vectors in other cameras,
    using mechanisms similar to excitation and suppression that have been
    documented in electrophysiology, psychophysics and imaging studies of
    low-level visual processing. In addition, if cameras need to compete
    for computing resources, allocation of computational time is weighed
    based upon the history of each camera. A camera agent that has a
    history of seeing more salient targets is more likely to obtain
    computational resources. The system demonstrates the viability of
    biologically inspired systems in a real time tracking. In future work
    we plan on implementing additional biological mechanisms for
    cooperative management of both the sensor and processing resources in
    this system that include top down biasing for target specificity as
    well as novelty and the activity of the tracked object in relation to
    sensitive features of the environment.},
  month     = {Oct},
  year      = {2003},
  pages     = {281-292},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  type      = { bu;mod;cv },
  file      = { http://iLab.usc.edu/publications/doc/Mundhenk_etal03spie2.pdf },
  review    = {abs/conf}
}

@invited{Itti03rap,
  author    = {L. Itti},
  title     = {Attention-Based Video Compression},
  booktitle = {USC Research Activities Presentation Day,
    Los Angeles, California},
  year      = {2003},
  month     = {Aug},
  type      = {mod;bu;cv}
}

@inproceedings{Itti_etal03spienn,
  author    = {L. Itti and N. Dhavale and F. Pighin},
  title     = {Realistic Avatar Eye and Head Animation Using a
    Neurobiological Model of Visual Attention},
  booktitle = { Proc. SPIE 48th Annual International Symposium on
    Optical Science and Technology },
  editor    = {B. Bosacchi and D. B. Fogel and J. C. Bezdek},
  volume    = {5200},
  pages     = {64-78},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  month     = {Aug},
  year      = {2003},
  abstract  = { We describe a neurobiological model of visual attention and
    eye/head movements in primates, and its application to the automatic
    animation of a realistic virtual human head watching an unconstrained
    variety of visual inputs. The bottom-up (image-based) attention model
    is based on the known neurophysiology of visual processing along the
    occipito-parietal pathway of the primate brain, while the eye/head
    movement model is derived from recordings in freely behaving Rhesus
    monkeys. The system is successful at autonomously saccading towards
    and tracking salient targets in a variety of video clips, including
    synthetic stimuli, real outdoors scenes and gaming console outputs.
    The resulting virtual human eye/head animation yields realistic
    rendering of the simulation results, both suggesting applicability of
    this approach to avatar animation and reinforcing the plausibility of
    the neural model.},
  type      = { bu;mod;cv },
  file      = { http://iLab.usc.edu/publications/doc/Itti_etal03spienn.pdf },
  review    = {abs/conf}
}

@invited{Itti03tell,
  author    = {L. Itti},
  title     = {Bottom-Up and Top-Down Guidance of Visual Attention in
    Natural Dynamic Scenes},
  booktitle = {Telluride Neuromorphic Engineering Workshop,
    Telluride, Colorado},
  year      = {2003},
  month     = {Jul},
  type      = {mod;bu;td;psy}
}

@invited{Itti03svs,
  author    = {L. Itti},
  title     = {Modeling Saliency and Task Influences on Visual Search},
  booktitle = {Symposium on Visual Search, Munich, Germany},
  year      = {2003},
  month     = {Jun},
  type      = {mod;bu;td;psy;eye}
}

@inproceedings{Lu_Itti03jsnc,
  author    = {J. Lu and L. Itti},
  title     = {Feature-based Attention is Task-Dependent},
  booktitle = {Proc. 10th Joint Symposium on Neural Computation (JSNC'03),
    Irvine, California},
  year      = {2003},
  month     = {May},
  type      = {td;psy;eye},
  review    = {abs/conf}
}

@inproceedings{Dhavale_Itti03isspa,
  author    = {N. Dhavale and L. Itti},
  title     = {Saliency-Based Multi-Foveated MPEG Compression},
  booktitle = {Proc. IEEE Seventh International Symposium on Signal Processing
    and its Applications, Paris, France},
  pages     = {229-232},
  month     = {Jul},
  year      = {2003},
  abstract  = {Most current foveation strategies are limited to foveating sequences
    based on a direct measurement or an implicit assumption of the gaze
    direction. Such approaches often fail in unconstrained environments or
    when necessary equipment is absent. Alternatively, a computational
    model of visual attention may be used to predict visually salient
    locations. We describe such a neurobiological model of attention and
    its specific application to foveated video compression. The algorithm
    is demonstrated to be successful in foveating to Regions Of human
    Interest in a variety of video segments, including synthetic as well
    as natural scenes, and also gives good compression ratios.},
  type      = {mod;bu;cv},
  file      = { http://iLab.usc.edu/publications/doc/Dhavale_Itti03isspa.pdf },
  review    = {full/conf}
}

@inproceedings{Navalpakkam_Itti03wapcv,
  author    = {V. Navalpakkam and L. Itti},
  title     = {Sharing Resources: Buy Attention, Get Recognition},
  abstract  = {Inspired by nature's policy of sharing resources, we have
    enhanced our attention model with minimal extra hardware to enable the
    twin powers of object detection and recognition. With just the
    elementary information available at the preattentive stage in the form
    of low-level feature maps tuned to color, intensity and orientation,
    our model learns representations of objects in diverse, complex
    backgrounds. The representation starts with simple vectors of
    low-level feature values computed at one location centered on a given
    view of object. We then recursively combine views to form instances,
    in turn combined into simple objects, composite objects, and so on,
    taking into account feature values and their variance. Given any new
    scene, our model uses the learnt representation of the target object
    to perform top-down biasing on the attention system such as to render
    this object more salient by enhancing those features which are
    characteristic of the object. Experimental results indicate that our
    enhanced model is 5-20 times faster at detecting targets using this
    biasing than when no feature is enhanced. Our model is also able to
    recognize a wide variety of objects ranging from simple geometrical
    objects to complex objects such as soda cans, handicap signs, and many
    others under noisy conditions. There are few false negatives and false
    positives. The good performance of our lightweight model suggests that
    the human visual system may indeed be sharing resources extensively
    and attention and object recognition may be so intimately related
    that if we buy attention, we might get the other for free!  },
  booktitle = {Proc. International Workshop on Attention and Performance
    in Computer Vision (WAPCV'03), Graz, Austria},
  year      = {2003},
  month     = {Jul},
  type      = {mod;bu;td;cv;sc},
  file      = { http://iLab.usc.edu/publications/doc/Navalpakkam_Itti03wapcv.pdf },
  review    = {full/wkshp}
}

@article{Mundhenk_Itti02cns,
  author   = {T. N. Mundhenk and L. Itti},
  title    = {CINNIC, a new computational algorithm for the modeling of early
    visual contour integration in humans},
  journal  = {Neurocomputing},
  volume   = {52-54},
  pages    = {599-604},
  month    = {Jun},
  year     = {2003},
  abstract = {We have developed a computational model called CINNIC to
    simulate contour integration and visual salience in early visual
    processing in the human brain. Our model uses the standard butterfly
    pattern of connections between early orientation selective neurons,
    which are believed to mediate interactions along contour
    elements. However, we add multi scale analysis, adaptive neuron group
    suppression and fast plasticity of connection weights to increase the
    performance of our algorithm. We show quantitatively that the addition
    of these ideas helps in the detection of salient contours. We also
    submit that our algorithm is biologically plausible and falls in line
    with what is known about neuron connections and interactions in V1 and
    possibly V2. },
  type     = {mod;bu},
  file     = { http://iLab.usc.edu/publications/doc/Mundhenk_Itti02cns.pdf },
  if       = {2001 impact factor: 0.534}
}

@misc{Tsui03NRNcover,
  author   = {A. Tsui},
  title    = {Nature Reviews Neuroscience Cover Artwork},
  volume   = {4},
  number   = {4},
  pages    = {235},
  month    = {Apr},
  year     = {2003},
  note     = {Cover artwork and artist bio},
  abstract = {April 2003 vol 4 no 4. 'Clear thinking.' April Tsui is
    holding a piece of aerogel, certified as the lightest solid on Earth
    by the Guinness Book of World Records. Used in NASA's Stardust mission
    to capture cosmic particles for analysis on Earth, aerogel is 1000
    times less dense than glass, being composed of 99.8 percent air. In
    her collaboration with Jet Propulsion Lab (JPL), April is
    experimenting with marrying non-traditional materials with machining
    and rapid prototyping methods. The cover image is a brain laser-etched
    on aerogel.  April studied advertising at the University of Texas at
    Austin (B.Sc., 1998) and worked as an art director at an advertising
    agency (1998-2000). She has been a graduate industrial design student,
    focusing on product and environmental design, at the Art Center
    College of Design (Pasadena, California) since 2001. This program
    encourages students to explore their creative process through
    traditional and scientific understanding of thought and creation.
    Lecturers from the California Institute of Technology and JPL enrich
    the program with classes on the scientific foundations of brain
    evolution, neuroscience, and aerospace research.  One of April's
    projects is currently exhibited in The Art of Thought (Los Angeles), a
    show curated by K. Abeles. This exhibition celebrates the processes of
    thought, through displays that range from brain scans to expressions
    of creative thinking. April's 'Personal Museum' celebrates the human
    body with an entire collection of miniature radiological scans
    embedded in the thickness of five metre-long acrylic bars. The
    refraction of the embedded images reveals the exhibition only at
    specific angles, requiring a viewer's interactive exploration of the
    museum piece.  April is also working with USC's computational
    neuroscience laboratory. In particular, she is designing and modelling
    the mechanical components of iLab's Beobot autonomous mobile robots.
    Her latest project is an interactive aerogel exhibition at the
    California Science Center (Los Angeles).},
  type     = { },
  file     = { http://iLab.usc.edu/publications/doc/Tsui03NRNcover.pdf}
}

@incollection{Itti02cnca,
  author    = { L. Itti },
  title     = { Modeling Primate Visual Attention },
  booktitle = { Computational Neuroscience: A Comprehensive Approach },
  editor    = {J. Feng},
  pages     = {635-655},
  publisher = {CRC Press},
  address   = {Boca Raton},
  year      = {2003},
  abstract  = {Selective visual attention is the mechanism by which we can
    rapidly direct our gaze towards objects of interest in our visual
    environment.  From an evolutionary viewpoint, this rapid orienting
    capability is critical in allowing living systems to quickly become
    aware of possible preys, mates or predators in their cluttered visual
    world.  It has become clear that attention guides where to look next
    based on both bottom-up (image-based) and top-down (task-dependent)
    cues.  As such, attention implements an information processing
    bottleneck, only allowing a small part of the incoming sensory
    information to reach short-term memory and visual awareness.  That is,
    instead of attempting to fully process the massive sensory input in
    parallel, nature has devised a serial strategy to achieve near
    real-time performance despite limited computational capacity:
    Attention allows us to break down the problem of scene understanding
    into rapid series of computationally less demanding, localized visual
    analysis problems.  These orienting and scene analysis functions of
    attention are complemented by a feedback modulation of neural activity
    at the location and for the visual attributes of the desired or
    selected targets. This feedback is believed to be essential for
    binding the different visual attributes of an object, such as color
    and form, into a unitary percept. That is, attention not only serves
    to select a location of interest, but also enhances the cortical
    representation at that location. As such, focal visual attention is
    often compared to a rapidly shiftable spotlight, which scans our
    visual environment both overtly (with accompanying eye movements) or
    covertly (with the eyes fixed).  Finally, attention is involved in
    triggering behavior, and consequently is intimately related to
    recognition, planning and motor control. Of course, not all of vision
    is attentional, as we can derive coarse understanding from
    presentations of visual scenes that are too brief for attention to
    explore the scene. Vision thus relies on sophisticated interactions
    between coarse, massively parallel, full-field pre-attentive analysis
    systems and the more detailed, circumscribed and sequential
    attentional analysis system.  In what follows, we focus on several
    critical aspects of selective visual attention: First, the brain area
    involved in its control and deployment; second, the mechanisms by
    which attention is attracted in a bottom-up or image-based manner
    towards conspicuous or salient locations in our visual environment;
    third, the mechanisms by which attention modulates the early sensory
    representation of attended stimuli; fourth, the mechanisms for
    top-down or voluntary deployment of attention; and fifth, the
    interaction between attention, object recognition and scene
    understanding.  },
  file      = { http://iLab.usc.edu/publications/doc/Itti02cnca.pdf },
  type      = {mod;bu;td;psy;rev;sc}
}

@invited{Itti03nbic,
  author    = {L. Itti},
  title     = {Exploiting Visual Information Using Biologically-Inspired
    Architectures},
  booktitle = {NBIC Convergence 2003, Los Angeles, California},
  year      = {2003},
  month     = {Feb},
  type      = {mod;bu;cv}
}


@invited{Itti03vwsim,
  author    = {L. Itti},
  title     = {Controlling Sensornets with Realistic Brain Models},
  booktitle = {Proc. Virtual Worlds and Simulation Conference, Orlando, Florida},
  year      = {2003},
  month     = {Jan},
  type      = {mod;bu;cv}
}

@inproceedings{Itti03nips,
  author    = {L. Itti},
  title     = {The Beobot Platform for Embedded Real-Time Neuromorphic Vision},
  booktitle = { Advances in Neural Information Processing Systems, Vol.
    15, Hardware Demo Track },
  editor    = {T. G. Dietterich and S. Becker and Z. Ghahramani},
  publisher = { MIT Press },
  address   = { Cambridge, MA },
  year      = {2003},
  abstract  = {We demonstrate a new mobile robotics platform designed for the
    implementation and testing of neuromorphic vision algorithms in
    unconstrained outdoors environments. It is being developed by a team
    of undergraduate students with graduate supervision and help. Its
    distinctive features include significant computational power (four
    1.4GHz CPUs with gigabit interconnect), high-speed four-wheel-drive
    chassis, standard Linux operating system, and a comprehensive toolkit
    of C++ vision classes. The robot is designed with two major goals in
    mind: real-time operation of sophisticated neuromorphic vision
    algorithms, and off-the-shelf components to ensure rapid technological
    evolvability.  A preliminary embedded neuromorphic vision architecture
    that includes attentional, gist/layout, object recognition, and
    high-level decision subsystems is showcased (see
    http://iLab.usc.edu/beobots/ for additional information).},
  keywords  = { Robotics ; Neuromorphic Engineering ; Computational Modeling ;
    Visual Processing ; Visual Attention },
  type      = { mod;bu;bb;sc;cv },
  file      = { http://iLab.usc.edu/publications/doc/Itti03nips.pdf },
  review    = {abs/conf}
}

@inproceedings{Vijayakumar_etal03nips,
  author    = {S. Vijayakumar and A. D'Souza and J. Peters and J. Conradt and
    T. Rutkowski and A. Ijspeert and J. Nakanishi and M. Inoue and T. Shibata
    and A. Wiryo and L. Itti and S. Amari and S. Schaal},
  title     = { Real-Time Statistical Learning for Oculomotor Control and
    Visuomotor Coordination},
  booktitle = { Advances in Neural Information Processing Systems, Vol.
    15, Hardware Demo Track },
  editor    = {T. G. Dietterich and S. Becker and Z. Ghahramani},
  publisher = { MIT Press },
  address   = { Cambridge, MA },
  year      = {2003},
  type      = { mod;bu;bb;sc;cv },
  review    = {abs/conf}
}

@incollection{Itti03hbtnn2e,
  author    = { L. Itti },
  title     = { Visual Attention },
  booktitle = { The Handbook of Brain Theory and Neural Networks },
  editor    = {M. A. Arbib},
  pages     = {1196-1201},
  publisher = {MIT Press},
  month     = {Jan},
  year      = {2003},
  abstract  = {Selective visual attention is the mechanism by which we can
    rapidly direct our gaze towards objects of interest in our visual
    environment. From an evolutionary viewpoint, this rapid orienting
    capability is critical in allowing living systems to quickly become
    aware of possible preys, mates or predators in their cluttered visual
    world.  It has become clear that attention guides where to look next
    based on both bottom-up (image-based) and top-down (task-dependent)
    cues.  As such, attention implements an information processing
    bottleneck, only allowing a small part of the incoming sensory
    information to reach short-term memory and visual awareness.  That is,
    instead of attempting to fully process the massive sensory input in
    parallel, nature has devised a serial strategy to achieve near
    real-time performance despite limited computational capacity:
    Attention allows us to break down the problem of scene understanding
    into rapid series of computationally less demanding, localized visual
    analysis problems.  These orienting and scene analysis functions of
    attention are complemented by a feedback modulation of neural activity
    at the location and for the visual attributes of the desired or
    selected targets. This feedback is believed to be essential for
    binding the different visual attributes of an object, such as color
    and form, into a unitary percept. That is, attention not only serves
    to select a location of interest, but also enhances the cortical
    representation at that location. As such, focal visual attention is
    often compared to a rapidly shiftable spotlight, which scans our
    visual environment both overtly (with accompanying eye movements) or
    covertly (with the eyes fixed).  This spotlight has been shown to have
    variable size and shape depending on the target being attended to.
    Finally, attention is involved in triggering behavior, and
    consequently is intimately related to recognition, planning and motor
    control. Of course, not all of vision is attentional, as we can derive
    coarse understanding from presentations of visual scenes that are so
    brief that they do not leave time for attention to explore the
    scene. Vision thus appears to rely on sophisticated interactions
    between coarse, massively parallel, full-field pre-attentive analysis
    systems and the more detailed, circumscribed and sequential
    attentional analysis system.  In what follows, we focus on several
    critical aspects of selective visual attention: First, the brain area
    involved in its control and deployment; second, the mechanisms by
    which attention is attracted in a bottom-up or image-based manner
    towards conspicuous or salient locations in our visual environment;
    third, the mechanisms by which attention modulates the early sensory
    representation of attended stimuli; fourth, the mechanisms for
    top-down or voluntary deployment of attention towards visual locations
    that may not necessarily be intrinsically conspicuous, but may be of
    interest in solving a given visual task; and fifth, the interaction
    between attention, object recognition and scene understanding.},
  file      = { http://iLab.usc.edu/publications/doc/Itti02hbtnn2e.pdf },
  type      = {mod;bu;td;psy;rev;sc}
}

% SFN'02 conference abstract (review = abs/conf): eye movements and
% interactions among orientation channels in a bottom-up attention model.
@inproceedings{Peters_etal02sfn,
  author    = { R. J. Peters and L. Itti and C. Koch },
  title     = { Eye Movements Are Influenced by Short-Range 
Interactions Among Orientation Channels },
  booktitle = { Proc. Society for Neuroscience Annual 
Meeting (SFN'02)},
  pages     = {715.12},
  month     = { Nov },
  year      = 2002,
  type      = { mod;psy;bu },
  review    = {abs/conf},
  abstract  = {Recent research (Parkhurst et al., Vis. Res. 2002) showed
that a model of bottom-up visual attention can account in part for the
patterns of eye movements made by human observers while free-viewing
complex natural and artificial scenes. Using a similar method, we
tested an enhanced model with excitatory and inhibitory interactions
among units at overlapping locations, tuned to different spatial
scales and orientations, as inferred from previous psychophysical
experiments involving fine discrimination of gabor-like patches in the
periphery (Lee et al., Nat.  Neurosci. 1999). Subjects free-viewed
images (visual angle 25x20 degrees) from three databases (outdoor
photos, fractals, and overhead satellite photos) for 3000ms per
image. An infrared eyetracking system (ISCAN, Inc.) recorded eye
position at 120Hz with a spatial precision of 0.5 degrees. For each
image, we computed the mean model-predicted salience of the points
traversed by each subject's scanpath, and judged these values by their
z-score in a distribution obtained from random scanpaths of similar
length. Across all conditions, the z-scores ranged from 4-14,
confirming that in general, our model of bottom-up attention predicts
human eye movements with high statistical significance. Moreover, the
addition of interactions among oriented units with overlapping
receptive fields led to a robust increase in the z-scores, both
overall and for individual subjects and image databases. Thus, these
interactions, originally modeled after simple gabor-like stimuli
viewed under covert attention, also appear to contribute to subjects'
overt eye movements under more natural free-viewing conditions.}
}

% Invited talk: Neurology Grand Rounds, Harbor-UCLA Medical Center (Nov 2002).
@invited{Itti02humc,
  author    = {L. Itti},
  title     = {Bottom-Up and Top-Down Guidance of Visual Attention},
  booktitle = {Harbor-UCLA Medical Center, Neurology Grand Rounds,
Los Angeles, California},
  month     = {Nov},
  year      = {2002},
  type      = {mod;bu;td;psy}
}

% Full-paper-reviewed conference paper in LNCS vol. 2525: task-relevance
% guided attention model.
@inproceedings{Navalpakkam_Itti02bmcv,
  author    = {V. Navalpakkam and L. Itti},
  title     = {A Goal Oriented Attention Guidance Model},
  booktitle = {Lecture Notes in Computer Science},
  volume    = {2525},
  pages     = {453-461},
  month     = {Nov},
  year      = {2002},
  type      = {mod;bu;td;cv;sc},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Navalpakkam_Itti02bmcv.pdf },
  abstract  = {Previous experiments have shown that human attention is
influenced by high level task demands.  In this paper, we propose an
architecture to estimate the task-relevance of attended locations in a
scene.  We maintain a task graph and compute relevance of fixations
using an ontology that contains a description of worldly entities and
their relationships.  Our model guides attention according to a
topographic attention guidance map that encodes the bottom-up salience
and task-relevance of all locations in the scene. We have demonstrated
that our model detects entities that are salient and relevant to the
task even on natural cluttered scenes and arbitrary tasks.}
}

% Full-paper-reviewed conference paper in LNCS vol. 2525: contour
% integration algorithm.
@inproceedings{Mundhenk_Itti02bmcv,
  author    = {T. N. Mundhenk and L. Itti},
  title     = {A Model of Contour Integration in Early Visual Cortex},
  booktitle = {Lecture Notes in Computer Science},
  volume    = {2525},
  pages     = {80-89},
  month     = {Nov},
  year      = {2002},
  type      = {mod;bu;cv},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Mundhenk_Itti02bmcv.pdf },
  abstract  = { We have created an algorithm to integrate contour elements
and find the salience value of them. The algorithm consists of basic
long-range orientation specific neural connections as well as a group
suppression gain control and a fast plasticity term to explain
interaction beyond a neurons normal size range. Integration is
executed as a series of convolutions on 12 orientation filtered images
augmented by the nonlinear fast plasticity and group suppression
terms. Testing done on a large number of artificially generated Gabor
element contour images shows that the algorithm is effective at
finding contour elements within parameters similar to that of human
subjects. Testing of real world images yields reasonable results and
shows that the algorithm has strong potential for use as an addition
to our already existent vision saliency algorithm.}
}

% Full-paper-reviewed conference paper in LNCS vol. 2525: combined model of
% spatial attention and object recognition.
@inproceedings{Walther_etal02bmcv,
  author    = {D. Walther and L. Itti and M. Riesenhuber and T. Poggio and C. Koch},
  title     = {Attentional Selection for Object Recognition - a Gentle Way},
  booktitle = {Lecture Notes in Computer Science},
  volume    = {2525},
  pages     = {472-479},
  month     = {Nov},
  year      = {2002},
  type      = {mod;bu;cv},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Walther_etal02bmcv.pdf },
  abstract  = {Attentional selection of an object for recognition is often
modeled using all-or-nothing switching of neuronal connection pathways
from the attended region of the retinal input to the recognition
units. However, there is little physiological evidence for such
all-or-none modulation in early areas. We present a combined model
for spatial attention and object recognition in which the recognition
system monitors the entire visual field, but attentional modulation by
as little as 20 percent at a high level is sufficient to recognize
multiple objects. To determine the size and shape of the region to be
modulated, a rough segmentation is performed, based on preattentive
features already computed to guide attention. Testing with synthetic
and natural stimuli demonstrates that our new approach to attentional
selection for recognition yields encouraging results in addition to
being biologically plausible.}
}

% Full-paper-reviewed conference paper in LNCS vol. 2525: technical
% description of the Beobot mobile robotics platform.
@inproceedings{Chung_etal02bmcv,
  author    = {D. Chung and R. Hirata and T. N.  Mundhenk and J. Ng and
R. J. Peters and E.  Pichon and A. Tsui and T. Ventrice and D. Walther
and P. Williams and L. Itti},
  title     = {A New Robotics Platform for Neuromorphic Vision: Beobots},
  booktitle = {Lecture Notes in Computer Science},
  volume    = {2525},
  pages     = {558-566},
  month     = {Nov},
  year      = {2002},
  type      = {bb;mod;bu;cv},
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Chung_etal02bmcv.pdf },
  abstract  = {This paper is a technical description of a new mobile
robotics platform specifically designed for the implementation and
testing of neuromorphic vision algorithms in unconstrained outdoors
environments. The platform is being developed by a team of
undergraduate students with graduate supervision and help. Its
distinctive features include significant computational power (four
1.4GHz CPUs with gigabit interconnect), high-speed four-wheel-drive
chassis, standard Linux operating system, and a comprehensive toolkit
of C++ vision classes. The robot is designed with two major goals in
mind: real-time operation of sophisticated neuromorphic vision
algorithms, and off-the-shelf components to ensure rapid technological
evolvability.  A preliminary embedded neuromorphic vision architecture
that includes attentional, gist/layout, object recognition, and
high-level decision subsystems is finally described.}
}

% Press coverage in Programmazione.it (Oct 2002); no named author, so the
% placeholder "A. Anonymous" is stored in the standard author field.
@press{Anonymous02pi,
author={A. Anonymous},
title={La percezione dell'ambiente: i robot non si fermano},
journal={Programmazione.it},
month={Oct},
year={2002},
type={cv},
file = { http://iLab.usc.edu/publications/doc/Anonymous02pi.pdf}
}

% Press coverage: Technology Review 105(8), Oct 2002, on selective-attention
% modeling for robot vision; quotes L. Itti.
@press{Diop02,
author              = { J. C. Diop },
title               = { Upstream: Robotic Vision, Neuroscience-based
strategies give robots a new outlook},
journal             = { Technology Review },
pages               = 33,
month = { Oct },
year                = 2002,
volume={105},
number={8},
keywords            = { Visual perception ; research ; robots },
abstract = {There are some sights and noises that people just can't
help but notice. Indeed, research in neuroscience now suggests that
the recognition of salient objects is a key part of how we make sense
of our environment. But building robots that can intelligently pick
out items of interest using sight or sound remains a daunting
challenge. So a handful of engineers are working on a new approach
called selective-attention modeling, which attempts to program robots
to evaluate scenes critically as some neuroscientists believe people
do.  ``General scene understanding is the Holy Grail for computer
vision,'' says University of Southern California computer scientist
Laurent Itti. Neuroscience-based algorithms, he contends, ``should be
the new approach.''},
type                = { mod;cv;bb },
url       = { http://www.technologyreview.com/articles/upstream1002.asp },
file = { http://iLab.usc.edu/publications/doc/Diop02.pdf}
}

% SPIE proceedings paper (vol. 4787, abstract-reviewed): overview of the
% Beobot neuromorphic robotics platform.
@inproceedings{Itti02spie,
author              = { L. Itti },
title               = { Toward highly capable neuromorphic autonomous 
robots: beobots },
abstract = {We describe a new mobile robotics platform specifically
designed for the implementation and testing of neuromorphic vision
algorithms in unconstrained outdoors environments. The new platform
includes significant computational power (four 1.1GHz CPUs with
gigabit interconnect), a high-speed four-wheel-drive chassis, standard
Linux operating system, and a comprehensive toolkit of C++ vision
classes. The robot is designed with two major goals in mind: real-time
operation of sophisticated neuromorphic vision algorithms, and
off-the-shelf components to ensure rapid technological evolvability.
A preliminary embedded neuromorphic vision architecture that includes
attentional, gist/layout, object recognition, and high-level decision
subsystems is finally described.},
year                = { 2002 },
booktitle           = { Proc. SPIE 47th Annual International Symposium on Optical
                        Science and Technology },
editor={B. Bosacchi and D. B. Fogel and J. C. Bezdek},
volume={4787},
month={Dec},
pages={37-45},
publisher={SPIE Press},
address={Bellingham, WA},
type                = { bu;mod;cv;bb },
file                = { http://iLab.usc.edu/publications/doc/Itti02spie.pdf },
review={abs/conf}
}

% Invited talk: USC Research Activities Presentation Day (Aug 2002).
@invited{Itti02rap,
  author    = {L. Itti},
  title     = {Computational Architectures in Biological Vision},
  booktitle = {USC Research Activities Presentation Day,
Los Angeles, California},
  month     = {Aug},
  year      = {2002},
  type      = {mod;bu;cv}
}

% Invited talk: National Imagery and Mapping Agency Workshop (Aug 2002).
@invited{Itti02nima,
  author    = {L. Itti},
  title     = {Biological Architectures for Imagery Analysis},
  booktitle = {National Imagery and Mapping Agency Workshop, Washington, D.C.},
  month     = {Aug},
  year      = {2002},
  type      = {mod;bu;cv}
}

% Abstract-reviewed conference contribution, listed with Journal of Nuclear
% Medicine 43(5) as its booktitle.
@inproceedings{Itti_etal02jnm,
  author    = {E. Itti and I. T. G. Gonzalo and L. Itti and F. S. Mishkin
and R. S. Swerdloff},
  title     = {Brain perfusion mapping shows ``inverse'' hemispheric dominance in 
men with Klinefelter syndrome},
  booktitle = {Journal of Nuclear Medicine},
  volume    = {43},
  number    = {5},
  pages     = {S1026},
  month     = {May},
  year      = {2002},
  type      = {med},
  review    = {abs/conf}
}

% JSNC'02 abstract-reviewed contribution. No abstract text is recorded for
% this entry; add an abstract field here once the text is available.
@inproceedings{Navalpakkam_Itti02jsnc,
author={V. Navalpakkam and L. Itti},
title={A Biologically-Inspired Scene-based Question Answering Agent},
booktitle={Proc. 9th Joint Symposium on Neural Computation (JSNC'02),
Pasadena, California},
year={2002},
month={May},
type={mod;bu;sc},
file = { http://iLab.usc.edu/publications/doc/Navalpakkam_Itti02jsnc.pdf },
review={abs/conf}
}

% JSNC'02 abstract-reviewed contribution: HMAX view-tuned units classified
% with a simple linear method versus support vector machines.
@inproceedings{Chung_Itti02jsnc,
author={D. Chung and L. Itti},
title={Object recognition in the early visual system},
abstract={To complement our previous modeling of visual attention and
the 'where' processing stream in the primate brain, we now investigate
the recognition of objects at the attended locations in the 'what'
visual processing stream. In the work we built an object recognition
system of view-tuned units (16x16 pixels, 8-bit gray level
scale). The view-tuned units are believed as a critical part for
object recognition of human brain, which is found in the
inferotemporal cortex. The view-tuned units are constructed using HMAX
model that was proposed by M. Riesenhuber and T. Poggio (Hierarchical
models of object recognition in cortex. Nat Neurosci, 2:1019-1025,
1999). HMAX was based on the hierarchical model of early visual system
that uses some amount of abstraction from the incoming information to
primary visual system. In our implementation of HMAX, the input image
(128x128 pixels) is contracted into 16x16 pixels view-tuned
units. As the result, the view-tuned units can be said not only to
possess the information of the image, but also some degree of
invariance to scale and translation. Here we tested with three types
of polygons for the classification (or recognition) problem,
i.e. ellipses, rectangles, and triangles with several variations of
orientations, positions, shapes, sizes, and even occlusions. With two
different approaches, we implemented the successful classification
solutions for the view-tuned units. One method was a simple linear
method, which is mentioned in the paper by Riesenhuber and Poggio. The
point for this work was on the comparison of the simple linear method
with a more sophisticated and expensive method, Support Vector
Machine. We tested for the improvement in the performance of the
trained classifiers. We used three different kernels for the SVM: dot
product (linear), degree 2 polynomial, and Radial Basis Function. At
the results, both the linear method and the SVM-based methods used in
this project constructed adequate classifiers for the classification
of simple grayscale level images. In fact when we used expensive
methods of classification such as SVM with linear and polynomial
kernel, the model gave slightly worse results compared to the one with
simple linear method. From the experiments, we conclude that the
preprocessing (HMAX) already provide a clue for linear separation of
the data, thus the improvements by more sophisticated methods are
insignificant. Further experiments will extend this analysis to a
large number of objects, such as the natural environments with complex
situation, so the SVM classifier may prove more robust than the simple
linear classifier.},
booktitle={Proc. 9th Joint Symposium on Neural Computation (JSNC'02),
Pasadena, California},
year={2002},
month={May},
type={mod;bu;cv},
file = { http://iLab.usc.edu/publications/doc/Chung_Itti02jsnc.pdf },
review={abs/conf}
}

% JSNC'02 abstract-reviewed contribution: the Beobot visually-guided
% robotics platform (hardware and vision software overview).
@inproceedings{Ng_etal02jsnc,
  author    = {J. Ng and R. Hirata and T. N. Mundhenk and E. Pichon and A. Tsui
and T. Ventrice and P. Williams and L. Itti},
  title     = {Towards Visually-Guided Neuromorphic Robots: Beobots},
  booktitle = {Proc. 9th Joint Symposium on Neural Computation (JSNC'02),
Pasadena, California},
  month     = {May},
  year      = {2002},
  type      = {mod;bu;cv;bb},
  review    = {abs/conf},
  file      = { http://iLab.usc.edu/publications/doc/Ng_etal02jsnc.pdf },
  abstract  = {Despite the advancements made in the field of AI and
Robotics, robots today remain vastly inferior to animals in terms of
mental agility. The main reason for this is that robots do not possess
the neural capabilities of an animal brain. Neural algorithms adapt
well to diverse environments, whereas robot AI is usually limited to a
test lab setting. To resolve this disparity, an intuitive solution
would be to try to emulate the neural functions present in animal
brains. However, neural algorithms require vast amounts of
computational power to process, in particular those algorithms that
require real-time vision. Many robots, which run on power-saving
embedded processors, do not have a lot of CPU cycles to spare. We are
developing a high-performance visually-guided robotics platform with
enough processing speed to run neural algorithms. This 'Beobot'
platform consists of a high-performance radio-controlled truck chassis
(the robot) carrying an x86-based supercomputer (the Beowulf
cluster). The computing cluster consists of two compact dual-CPU
motherboards linked together by a gigabit Ethernet
connection. Powering the computer are four Pentium-III (Coppermine)
1Ghz processors along with 768MB of memory per motherboard. Two
Firewire cameras provide the Beobot's vision. A compact flash card is
used as a makeshift hard drive, and it has enough space to store a
thin UNIX-variant kernel and iLab's vision software. The vision
software itself consists of several general-purpose neural
algorithms. Most prominent of these is iLab's Saliency-based visual
attention system, which enables the Beobot to drive its attention
towards the most salient locations and objects in a visual scene. In
addition, we have developed prototype algorithms that allow the Beobot
to parse scene layouts and perform object recognition. A primitive
action/memory AI system allows it to implement simple visually-guided
behavior. Finally, the component-oriented nature of the vision
software enables future additions of neural modules. The potential
advantage of the Beobot comes from its use of x86-based hardware and
UNIX-based C++ development environment. Nearly all the parts of the
Beobot are inexpensive, off-the-shelf components. This enables easy
replacement of broken parts. Furthermore, the expandability of PC
hardware enables devices to be plugged into the Beobot for additional
functionalities. All these traits combined make the Beobot potentially
easy to replicate, and this allows for wider adoption upon the
successful completion of the prototype.}
}

% JSNC'02 abstract-reviewed contribution: composite contour-integration
% model (group suppression, fast plasticity, multi-scale processing).
@inproceedings{Mundhenk_Itti02jsnc,
author={T. N. Mundhenk and L. Itti},
title={Towards a simpler model of contour integration in early visual
processing using a composite of methods},
abstract={iLab has been attempting to simulate contour integration in
early visual preprocessing. Our model starts with a standard butterfly
pattern of neural connections that excite or suppress neighboring
neurons depending on their preferred visual orientation used for
instance by Li (1998). This creates systems where neurons tend to
excite other neurons with a collinear orientation, but tend to
suppress neurons with a parallel orientation. Our current model
attempts to distance itself from many current models that use either
neuro synchronization or cascade effect to obtain good contour
detection. Instead, we have concentrated on a simpler composite model
that uses group suppression gain control, multi scale image analysis
and fast plasticity. In this, group suppression works by summing the
excitation for small groups of neurons. If the group exceeds
threshold, proportionately suppression among the group's neurons is
increased. Fast plasticity works by increasing the excitatory ability
of a neuron if it has been excited by neighboring neurons to a large
enough extent. Finally, multi scale processing works by taking the
result of processing the same image in multiple scales on the same
neural kernel model at each scale. Experiments on real world images
shows that contours are most noticeably improved by the use of group
suppression gain control, while tests on computer generated contours
provided by Jochen Braun that are of varying size, phase and alignment
shows improvement most from the use of fast plasticity and multi scale
processing. Our results so far suggest that all three additions are both
viable and helpful. Further, our model suggests that simpler
mechanisms can be used by the brain in the act of early visual contour
integration. },
booktitle={Proc. 9th Joint Symposium on Neural Computation (JSNC'02),
Pasadena, California},
year={2002},
month={May},
type={mod;bu},
file = { http://iLab.usc.edu/publications/doc/Mundhenk_Itti02jsnc.pdf },
review={abs/conf}
}

% Invited talk: The Aerospace Corporation, El Segundo (Apr 2002).
@invited{Itti02aero,
  author    = {L. Itti},
  title     = {Distributed Attention},
  booktitle = {The Aerospace Corporation, El Segundo, California},
  month     = {Apr},
  year      = {2002},
  type      = {mod;bu;cv}
}

% Journal article, Clinical Nuclear Medicine 27(2): case study with
% coregistered SPECT and MRI in postinfectious encephalitis.
@article{Itti_etal02cnm,
author={E. Itti and K. Huff and M. E. Cornford and L. Itti and K. Poruri
and F. S. Mishkin},
title={Postinfectious encephalitis: A coregistered SPECT and magnetic 
resonance imaging study},
abstract = {An 11-year-old boy was hospitalized for acute headaches,
lethargy, and left hemiparesis. A lumbar puncture showed
polymorphonuclear leukocytes, but cultures remained
negative. Flow-attenuated inversion recovery magnetic resonance
imaging (MRI) was performed, as was brain SPECT using Tc-99m
hexamethylpropylene amineoxime (HMPAO). Anatomic and functional images
were coregistered using a unique surface-matching
technique. Subsequently, biopsy of a large right frontal lobe lesion
was performed. Pathologic examination and electron microscopy showed
degeneration of cortical neurons and associated arenavirus-like
particles. Whether this caused the disease or represented release by
dying neurons of previously present particles remains uncertain. },
keywords={ Arenavirus ; Brain SPECT ; Coregistration ; Encephalitis },
journal={Clinical Nuclear Medicine},
volume={27},
number={2},
year={2002},
month={Feb},
pages={129-130},
file = { http://iLab.usc.edu/publications/doc/Itti_etal02cnm.pdf },
type={med},
if = {2000 impact factor: 0.399}
}

% Conference contribution listed with Journal of Cognitive Neuroscience as
% its booktitle: saliency-based attention integrated with HMAX recognition.
@inproceedings{Walther_etal02cns,
  author    = { D. Walther and M. Riesenhuber and T. Poggio and L. Itti and
C. Koch },
  title     = { Towards an integrated model of saliency-based attention and
object recognition in the primate's visual system },
  booktitle = {Journal of Cognitive Neuroscience},
  volume    = {B14},
  number    = {S},
  pages     = {46-47},
  month     = {Apr},
  year      = {2002},
  type      = { mod;bu },
  review    = {full/conf},
  file      = { http://iLab.usc.edu/publications/doc/Walther_etal02cns.pdf },
  abstract  = {We present an integrated model for the dorsal (where) and
the ventral (what) pathway in the primate's visual processing system
and the interaction between these two pathways. To reach our goal of
modeling visual search behavior in primates, we integrate and extend
the saliency-based model for bottom-up attention by Itti and Koch
(Nat. Rev. Neurosci. 2001;2(3):194-203) and the HMAX hierarchical
model for object recognition by Riesenhuber and Poggio
(Nat. Neurosci. 1999;2:1019-1025).  In the combined model we use
saliency-based attention to modulate object recognition at the V4
level.  Interesting regions in the visual scene are successively
selected by a rapidly shiftable focus of attention (FOA).  Neural
activity of a particular neuron in V4 is inhibited based on its
distance from the current FOA.  Recognition rates for stimuli composed
of two paper clip objects typically increase twofold compared to
previous experiments without attention (Neuron 1999;24(1):87-93).  To
achieve this improvement a depression of the V4 activity outside the
focus of attention by as little as 20\% proves to be sufficient. With
10\% activity modulation recognition still improves by 70\%.  We find
that the twofold increase in recognition rate is robust over a large
range of modulation strengths of the V4 activity.  We conclude that a
rather weak attentional modulation of the neural activity at the V4
level suffices to recognize multiple objects in the same display. Our
model will be extended to search for specific objects in cluttered
natural scenes and include biasing of the attention system in a
top-down manner.  }
}

% AAAI Spring Symposium contribution (abstract-reviewed): real-time parallel
% attention model on Beowulf clusters and on a Beobot.
@inproceedings{Pichon_Itti02aaai,
  author    = { E. Pichon and L. Itti },
  title     = {Real-Time High-Performance Attention Focusing for Outdoors
 Mobile Beobots},
  booktitle = {Proc. AAAI Spring Symposium, Stanford, CA (AAAI-TR-SS-02-04)},
  pages     = {63},
  month     = {Mar},
  year      = {2002},
  type      = {mod;bu;cv;bb},
  review    = {abs/conf},
  abstract  = {When confronted with cluttered natural environments, animals still
perform orders of magnitude better than artificial vision systems in
tasks such as orienting, target detection, navigation and scene
understanding. The recent widespread availability of significant
computational resources, however, in particular through the deployment
of so-called ``Beowulf'' clusters of low-cost personal computers,
leaves us little excuse for the enormous gap still separating
biological from machine vision systems.
We describe a neuromorphic model of how our visual attention is
attracted towards conspicuous locations in a visual scene.  It
replicates processing in posterior parietal cortex and other brain
areas along the dorsal visual stream in the primate brain. The model
includes a bottom-up (image-based) computation of low-level color,
intensity, orientation and motion features, as well as a non-linear
spatial competition which enhances salient locations in each of these
feature channels.  All feature channels feed into a unique scalar
``saliency map'' which controls where to next focus attention
onto. Because it includes a detailed low-level vision front-end, the
model has been applied not only to laboratory stimuli, but also to a
wide variety of natural scenes. In addition to predicting a wealth of
psychophysical experiments, the model demonstrated remarkable
performance at detecting salient objects in outdoors imagery ---
sometimes exceeding human performance --- despite wide variations in
imaging conditions, targets to be detected, and environments.
The present paper focuses on a recently completed parallelization of
the model, which runs at 30 frames/s on a 16-CPU Beowulf cluster, and
on the enhancement of this real-time model to include motion cues in
addition to the previously studied color, intensity and orientation
cues. The parallel model architecture and its deployment onto Linux
Beowulf clusters are described, as well as several examples of
applications to real-time outdoors color video streams. Implementation
on a 4-CPU rugged high-speed mobile robot, a ``Beobot,'' is also
described. The model proves very robust at detecting salient targets
from live video streams, despite large possible variations in
illumination, rapid camera jitter, clutter, or omnipresent optical
flow (e.g., when used on a moving vehicle).  The success of this
approach suggests that the neuromorphic architecture described may
represent a robust and efficient real-time machine vision front-end,
which can be used in conjunction with more detailed localized object
recognition and identification algorithms to be applied at the
selected salient locations.}
}

% Invited talk at Brookhaven National Laboratory (Feb 2002).
@invited{Itti02bnl,
author={L. Itti},
title={Attentional Modulation of Early Sensory Processing},
year={2002},
booktitle={Brookhaven National Laboratory,
Upton, New York},
month={Feb},
type={mod;td;psy}
}

% Virtual Worlds and Simulation Conference contribution (abstract-reviewed):
% neuromorphic attentional selection for allocating computing resources.
@inproceedings{Itti02vwsim,
  author    = {L. Itti},
  title     = {Neuromorphic Attentional Selection for Efficient Allocation of Computing Resources},
  booktitle = {Proc. Virtual Worlds and Simulation Conference, San Antonio, Texas},
  month     = {Jan},
  year      = {2002},
  type      = {mod;bu;cv},
  review    = {abs/conf},
  abstract  = {When confronted with cluttered natural environments, animals
still perform orders of magnitude better than artificial vision
systems in tasks such as orienting, target detection, navigation and
scene understanding. To better understand biological visual
processing, we have developed a neuromorphic model of how our visual
attention is attracted towards conspicuous locations in a visual
scene.  It replicates processing in the dorsal (``where'') visual
stream in the primate brain. The model includes a bottom-up
(image-based) computation of low-level color, intensity, orientation
and motion features, as well as a non-linear spatial competition that
enhances salient locations in each feature channel.  All feature
channels feed into a unique scalar ``saliency map'' which controls
where to next focus attention. We show how our simple within-feature
competition for salience effectively suppresses strong but spatially
widespread feature responses due to clutter.  The model robustly
detects salient targets in live outdoors video streams, despite large
variations in illumination, clutter, and rapid egomotion. The success
of this approach suggests that neuromorphic vision algorithms may
prove unusually robust for outdoors vision applications. Further, we
argue that the massively parallel attentional selection implemented in
our model may represent an efficient approach to the general problem
of allocating limited computational resources under conditions of
sensory overload.}
}

% SPIE Human Vision and Electronic Imaging VII paper (abstract-reviewed):
% real-time parallel attention model applied to outdoors color video.
@inproceedings{Itti02hvei,
  author    = { L. Itti },
  title     = { Real-Time High-Performance Attention Focusing in Outdoors
Color Video Streams},
  booktitle = { Proc. SPIE Human Vision and Electronic Imaging VII
(HVEI'02), San Jose, CA },
  editor    = {B. Rogowitz and T. N. Pappas},
  pages     = {235-243},
  month     = {Jan},
  year      = {2002},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  type      = { mod;bu;cv },
  review    = {abs/conf},
  file      = { http://iLab.usc.edu/publications/doc/Itti02hvei.pdf },
  abstract  = { When confronted with cluttered natural environments,
animals still perform orders of magnitude better than artificial
vision systems in tasks such as orienting, target detection,
navigation and scene understanding. The recent widespread availability
of significant computational resources, however, in particular through
the deployment of so-called ``Beowulf'' clusters of low-cost personal
computers, leaves us little excuse for the enormous gap still
separating biological from machine vision systems.  We describe a
neuromorphic model of how our visual attention is attracted towards
conspicuous locations in a visual scene.  It replicates processing in
posterior parietal cortex and other brain areas along the dorsal
visual stream in the primate brain. The model includes a bottom-up
(image-based) computation of low-level color, intensity, orientation
and motion features, as well as a non-linear spatial competition which
enhances salient locations in each of these feature channels.  All
feature channels feed into a unique scalar ``saliency map'' which
controls where to next focus attention onto. Because it includes a
detailed low-level vision front-end, the model has been applied not
only to laboratory stimuli, but also to a wide variety of natural
scenes. In addition to predicting a wealth of psychophysical
experiments, the model demonstrated remarkable performance at
detecting salient objects in outdoors imagery --- sometimes exceeding
human performance --- despite wide variations in imaging conditions,
targets to be detected, and environments.  The present paper focuses
on a recently completed parallelization of the model, which runs at 30
frames/s on a 16-CPU Beowulf cluster, and on the enhancement of this
real-time model to include motion cues in addition to the previously
studied color, intensity and orientation cues. The parallel model
architecture and its deployment onto Linux Beowulf clusters are
described, as well as several examples of applications to real-time
outdoors color video streams. The model proves very robust at
detecting salient targets from live video streams, despite large
possible variations in illumination, rapid camera jitter, clutter, or
omnipresent optical flow (e.g., when used on a moving vehicle).  The
success of this approach suggests that the neuromorphic architecture
described may represent a robust and efficient real-time machine
vision front-end, which can be used in conjunction with more detailed
localized object recognition and identification algorithms to be
applied at the selected salient locations.}
}

@inproceedings{Itti_etal02nips,
  title     = {Modeling the Modulatory Effect of Attention on Human Spatial Vision},
  author    = {L. Itti and J. Braun and C. Koch},
  abstract  = {We present new simulation results, in which a computational model of
    interacting visual neurons simultaneously predicts the modulation of
    spatial vision thresholds by focal visual attention, for five
    dual-task human psychophysics experiments. This new study
    complements our previous findings that attention activates a
    winner-take-all competition among early visual neurons within one
    cortical hypercolumn.  This ``intensified competition'' hypothesis
    assumed that attention equally affects all neurons, and yielded two
    single-unit predictions: an increase in gain and a sharpening of
    tuning with attention. While both effects have been separately
    observed in electrophysiology, no single-unit study has yet shown
    them simultaneously.  Hence, we here explore whether our model could
    still predict our data if attention might only modulate neuronal
    gain, but do so non-uniformly across neurons and tasks.
    Specifically, we investigate whether modulating the gain of only the
    neurons that are loudest, best-tuned, or most informative about the
    stimulus, or of all neurons equally but in a task-dependent manner,
    may account for the data.  We find that none of these hypotheses
    yields predictions as plausible as the intensified competition
    hypothesis, hence providing additional support for our original
    findings.},
  year      = {2002},
  month     = {Aug},
  publisher = {MIT Press},
  editor    = {T. G. Dietterich and S. Becker and Z. Ghahramani},
  address   = {Cambridge, MA},
  booktitle = {Advances in Neural Information Processing Systems
    (NIPS*2001), Vol. 14},
  pages     = {1247-1254},
  type      = {mod;td;psy},
  file      = {http://iLab.usc.edu/publications/doc/Itti_etal02nips.pdf},
  review    = {full/conf},
  if        = {2001 acceptance rate: 30\%}
}

@invited{Itti01ict,
  author    = {L. Itti},
  title     = {Computational Architecture for Goal-Oriented Attention},
  year      = {2001},
  booktitle = {Institute for Creative Technologies, Marina del Rey, California},
  month     = {Dec},
  type      = {mod;bu;cv}
}

@inproceedings{Miau_etal01spie,
  author    = {F. Miau and C. Papageorgiou and L. Itti},
  title     = {Neuromorphic algorithms for computer vision and attention},
  year      = {2001},
  month     = {Nov},
  keywords  = {Visual attention ; object recognition ; scene analysis ;
    bottom-up ; top-down},
  abstract  = {We describe an integrated vision system which reliably
    detects persons in static color natural scenes, or other targets among
    distracting objects. The system is built upon the
    biologically-inspired synergy between two processing stages: A fast
    trainable visual attention front-end (``where''), which rapidly
    selects a restricted number of conspicuous image locations, and a
    computationally expensive object recognition back-end (``what''),
    which determines whether the selected locations are targets of
    interest. We experiment with two recognition back-ends: One uses a
    support vector machine algorithm and achieves highly reliable
    recognition of pedestrians in natural scenes, but is not particularly
    biologically plausible, while the other is directly inspired from the
    neurobiology of inferotemporal cortex, but is not yet as robust with
    natural images. Integrating the attention and recognition algorithms
    yields substantial speedup over exhaustive search, while preserving
    detection rate. The success of this approach demonstrates that using a
    biological attention-based strategy to guide an object recognition
    system may represent an efficient strategy for rapid scene
    analysis.},
  booktitle = {Proc. SPIE 46th Annual International Symposium on Optical
    Science and Technology},
  editor    = {B. Bosacchi and D. B. Fogel and J. C. Bezdek},
  volume    = {4479},
  pages     = {12-23},
  publisher = {SPIE Press},
  address   = {Bellingham, WA},
  type      = {bu;mod;cv},
  file      = {http://iLab.usc.edu/publications/doc/Miau_etal01spie.pdf},
  review    = {abs/conf}
}

@invited{Itti01miw,
  author    = {L. Itti and S. Schaal},
  title     = {Real-Time Motion Information Mining: Biological Approaches
    and Statistical Learning},
  year      = {2001},
  booktitle = {NSF Motion Imagery Workshop, Washington, D.C.},
  month     = {Nov},
  type      = {mod;bu;cv}
}

@inproceedings{Miau_Itti01embs,
  author    = {F. Miau and L. Itti},
  title     = {A Neural Model Combining Attentional Orienting to Object
    Recognition: Preliminary Explorations on the Interplay Between
    Where and What},
  year      = {2001},
  month     = {Oct},
  keywords  = {Visual attention ; object recognition ; scene analysis ;
    bottom-up ; top-down},
  abstract  = {We propose a model of primate vision that integrates both
    an attentional orienting (``where'') pathway and an object recognition
    (``what'') pathway. The fast visual attention front-end rapidly
    selects the few most conspicuous image locations, and the slower
    object recognition back-end identifies objects at the selected
    locations. The model is applied to classical visual search tasks,
    consisting of finding a specific target among an array of distracting
    visual patterns (e.g., a circle among many squares). The encouraging
    results obtained, in which substantial speedup is achieved by the
    combined attention-recognition model while maintaining good
    recognition performance compared to an exhaustive search, suggest that
    the biologically-inspired architecture proposed represents an
    efficient solution to the difficult problem of rapid scene
    analysis.},
  booktitle = {Proc. IEEE Engineering in Medicine and Biology Society (EMBS),
    Istanbul, Turkey},
  pages     = {789-792},
  type      = {bu;mod;cv},
  note      = {Winner of the Excellence in Neural Engineering Travel Award},
  file      = {http://iLab.usc.edu/publications/doc/Miau_Itti01embs.pdf},
  review    = {full/conf}
}

@article{Itti_etal01jni,
  author   = {L. Itti and L. Chang and T. Ernst},
  title    = {Segmentation of Progressive Multifocal Leukoencephalopathy
    Lesions in Fluid-Attenuated Inversion Recovery
    Magnetic Resonance Imaging},
  journal  = {Journal of Neuroimaging},
  year     = {2001},
  month    = {Oct},
  volume   = {11},
  number   = {4},
  pages    = {412-417},
  abstract = {Background and Purpose: To compare the reproducibility of
    manual and a semi-automated technique for the quantitation of white
    matter lesions in magnetic resonance imaging. Methods: Volumes of
    white matter lesions were determined using FLAIR MRI in 23 AIDS
    patients with progressive multifocal leukoencephalopathy.  Manual
    outlining was compared to an automated method based on region growing
    and adaptive thresholding. Results: Lesion volumes from the two
    methods correlated well (61 lesions, r=0.99, p<0.0001), although the
    volumes differed substantially (12.8 +/- 13.7 percent, mean +/-
    S.D). Interscan intrasubject reproducibility was better for the
    automated than the manual method (2.9 +/- 3.2 percent vs. 12.4 +/- 16.2 percent
    volume difference, p=0.02).  Conclusion: The automated algorithm
    appeared more reproducible, which renders it superior to the manual
    method for longitudinal studies.},
  type     = {mip;med},
  file     = {http://iLab.usc.edu/publications/doc/Itti_etal01jni.pdf},
  if       = {2000 impact factor: 0.942}
}

@article{Itti_etal01oe,
  author   = {L. Itti and C. Gold and C. Koch},
  title    = {Visual Attention and Target Detection in Cluttered Natural
    Scenes},
  journal  = {Optical Engineering},
  volume   = {40},
  number   = {9},
  pages    = {1784-1793},
  year     = {2001},
  month    = {Sep},
  keywords = {Visual attention ; saliency ; preattentive ; inhibition
    of return ; winner-take-all ; bottom-up ; natural scene},
  abstract = {Rather than attempting to fully interpret visual scenes
    in a parallel fashion, biological systems appear to employ
    a serial strategy by which an attentional spotlight rapidly
    selects circumscribed regions in the scene for further analysis.
    The spatiotemporal deployment of attention has been shown
    to be controlled by both bottom-up (image-based) and top-down
    (volitional) cues. We describe a detailed neuromimetic computer
    implementation of a bottom-up scheme for the control of
    visual attention, focusing on the problem of combining information
    across modalities, here orientation, intensity and color
    information, in a purely stimulus-driven manner. We have
    applied this model to a wide range of target detection tasks,
    using synthetic and natural stimuli. Performance has however
    remained difficult to objectively evaluate on natural scenes,
    because no objective reference was available for comparison.
    We here present predicted search times for our model on
    the Search2 database of rural scenes containing a military
    vehicle. Overall, we found a poor correlation between human
    and model search times. Further analysis, however, revealed
    that in 75 percent of the images, the model appeared to
    detect the target faster than humans (for comparison, we
    calibrated the model's arbitrary internal time frame such
    that 2-4 image locations were visited per second). It hence
    seems that this model, which had originally been designed
    not to find small, hidden military vehicles, but rather
    to find the few most obviously conspicuous objects in an
    image, performed as an efficient target detector on the
    Search2 dataset. Further developments of the model are finally
    explored, in particular through a more formal treatment
    of the difficult problem of extracting suitable low-level
    features to be fed into the saliency map.},
  type     = {bu ; mod ; cv},
  file     = {http://iLab.usc.edu/publications/doc/Itti_etal01oe.pdf},
  if       = {1999 impact factor: 1.171}
}

@article{Chang_etal01,
  author   = {L. Chang and O. Speck and E. N. Miller and J. Braun and
    J. Jovicich and C. Koch and L. Itti and T. Ernst},
  title    = {Neural Correlates of Attention and Working Memory Deficits
    in HIV Patients},
  journal  = {Neurology},
  year     = {2001},
  volume   = {57},
  number   = {6},
  pages    = {1001-1007},
  month    = {Sep},
  abstract = {Objectives: To evaluate the neural correlates of attention
    and working memory deficits in patients infected with HIV-1.
    Method: Functional magnetic resonance imaging (fMRI) was
    used to evaluate brain activity in 11 HIV-patients and 11
    age, gender, education and handedness-matched seronegative
    subjects, while performing a battery of tasks that required
    different levels of attention for working memory. Results:
    HIV-patients showed greater brain activation (BOLD signal
    changes) in some brain regions compared to control subjects
    while performing the same tasks. For the simpler tasks,
    HIV patients showed greater activation in the parietal regions.
    However, with more difficult tasks, HIV patients showed
    greater activation additionally in the frontal lobes. Reaction
    times during these tasks were slower but accuracies were
    similar in the HIV patients compared to control subjects.
    Conclusion: Injury to the neural substrate due to the HIV
    infection may necessitate greater attentional modulation
    of the neural circuits, hence a greater usage of the brain
    reserve; additional activation of the frontal lobes is required
    to perform the more complex tasks. The task-dependent increased
    frontal activation in the HIV patients suggests that the
    neural correlate of attentional deficits may be excessive
    attentional modulation as a result of frontostriatal brain
    injury.},
  file     = {http://iLab.usc.edu/publications/doc/Chang_etal01.pdf},
  type     = {mip ; med ; td},
  if       = {1999 impact factor: 5.232}
}

@invited{Itti01rap,
  author    = {L. Itti},
  title     = {Where do you look in natural scenes?},
  year      = {2001},
  booktitle = {USC Research Activities Presentation Day, Los Angeles, California},
  month     = {Sep},
  type      = {mod;bu;cv}
}

@invited{Itti01cnse,
  author    = {L. Itti},
  title     = {Computational Models of Visual Attention},
  year      = {2001},
  booktitle = {Caltech Computation and Neural Systems Anniversary, Pasadena, California},
  month     = {Sep},
  type      = {mod;bu;cv}
}

@incollection{Niebur_etal01,
  author    = {E. Niebur and L. Itti and C. Koch},
  title     = {Controlling the Focus of Visual Selective Attention},
  year      = {2001},
  month     = {Aug},
  abstract  = {Selecting only a subset of the available sensory information
    before further detailed processing is crucial for efficient
    perception. In the visual modality, this selection is frequently
    implemented by suppressing information outside a spatially
    circumscribed region of the visual field, the so-called
    ``focus of attention.'' The model for the control of the
    focus of attention in primates presented here is based on
    a ``Saliency Map'' which is a topographic representation
    of the instantaneous saliency of the visual scene.},
  editor    = {L. van Hemmen and E. Domany and J. Cowan},
  publisher = {Springer Verlag},
  booktitle = {Models of Neural Networks IV},
  type      = {bu;mod;cv},
  file      = {http://iLab.usc.edu/publications/doc/Niebur_etal01.pdf}
}

@patent{Koch_Itti01,
  author       = {C. Koch and L. Itti},
  title        = {Computation of Intrinsic Perceptual Saliency in Visual Environments,
    and Applications},
  month        = {Jul},
  year         = {2001},
  abstract     = {Detection of image salience in a visual display of an image.
    The image is analyzed at multiple spatial scales and over multiple
    feature channels to determine the likely salience of different
    portions of the image. One application for the system is in an
    advertising context. The detection may be improved by second order
    statistics, e.g. mean and standard deviations of different image
    portions relative to other portions. Different edges may be
    considered as being extended edges by looking at the edges over
    multiple spatial scales. One set of feature channels can be optimized
    for use in moving images, and can detect motion or flicker. The
    images can be obtained over multiple spectral ranges and the user can
    be instructed about how to maximize the saliency. This can be applied
    to automatically evaluate and optimize sales or advertisement
    displays.},
  note         = {Patent pending. Filed July 23, 2001, following provisional
    applications No. 60/274,674 filed March 8, 2001 and 60/288,724
    filed May 4, 2001.},
  organization = {California Institute of Technology, Pasadena, California},
  type         = {bu;mod;cv}
}

@incollection{Braun_etal01,
  author    = {J. Braun and C. Koch and D. K. Lee and L. Itti},
  title     = {Perceptual Consequences of Multilevel Selection},
  pages     = {215-242},
  month     = {Apr},
  year      = {2001},
  keywords  = {visual attention},
  abstract  = {The neurobiology and psychology of attention have much to
    learn from each other. Neurobiologists recognize that responses
    in sensory cortex depend on the behavioral relevance of
    a stimulus, but have few ways to study how perception changes
    as a result. Psychologists have the conceptual and methodological
    tools to do just that, but are confounded by the multiple
    interpretations and theoretical ambiguities. This book attempts
    to bridge the two fields and to derive a comprehensive theory
    of attention from both neurobiological and psychological
    data. It highlights situations where attention can be seen
    to alter both neural activity and psychophysical performance/phenomenal
    experience. This ``bicultural'' approach contributes not
    only to attention research but to the larger goal of linking
    neural activity to conscious experience. The book focuses
    mainly on the effects of visual attention on the ventral
    and dorsal streams of visual cortex in humans and monkeys
    and the associated changes in visual performance. Several
    larger findings emerge: attention may involve more than
    one neural system; attention modulates all stages of cortical
    visual processing; the effect of attention is constrained
    by the intrinsic connectivity of cortex and the resulting
    contextual interactions; and the notion of a ``saliency
    map'' remains central to thinking about visual attention.
    The book also considers several approaches to evaluating
    the same variable through different methods, such as behavioral
    measurements, functional imaging, and single-unit recording.},
  editor    = {J. Braun and C. Koch and J. Davis},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  booktitle = {Visual Attention and Cortical Circuits},
  type      = {mod;td;psy}
}

@invited{Itti01vrl,
  author    = {L. Itti},
  title     = {Attentional Modulation of Early Vision},
  year      = {2001},
  booktitle = {USC Vision Research Group, Los Angeles, California},
  month     = {Sep},
  type      = {mod;td}
}

@article{Itti_Koch01nrn,
  author   = {L. Itti and C. Koch},
  title    = {Computational Modelling of Visual Attention},
  journal  = {Nature Reviews Neuroscience},
  volume   = {2},
  number   = {3},
  pages    = {194-203},
  month    = {Mar},
  year     = {2001},
  abstract = {Five important trends have emerged from recent work on computational
    models of focal visual attention that emphasize the bottom-up,
    image-based control of attentional deployment. First, the
    perceptual saliency of stimuli critically depends on the
    surrounding context. Second, a unique saliency map that
    topographically encodes for stimulus conspicuity over the
    visual scene has proved to be an efficient and plausible
    bottom-up control strategy. Third, inhibition of return,
    the process by which the currently attended location is
    prevented from being attended again, is a crucial element
    of attentional deployment. Fourth, attention and eye movements
    tightly interplay, posing computational challenges with
    respect to the coordinate system used to control attention.
    And last, scene understanding and object recognition strongly
    constrain the selection of attended locations. Insights
    from these five key areas provide a framework for a computational
    and neurobiological understanding of visual attention.},
  type     = {mod;bu;td;rev},
  file     = {http://iLab.usc.edu/publications/doc/Itti_Koch01nrn.pdf},
  if       = {2002 impact factor: 24.047}
}

@article{Itti_etal01mrm,
  author   = {L. Itti and L. Chang and T. Ernst},
  title    = {Automatic scan prescription for brain MRI},
  journal  = {Magnetic Resonance in Medicine},
  volume   = {45},
  number   = {3},
  pages    = {486-494},
  month    = {Mar},
  year     = {2001},
  abstract = {Diagnostic brain MRI scans are usually performed by trained
    medical technologists who manually prescribe the position
    and orientation of a scanning volume. In this study, a fully
    automatic computer algorithm is described which compensates
    for variable patient positioning and acquires brain MRI
    scans in a predefined reference orientation. The method
    involves acquiring a rapid water-only pilot scan, segmenting
    the brain surface, and matching it to a reference surface.
    The inverse matching transformation is then used to adapt
    a geometric description of the desired scanning volume,
    defined relative to the reference surface, to the current
    patient. Both pilot scan and processing are performed within
    30 sec. The method was tested in 25 subjects, and consistently
    recovered orientation differences between the reference
    and each subject to within +/-5 degrees. Compared to manual
    prescription, automatic scan prescription promises many
    potential benefits, including reduced scan times, reproducible
    scan orientations along anatomically preferable orientations,
    and better reproducibility for longitudinal studies.},
  address  = {Harbor UCLA Research and Education Institute, Torrance,
    California.},
  type     = {mip;fmri},
  file     = {http://iLab.usc.edu/publications/doc/Itti_etal01mrm.pdf},
  if       = {1999 impact factor: 3.757}
}

@article{Itti_Koch01ei,
  author   = {L. Itti and C. Koch},
  title    = {Feature Combination Strategies for Saliency-Based Visual
    Attention Systems},
  journal  = {Journal of Electronic Imaging},
  volume   = {10},
  number   = {1},
  pages    = {161-169},
  month    = {Jan},
  year     = {2001},
  keywords = {Attention ; saliency ; target detection ; feature integration
    ; learning},
  abstract = {Bottom-up or saliency-based visual attention allows primates
    to detect non-specific conspicuous targets in cluttered
    scenes. A classical metaphor, derived from electrophysiological
    and psychophysical studies, describes attention as a rapidly
    shiftable ``spotlight''. We use a model that reproduces
    the attentional scanpaths of this spotlight. Simple multi-scale
    ``feature maps'' detect local spatial discontinuities in
    intensity, color, and orientation, and are combined into
    a unique ``master'' or ``saliency'' map. The saliency map
    is sequentially scanned, in order of decreasing saliency,
    by the focus of attention. We here study the problem of
    combining feature maps, from different visual modalities
    (such as color and orientation), into a unique saliency
    map. Four combination strategies are compared using three
    databases of natural color images: (1) Simple normalized
    summation, (2) linear combination with learned weights,
    (3) global non-linear normalization followed by summation,
    and (4) local non-linear competition between salient locations
    followed by summation. Performance was measured as the number
    of false detections before the most salient target was found.
    Strategy (1) always yielded poorest performance and (2)
    best performance, with a 3 to 8-fold improvement in time
    to find a salient target. However, (2) yielded specialized
    systems with poor generalization. Interestingly, strategy
    (4) and its simplified, computationally efficient approximation
    (3) yielded significantly better performance than (1), with
    up to 4-fold improvement, while preserving generality.},
  type     = {bu;mod;cv},
  file     = {http://iLab.usc.edu/publications/doc/Itti_Koch01ei.pdf},
  if       = {1999 impact factor: 0.667}
}

@article{Ernst_etal00,
  author   = {T. Ernst and E. Itti and L. Itti and L. Chang},
  title    = {Changes in cerebral metabolism are detected prior to perfusion
    changes in early HIV-CMC: A coregistered (1)H MRS and SPECT
    study},
  journal  = {Journal of Magnetic Resonance Imaging},
  volume   = {12},
  number   = {6},
  pages    = {859-865},
  month    = {Dec},
  year     = {2000},
  abstract = {Human immunodeficiency virus-cognitive motor complex
    (HIV-CMC), a common complication of the acquired immunodeficiency
    syndrome (AIDS), is characterized by progressive cognitive impairment
    and motor dysfunction. Functional imaging methods, such as
    single-photon emission computed tomography (SPECT) and proton magnetic
    resonance spectroscopy ((1)H-MRS), have been applied to assess the
    severity of brain injury. However, it is unclear which of these two
    methods is more sensitive in detecting brain abnormalities in patients
    with early HIV-CMC. Twenty-four HIV-CMC patients were compared with 34
    healthy subjects; each had quantitative SPECT ((133)Xenon-calibrated
    (99m)Tc-HMPAO) and quantitative (1)H-MRS. Both modalities were
    co-registered in order to assess regional cerebral blood flow (rCBF)
    and metabolite concentrations within the same voxel of interest in
    four brain regions (midfrontal and midparietal gray matter,
    temporoparietal white matter, and basal ganglia). On SPECT, only the
    temporoparietal white matter showed a trend for decreased rCBF in
    HIV-CMC patients (-13\%, P = 0.06). On MRS, HIV-CMC patients showed
    significantly reduced creatine concentration in the basal ganglia
    (-8\%, P = 0.008), as well as increased myoinositol concentrations in
    the basal ganglia (+25\%, P = 0.01) and the temporoparietal white
    matter (+18\%, P = 0.08). There was no significant correlation between
    SPECT and MRS variables in the patients in any region. (1)H MRS showed
    abnormal neurochemistry in the basal ganglia, whereas rCBF on SPECT
    was normal in the same region. This finding suggests that metabolite
    concentrations on (1)H MRS are better surrogate markers than rCBF
    measurements with SPECT for the evaluation of brain injury in early
    HIV-CMC.},
  address  = {Department of Radiology, Harbor-UCLA Medical Center, Torrance,
    California 90502, USA.},
  type     = {mip;med},
  file     = {http://iLab.usc.edu/publications/doc/Ernst_etal00.pdf},
  if       = {1999 impact factor: 2.106}
}

@article{Itti_etal00josa,
  author   = {L. Itti and C. Koch and J. Braun},
  title    = {Revisiting Spatial Vision: Toward a Unifying Model},
  journal  = {Journal of the Optical Society of America, JOSA-A},
  volume   = {17},
  number   = {11},
  pages    = {1899-1917},
  month    = {Nov},
  year     = {2000},
  abstract = {We report contrast detection, contrast increment, contrast
    masking, orientation discrimination and spatial frequency
    discrimination thresholds for spatially localized stimuli
    at 4deg of eccentricity. Our stimulus geometry emphasizes
    interactions among overlapping visual filters and differs
    from that used in previous threshold measurements, which
    also admits interactions between distant filters. We quantitatively
    account for all measurements by simulating a small population
    of overlapping visual filters interacting through divisive
    inhibition. We depart from previous models of this kind
    in the parameters of divisive inhibition and in using a
    statistically efficient decision stage based on Fisher information.
    The success of this unified account suggests that, contrary
    to Bowne (1990), spatial vision thresholds reflect a single
    level of processing, perhaps as early as primary visual
    cortex.},
  type     = {psy;mod},
  file     = {http://iLab.usc.edu/publications/doc/Itti_etal00josa.pdf},
  if       = {1999 impact factor: 1.864}
}


@invited{Itti00cnse,
  author    = {L. Itti},
  title     = {Computational Models of Visual Attention},
  year      = {2000},
  booktitle = {Caltech Center for Neuromorphic Systems Engineering, Pasadena, California},
  month     = {Nov},
  type      = {mod;bu;cv}
}


@inproceedings{Itti_etal00arvo,
author              = { L. Itti and J. Braun and C. Koch },
title               = { Single-Filter Gain Changes and Attentional Threshold Modulation },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        2000) },
volume              = 41,
number              = 4,
pages               = {S39},
month               = { Mar },
year                = 2000,
abstract            = { Purpose: We previously used a simple model to interpret
                        attentional modulation of human psychophysical thresholds
                        for pattern discrimination. ``Fully'' and ``poorly'' attended
                        thresholds were obtained using a dual-task paradigm (Lee
                        et al., Nature Nsci'99), for increment contrast discrimination
                        (ICD), orientation and spatial frequency discriminations,
                        and for two contrast masking tasks. The model used 60 linear
                        filters (5 scales, 12 orientations) followed by Heeger-type
                        divisive gain control and statistically efficient decision.
                        It accounted for all the data (Itti et al., ARVO'98) by
                        assuming indiscriminate intensified competition among all
                        filters, predicting both a 3-fold gain increase and a 35\%
                        sharpening of tuning with attention. Here we explore whether
                        a simpler explanation involving only gain changes is tenable
                        as well. Since a unique gain modulation, identical for all
                        filters and tasks, failed to predict our data, we investigate
                        more specific attentional feedback, affecting the gain of
                        task-dependent sub-populations of filters. Methods: We still
                        assume identical modulation for all tasks, but now consider
                        that attention only affects the gain of the single filter
                        (i) best-tuned to, (ii) responding maximally to, or (iii)
                        most informative about a given stimulus. Other filters are
                        unaffected by attention. A downhill simplex with simulated
                        annealing overhead simultaneously fits to the data both
                        the unique, task-independent attentional gain factor and
                        our model's 10 intrinsic parameters (filter tuning, transducer
                        function, filter interaction, response variance), for all
                        tasks in both attentional conditions (64 datapoints). Results
                        and Conclusion: Hypotheses (i) and (ii) yielded qualitatively
                        incorrect fits with negligible modulation in at least two
                        tasks. Residual fit error was 150--300\% higher than under
                        the assumption of intensified competition. Hypothesis (iii)
                        yielded a qualitatively reasonable fit (though not predicting
                        the well-known ``dipper'' in ICD), yet was quantitatively
                        poor (75\% higher error). We conclude that the gain-only
                        manipulations studied do not simultaneously reproduce attentional
                        modulation as well as intensified competition. },
type                = { td;mod },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal00arvo.pdf },
review              = {abs/conf}
}

@inproceedings{Tehrani_etal00,
author              = { F. Tehrani and L. Itti and C. Koch },
title               = { Visual Search Asymmetries Reproduced by Simple Model },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        2000) },
volume              = 41,
number              = 4,
pages               = {S423},
month               = { Mar },
year                = 2000,
abstract            = { Purpose: In some instances of visual search, where human
                        observers detect the presence or absence of a special ``target''
                        visual pattern in an array of identical ``distractor'' patterns,
                        search time asymmetries have been reported when target and
                        distractor patterns are interchanged. A classical explanation
                        for this finding is that targets which contain ``richer
                        features'' are easier to find among simpler distractors
                        than the opposite. For example, a curved line segment is
                        detected faster among straight segments than the opposite,
                        presumably because it has the added property of curvature.
                        Here we test whether a simple computational model of bottom-up
                        attention reproduces such asymmetries. Methods: Our model
                        implements a number of simple multiscale ``feature maps''
                        selective for colors, orientations and intensity, and combines
                        them into a unique topographic ``saliency map'' which guides
                        attention (Itti et al., IEEE-PAMI, 1998). Stimulus arrays
                        were generated by an automatic program. Search elements
                        were randomly jittered by up to 60\% of their size and rotated
                        by up to +/-10deg. Uniform color speckle noise with 10\%
                        probability was finally added. For twenty target/distractor
                        pairs (e.g., ``Q'' among ``O'', or open among closed circles),
                        we generated twenty instances of arrays containing 4x4 to
                        10x10 elements (seven sizes in total). The resulting 5600
                        images were evaluated by our model, and simulated search
                        times were collected. Results and Conclusion: For control
                        target/distractor pairs, for which no asymmetry is found
                        in humans, the model did not either exhibit spurious asymmetries.
                        For pairs yielding asymmetry in humans, the model generally
                        reproduced the asymmetry. With some pairs however, the model
                        initially predicted an opposite asymmetry; careful examination
                        of the model's internals revealed that such failure was
                        due to luminance imbalance between target and distractor.
                        After luminance correction, the correct asymmetries were
                        obtained. In all asymmetry cases, the model showed significantly
                        stronger activity in at least one feature map for the easily-found
                        target. Our simulations hence confirm in a computational
                        manner that asymmetries may be due to an ``added property''
                        in the target that is easy to detect. },
type                = { bu;mod },
review              = {abs/conf}
}

% NOTE(review): pages = S39 below is identical to the pages of entry
% Itti_etal00arvo (same ARVO 2000 volume and number) -- confirm against
% the printed abstract book.
@inproceedings{Egner_etal00,
author              = { S. Egner and L. Itti and C. R. Scheier },
title               = { Comparing attention models with different types of behavior
                        data },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        2000) },
volume              = 41,
number              = 4,
pages               = {S39},
month               = { Mar },
year                = 2000,
abstract            = { Purpose: While looking at an image, a human observer generates
                        a sequence of attentional shifts between different image
                        locations. Different models of visual attention attempt
                        to predict these shifts. The goal of our ongoing project
                        is to evaluate different models by comparing their predicted
                        shifts of attention to the shifts produced by human observers.
                        Methods: The main challenge for our study is that attention
                        models typically predict covert shifts of attention, which
                        can not be measured directly from an observer's behavior.
                        What can be measured, for instance eye movements, is always
                        a result of response-specific (e.g. the oculo-motor system)
                        and non-specific (e.g. attentional) factors. To infer the
                        non-response-specific factors, we recorded different types
                        of responses, eye movements, finger pointing, and mouse
                        clicks for the same stimuli. Each stimulus, a search/pop-out
                        display, a natural scene or a web page, was presented for
                        four seconds. Responses that highly correlated between different
                        modalities were assumed to reflect attentional processes.
                        The behavior data was transformed into image coordinates
                        where it could be compared to the models' predictions. We used
                        a local feature contrast based saliency measurement as a
                        baseline model and the model by Itti and Koch (1998). Other
                        models can be integrated in the same way. We computed how
                        well the distribution of responses from one response system
                        could predict the responses from another response system
                        or from a model. Results: (1) Distributions produced by
                        different response systems are highly correlated. (2) The
                        similarity between responses and model predictions strongly
                        depends on the stimulus category, but for all stimulus categories,
                        the model by Itti and Koch produces better predictions than
                        the baseline model. Conclusions: The high similarity between
                        response modalities indicates that the responses reflect
                        a common underlying, presumably attentional, process. We
                        suggest that mouse clicks are a particularly easy way to
                        gather attentional data. The model by Itti and Koch is favorable
                        over the baseline model. Some model improvements that would
                        lead to an even higher agreement with our empirical data
                        are discussed. },
type                = { bu;mod;psy },
review              = {abs/conf}
}

@article{Itti_Koch00vr,
author              = { L. Itti and C. Koch },
title               = { A saliency-based search mechanism for overt and covert shifts
                        of visual attention },
journal             = { Vision Research },
volume              = 40,
number              = { 10-12 },
pages               = { 1489-1506 },
month               = { May },
year                = 2000,
keywords            = { Adolescence ; Adult ; Attention/*physiology ; Color Perception/physiology
                        ; Female ; Human ; Male ; Middle Age ; *Models, Neurological
                        ; *Models, Psychological ; Psychophysics ; Support, Non-U.S.
                        Gov't ; Support, U.S. Gov't, Non-P.H.S. ; Support, U.S.
                        Gov't, P.H.S. ; Visual Perception/*physiology },
abstract            = { Most models of visual search, whether involving overt eye
                        movements or covert shifts of attention, are based on the
                        concept of a saliency map, that is, an explicit two-dimensional
                        map that encodes the saliency or conspicuity of objects
                        in the visual environment. Competition among neurons in
                        this map gives rise to a single winning location that corresponds
                        to the next attended target. Inhibiting this location automatically
                        allows the system to attend to the next most salient location.
                        We describe a detailed computer implementation of such a
                        scheme, focusing on the problem of combining information
                        across modalities, here orientation, intensity and color
                        information, in a purely stimulus-driven manner. The model
                        is applied to common psychophysical stimuli as well as to
                        a very demanding visual search task. Its successful performance
                        is used to address the extent to which the primate visual
                        system carries out visual search via one or more such saliency
                        maps and how this can be tested. },
address             = { Computation and Neural Systems Program, Division of Biology,
                        California Institute of Technology, Mail-Code 139-74, Pasadena,
                        CA 91125, USA. },
type                = { bu;mod;cv },
file                = { http://iLab.usc.edu/publications/doc/Itti_Koch00vr.pdf },
if                  = {1998 impact factor: 1.809}
}

@press{Selim00,
author              = { J. Selim },
title               = { Hidden in Plain Sight },
journal             = { Discover Magazine },
pages               = 16,
month               = { May },
year                = 2000,
keywords            = { Visual perception ; research },
abstract            = { Your vision is terrible---you just don't know it. A camera
                        took the picture at left. Laurent Itti, a postgraduate researcher
                        in Caltech's Computation and Neural Systems Program, then
                        computer-altered the photo to show the corresponding image
                        that forms inside the eye, at right. Light-detecting cells
                        in the retina are tightly packed only in a small central
                        region. The density of those cells decreases toward the
                        outer edges of the retina, so peripheral vision becomes
                        less crisp. And the location where the optic nerve meets
                        the retina creates a perpetual blind spot. ``We move our
                        eyes three to five times per second, and we remember the
                        objects we've just seen, so we think we see better than
                        we actually do,'' says Itti. },
type                = { mod;cv },
url                 = { http://klab.caltech.edu/~itti/retina/ }
}

@inproceedings{Ernst_etal00ismrm,
author              = { T. Ernst and L. Itti and L. Chang },
title               = { Automatic Scan Prescription for Brain MRI },
booktitle           = { Proceedings of the 8th Annual Meeting of the International
                        Society for Magnetic Resonance in Medicine (ISMRM'2000) },
pages               = 1385,
month               = { Apr },
year                = 2000,
type                = {mip},
file                = { http://iLab.usc.edu/publications/doc/Ernst_etal00ismrm.pdf },
review              = {full/conf}
}

@article{Chang_etal00,
author              = { L. Chang and C. S. Grob and T. Ernst and L. Itti and F.
                        S. Mishkin and R. Jose-Melchor and R. E. Poland },
title               = { Effect of ecstasy [3,4-methylenedioxymethamphetamine (MDMA)]
                        on cerebral blood flow: a co-registered SPECT and MRI study },
journal             = { Psychiatry Research },
volume              = 98,
number              = 1,
pages               = { 15-28 },
month               = { Feb },
year                = 2000,
keywords            = { Adult ; Brain/*drug effects/pathology/radionuclide imaging
                        ; Case-Control Studies ; Cerebrovascular Circulation/*drug
                        effects ; Dose-Response Relationship, Drug ; Female ; Human
                        ; *Magnetic Resonance Imaging ; Male ; Middle Age ; N-Methyl-3,4-methylenedioxyamphetamine/administration
                        and dosage/*adverse effects ; Radiopharmaceuticals/diagnostic
                        use ; Serotonin Agents/administration and dosage/*adverse
                        effects ; Support, Non-U.S. Gov't ; Support, U.S. Gov't,
                        P.H.S. ; Technetium Tc 99m Exametazime/diagnostic use ;
                        Time Factors ; *Tomography, Emission-Computed, Single-Photon },
abstract            = { 3,4-methylenedioxymethamphetamine (MDMA), an illicit recreational
                        drug, damages serotonergic nerve endings. Since the cerebrovasculature
                        is regulated partly by the serotonergic system, MDMA may
                        affect cerebral blood flow (CBF) in humans. We evaluated
                        21 abstinent recreational MDMA users and 21 age- and gender-matched
                        healthy subjects with brain SPECT and MRI. Ten of the MDMA
                        subjects also had repeat SPECT and MRI after receiving two
                        doses of MDMA. Abstinent MDMA users showed no significantly
                        different global or regional CBF (rCBF) compared to the
                        control subjects. However, within 3 weeks after MDMA administration,
                        rCBF remained decreased in the visual cortex, the caudate,
                        the superior parietal and dorsolateral frontal regions compared
                        to baseline rCBF. The decreased rCBF tended to be more pronounced
                        in subjects who received the higher dosage of MDMA. Two
                        subjects who were scanned at 2-3 months after MDMA administration
                        showed increased rather than decreased rCBF. Low-dose recreational
                        MDMA use does not cause detectable persistent rCBF changes
                        in humans. The lack of long-term rCBF changes may be due
                        to a non-significant effect of serotonergic deficits on
                        rCBF, or regeneration of serotonergic nerve terminals. The
                        subacute decrease in rCBF after MDMA administration may
                        be due to the direct effect of MDMA on the serotonergic
                        system or the indirect effects of its metabolites on the
                        dopaminergic system; the preliminary data suggest these
                        effects may be transient. },
address             = { Department of Neurology, UCLA School of Medicine, Harbor-UCLA
                        Medical Center, 1000 W. Carson Street, B-4, Torrance, CA
                        90509, USA. linda_chang@humc.edu },
file                = { http://iLab.usc.edu/publications/doc/Chang_etal00.pdf },
type                = { mip;fmri;med },
if                  = {1998 impact factor: 1.424}
}

% NOTE(review): the granting institution is stored in `organization`;
% standard BibTeX styles read `school` for thesis entries. Left unchanged
% here -- confirm which field the bibTOhtml converter expects.
@phdthesis{Itti00phd,
author              = { L. Itti },
title               = { Models of Bottom-Up and Top-Down Visual Attention },
month               = { Jan },
year                = 2000,
keywords            = { Visual Attention ; Bottom-Up ; Top-Down ; Modeling ; Spatial
                        Vision ; Human Psychophysics ; Neural Networks ; Automatic
                        Target Recognition (ATR) ; Visual Search ; Eye Movements },
abstract            = { When we observe our visual environment, we do not perceive
                        all its components as being equally interesting. Some objects
                        automatically and effortlessly ``pop-out'' from their surroundings,
                        that is, they draw our visual attention, in a ``bottom-up''
                        manner, towards them. In a first approximation, focal visual
                        attention acts as a rapidly shiftable ``spotlight,'' which
                        allows only the selected information to reach higher levels
                        of processing and representation. Most models of the bottom-up
                        control of attention are based on the concept of a saliency
                        map, that is, an explicit two-dimensional map that encodes
                        the conspicuity of objects in the visual environment. Competition
                        among neurons in this map gives rise to a single winning
                        location that corresponds to the next attended target. Inhibiting
                        this location automatically allows the system to attend
                        to the next most salient location. A first body of work
                        in this thesis describes a detailed computer implementation
                        of such a scheme, focusing on the problem of combining information
                        across modalities, here orientation, intensity and color
                        information, in a purely stimulus-driven manner. The model
                        is applied to common psychophysical stimuli as well as to
                        very demanding visual search tasks. Its successful performance
                        is used to address the extent to which the primate visual
                        system carries out visual search via one or more such saliency
                        maps and how this can be tested. We next address the question
                        of what happens once our attention is focused onto a restricted
                        part of our visual field. There is mounting experimental
                        evidence that attention is far more sophisticated than a
                        simple feed-forward spatially-selective filtering process.
                        Indeed, visual processing appears to be significantly different
                        inside the attentional spotlight than outside. That is,
                        in addition to its properties as a feed-forward information
                        processing and transmission bottleneck, focal visual attention
                        feeds back and locally modulates, in a ``top-down'' manner,
                        the visual processing and representation of selected objects.
                        The second body of work presented in this thesis is concerned
                        with a detailed computational model of basic pattern vision
                        in humans and its modulation by top-down attention. We start
                        by acquiring a complete dataset of five different simple
                        psychophysical experiments, including discriminations of
                        contrast, orientation and spatial frequency of simple pattern
                        stimuli by human observers. This experimental dataset places
                        strict constraints on our model of early pattern vision.
                        The model, however, is eventually able to reproduce the
                        entire dataset while assuming plausible neurobiological
                        components. The model is further applied to existing psychophysical
                        data which demonstrates how top-down attention alters performance
                        in these simple psychophysical discrimination experiments.
                        Our model is able to quantitatively account for all observations
                        by assuming that attention strengthens the non-linear cortical
                        interactions among visual neurons. Together, the two aspects
                        of attention studied in this thesis lead us to consider
                        the essential role of non-linear computations in visual
                        processing. We suggest that visual processing, even at its
                        earliest levels, is best characterized not by linear response
                        functions and spatial convolutions, but rather by non-linearly
                        interacting computational devices. },
address             = { Pasadena, California },
organization        = { California Institute of Technology },
type                = { bu;td;mod;psy;cv },
file                = { http://iLab.usc.edu/publications/doc/Itti00phd.pdf }
}

@inproceedings{Itti99palm,
author              = { L. Itti },
title               = { A saliency-based search mechanism for overt and covert shifts
                        of visual attention },
booktitle           = { Workshop on Attention. Reisensburg Castle, Guenzburg, Germany },
month               = { Nov },
year                = 1999,
type                = { bu;mod },
review              = {abs/wkshp}
}

@article{Ernst_etal99mrm,
author              = { T. Ernst and O. Speck and L. Itti and L. Chang },
title               = { Simultaneous correction for interscan patient motion and
                        geometric distortions in echoplanar imaging },
journal             = { Magnetic Resonance in Medicine },
volume              = 42,
number              = 1,
pages               = { 201-205 },
month               = { Jul },
year                = 1999,
keywords            = { Algorithms ; Artifacts ; Brain/anatomy and histology ; Computer
                        Simulation ; Echo-Planar Imaging/*instrumentation ; Human
                        ; Image Enhancement/*instrumentation ; Image Processing,
                        Computer-Assisted/*instrumentation ; Reference Values ;
                        Support, Non-U.S. Gov't ; Support, U.S. Gov't, P.H.S. },
abstract            = { A method is presented for simultaneous correction of linear
                        geometric distortions and interscan patient motion in echoplanar
                        imaging (EPI). The technique does not require the acquisition
                        of specialized scans other than high-resolution magnetic
                        resonance images. The method is based on a generalized surface-based
                        coregistration algorithm, which accounts for a complete
                        3-dimensional affine transformation, i.e., rotations, translations,
                        scaling, and shearing, between two volumetric image data
                        sets. Any minimally distorted high-resolution scan may serve
                        as a reference data set, to which the EPI data set is matched.
                        The algorithmic accuracy was assessed using simulated data
                        sets with known affine distortions. The deviation of the
                        parameters determined by the coregistration program from
                        the true values typically was 1\% or less. Precise alignment
                        of functional and anatomic information will be important
                        for many future clinical applications. },
address             = { Harbor UCLA Research and Education Institute, Torrance,
                        USA. },
type                = { mip;fmri },
file                = { http://iLab.usc.edu/publications/doc/Ernst_etal99mrm.pdf },
if                  = {2000 impact factor: 3.121}
}

@inproceedings{Niebur99,
author              = { E. Niebur and C. Koch and L. Itti and P. N. Steinmetz and
                        A. Roy and P. Fitzgerald and K. O. Johnson and S. S. Hsiao },
title               = { Modeling Selective Attention },
booktitle           = { From Molecular Neurobiology to Clinical Neuroscience, Proceedings
                        of the 1st Goettingen conference of the German Neuroscience
                        Society },
editor              = { N. Elsner and U. Eysel },
pages               = 151,
month               = { Jun },
year                = 1999,
type                = { bu;mod },
review              = {abs/conf}
}

@article{Speck_etal99,
author              = { O. Speck and L. Chang and L. Itti and E. Itti and T. Ernst },
title               = { Comparison of static and dynamic MRI techniques for the
                        measurement of regional cerebral blood volume },
journal             = { Magnetic Resonance in Medicine },
volume              = 41,
number              = 6,
pages               = { 1264-1268 },
month               = { Jun },
year                = 1999,
keywords            = { Brain/*anatomy and histology/blood supply ; Cerebrovascular
                        Circulation/*physiology ; Comparative Study ; Contrast Media
                        ; Echo-Planar Imaging/methods ; Gadolinium DTPA/diagnostic
                        use ; Human ; Image Processing, Computer-Assisted/methods
                        ; Magnetic Resonance Imaging/*methods ; Support, Non-U.S.
                        Gov't ; Support, U.S. Gov't, P.H.S. },
abstract            = { Two different acquisition and processing strategies to
                        determine the regional cerebral blood volume (rCBV) with magnetic
                        resonance imaging (MRI) are compared. The first method is based on the
                        acquisition of the signal time course during a bolus administration of
                        a contrast agent (dynamic method).  The second method evaluates signal
                        changes before and after the contrast agent injection (static method),
                        assuming the contrast agent remains primarily intravascular in the
                        brain after the first pass. Both methods were applied to the same data
                        sets, acquired with either echoplanar imaging (EPI, n = 18) or fast
                        low-angle shot (FLASH, n = 28) techniques.  A voxel-by-voxel
                        correlation between the static and dynamic method yielded a
                        correlation coefficient of 0.76 +/- 0.06 for the EPI and 0.71 +/- 0.10
                        for the FLASH measurements.  The static method was less sensitive and
                        showed higher standard deviations for rCBV than the dynamic
                        method. With the development of truly intravascular contrast agents,
                        the static perfusion MRI method, which can be performed with higher
                        signal-to-noise ratio and higher spatial resolution, may become an
                        alternative to ultra-fast MRI for measuring rCBV. },
address             = { Harbor UCLA Research and Education Institute, Torrance,
                        California, USA. },
type                = { mip;fmri },
file                = { http://iLab.usc.edu/publications/doc/Speck_etal99.pdf },
if                  = {2000 impact factor: 3.121}
}

@inproceedings{Itti_Koch99nato,
author              = { L. Itti and C. Koch },
title               = { Target Detection using Saliency-Based Attention },
booktitle           = { Proc. RTO/SCI-12 Workshop on Search and Target Acquisition
                        (NATO Unclassified), Utrecht, The Netherlands, RTO-MP-45
                        AC/323(SCI)TP/19 },
pages               = { 3.1-3.10 },
month               = { Jun },
year                = 1999,
keywords            = { Visual attention ; saliency ; preattentive ; inhibition
                        of return ; model ; winner-take-all ; bottom-up ; natural
                        scene },
abstract            = { Most models of visual search, whether involving overt eye
                        movements or covert shifts of attention, are based on the
                        concept of a ``saliency map,'' that is, an explicit two-dimensional
                        map that encodes the saliency or conspicuity of objects
                        in the visual environment. Competition among neurons in
                        this map gives rise to a single winning location that corresponds
                        to the next attended target. Inhibiting this location automatically
                        allows the system to attend to the next most salient location.
                        We describe a detailed computer implementation of such a
                        scheme, focusing on the problem of combining information
                        across modalities, here orientation, intensity and color
                        information, in a purely stimulus-driven manner. We have
                        successfully applied this model to a wide range of target
                        detection tasks, using synthetic and natural stimuli. Performance
                        has however remained difficult to objectively evaluate on
                        natural scenes, because no objective reference was available
                        for comparison. We here present predicted search times for
                        our model on the Search2 database of rural scenes containing
                        a military vehicle. Overall, we found a poor correlation
                        between human and model search times. Further analysis however
                        revealed that in 3/4 of the images, the model appeared to
                        detect the target faster than humans (for comparison, we
                        calibrated the model's arbitrary internal time frame such
                        that no more than 2-4 image locations were visited per second).
                        It hence seems that this model, which had originally been
                        designed not to find small, hidden military vehicles, but
                        rather to find the few most obviously conspicuous objects
                        in an image, performed as an efficient target detector on
                        the Search2 dataset. },
type                = { bu;mod;cv },
file                = { http://iLab.usc.edu/publications/doc/Itti_Koch99nato.pdf },
review              = {abs/wkshp}
}

@article{Itti_etal99nc,
author              = { L. Itti and C. Koch and J. Braun },
title               = { A Quantitative Model Relating Visual Neuronal Activity to
                        Psychophysical Thresholds },
journal             = {Neurocomputing},
volume              = { 26-27 },
pages               = { 743-748 },
month               = { Jun },
year                = 1999,
keywords            = { early vision ; model ; divisive inhibition ; non-linear
                        interactions ; masking },
abstract            = { We investigate how a simple, physiologically motivated three-stage
                        neuronal model can establish a quantitative relationship
                        between activities in small populations of simulated early
                        visual neurons and human psychophysical thresholds. The
                        model consists of: First, a bank of linear filters tuned
                        for orientation and spatial period; second, non-linear interactions
                        between filters; and, third, a statistically efficient decision
                        stage. The model quantitatively reproduces human thresholds
                        for five classical pattern discrimination tasks, using a
                        unique set of automatically determined parameters. The resulting
                        model components are all plausible in terms of putative
                        neuronal correlates. },
type                = { mod;psy },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal99nc.pdf },
if                  = {1998 impact factor: 0.453}
}

@inproceedings{Ernst_etal99ismrm,
author              = { T. Ernst and O. Speck and L. Itti and L. Chang },
title               = { Simultaneous Correction for Interscan Patient Motion and
                        Geometric Distortions in Echo Planar Imaging },
booktitle           = { Proc. 7th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1999) },
pages               = 2208,
month               = { May },
year                = 1999,
type                = {mip},
review              = {full/conf}
}

@inproceedings{Speck_etal99ismrm,
author              = { O. Speck and L. Chang and L. Itti and T. Ernst },
title               = { Comparison of Static and Dynamic MRI Techniques for the
                        Measurement of Regional Cerebral Blood Volume },
booktitle           = { Proc. 7th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1999) },
pages               = 603,
month               = { May },
year                = 1999,
type                = {mip},
review              = {full/conf}
}

@inproceedings{Chang_etal99ismrm,
author              = { L. Chang and O. Speck and E. N. Miller and J. Braun and L.
                        Itti and T. Ernst },
title               = { Increased Usage of Brain Reserve Capacity in Patients with
                        HIV },
booktitle           = { Proc. 7th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1999) },
pages               = 822,
month               = { May },
year                = 1999,
type                = { mip;med;fmri },
review              = {full/conf}
}

@inproceedings{Itti_etal99vrc,
author              = { L. Itti and C. R. Scheier and B. Khurana and C. Koch },
title               = { A Simple Model of Long-Range Interactions for the Computation
                        of Salience },
booktitle           = { Proc. 3rd Annual Vision Research Conference, Fort Lauderdale,
                        FL },
month               = { May },
year                = 1999,
keywords            = { Bottom-up attention ; salience ; long-range interactions
                        ; visual search },
abstract            = { Combining, into a single saliency map, information from
                        multiple feature maps (each encoding for salience in different
                        feature types such as color, intensity or orientation at
                        different spatial scales) poses a signal-to-noise ratio
                        problem for the detection of salient targets among distractors.
                        For example, while an orientation pop-out would strongly
                        appear in an orientation discontinuity map tuned to the
                        target, localized activity resulting from the contrast between
                        image background and target or distractors alike would also
                        strongly appear in intensity and color contrast maps. This
                        equally strong activity of targets and distractors in the
                        intensity and color channels diminishes the effective salience
                        of the target from the orientation channel. We investigated
                        how competition for salience within each feature type may
                        alleviate this signal-to-noise problem. We implemented a
                        simple model of spatial competition between salient locations
                        in the form of iterative rectified filtering by a two-dimensional
                        ``Difference-of-Gaussians'' filter with narrow excitatory
                        and broad inhibitory widths (2\% and 25\% of the image width).
                        Based on this model, multiple locations initially eliciting
                        comparable responses (such as in an intensity contrast map
                        with an orientation popout stimulus) suppressed each other,
                        while a location initially standing out (such as in an orientation
                        contrast map with an orientation pop-out stimulus) was greatly
                        enhanced. The competitive process hence increased target-to-distractor
                        salience combined over all channels, both by enhancing locally
                        stronger signals and by suppressing spatially comparable
                        signals. This model has been applied successfully to various
                        visual search tasks, and may provide supportive mechanistic
                        evidence for the results of Scheier et al., in which local
                        versus global masking differentially affects visual search. },
type                = { mod;bu },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal99vrc.pdf },
review              = {abs/conf}
}

@inproceedings{Scheier_etal99vrc,
author              = { C. R. Scheier and B. Khurana and L. Itti and C. Koch },
title               = { Visual Search Amnesic or Memory Driven? },
booktitle           = { Proc. 3rd Annual Vision Research Conference, Fort Lauderdale,
                        FL },
month               = { May },
year                = 1999,
type                = {bu},
review              = {abs/conf}
}

@inproceedings{Itti_etal99arvo,
author              = { L. Itti and J. Braun and C. Koch },
title               = { Contrast Discrimination can Explain Orientation Discrimination },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        1999) },
volume              = 40,
number              = 4,
pages               = 3016,
month               = { Mar },
year                = 1999,
abstract            = { Purpose: Many current early vision models consist of a population
                        of noisy orientation-selective filters, followed by noiseless
                        central decision. It has been argued (Bowne, Vis Res 1990;30:449-61)
                        that all such models, with noise only at the sensory level
                        (filters), cannot explain the differential dependence of
                        contrast and orientation discrimination thresholds on stimulus
                        contrast (Delta_c propto c^-0.4 versus Delta_theta propto
                        c^-0.1, c>0.1). Most current models indeed predict improvement
                        with contrast at the same rate for Delta_c/c and Delta_theta,
                        both resulting from an overall improvement in signal-to-noise
                        ratio with stimulus contrast. One way to reconcile both
                        observations is to assume additional, task-dependent noise
                        at the central decision stage (Bowne, 1990). Here, we argue
                        that this apparent contradiction constitutes further evidence
                        for a certain type of non-linear interactions among filters
                        (``divisive inhibition'', also known as ``Heeger normalization'').
                        [[[FIGURE]]] Results: A model with noiseless decision was
                        able to simultaneously account for both observations (figure)
                        by implementing strong non-linear excitatory and inhibitory
                        interactions between filters tuned to similar orientations.
                        Resulting from the interactions, the orientation tuning
                        bandwidth of the filters broadened by 25\% as c increased
                        from 0.01 to 0.99, which partially canceled the improvement
                        of Delta_theta with c, but did not affect Delta_c/c. Conclusion:
                        ad hoc task-dependent central noise is unnecessary provided
                        that filters interact. Far from constituting a weakness
                        of current spatial vision models, the differential contrast
                        dependence of different types of thresholds corroborates
                        the current views as to the nature of interactions between
                        filters. },
type                = { mod;psy },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal99arvo.pdf },
review              = {abs/conf}
}

@article{Lee_etal99nn,
author              = { D. K. Lee and L. Itti and C. Koch and J. Braun },
title               = { Attention activates winner-take-all competition among visual
                        filters },
journal             = { Nature Neuroscience },
volume              = 2,
number              = 4,
pages               = { 375-81 },
month               = { Apr },
year                = 1999,
address             = { Computation and Neural Systems, California Institute of
                        Technology, Pasadena 91125, USA. },
keywords            = { Attention/*physiology ; Contrast Sensitivity/physiology
                        ; Discrimination (Psychology)/*physiology ; Human ; *Models,
                        Neurological ; Neurons/physiology ; Pattern Recognition,
                        Visual/physiology ; Perceptual Masking ; Sensory Thresholds
                        ; Space Perception/physiology ; Support, U.S. Gov't, Non-P.H.S.
                        ; Support, U.S. Gov't, P.H.S. ; Visual Cortex/*physiology
                        ; Visual Perception/*physiology ; 1999/04/16 02:03 },
abstract            = { Shifting attention away from a visual stimulus reduces,
                        but does not abolish, visual discrimination performance.
                        This residual vision with 'poor' attention can be compared
                        to normal vision with 'full' attention to reveal how attention
                        alters visual perception. We report large differences between
                        residual and normal visual thresholds for discriminating
                        the orientation or spatial frequency of simple patterns,
                        and smaller differences for discriminating contrast. A computational
                        model, in which attention activates a winner-take-all competition
                        among overlapping visual filters, quantitatively accounts
                        for all observations. Our model predicts that the effects
                        of attention on visual cortical neurons include increased
                        contrast gain as well as sharper tuning to orientation and
                        spatial frequency. },
type                = { mod;td;psy },
file                = { http://iLab.usc.edu/publications/doc/Lee_etal99nn.pdf },
if                  = {2000 impact factor: 12.636}
}

@article{Ernst_etal99mri,
author              = { T. Ernst and L. Chang and L. Itti and O. Speck },
title               = { Correlation of regional cerebral blood flow from perfusion
                        MRI and SPECT in normal subjects },
journal             = { Magnetic Resonance Imaging },
volume              = 17,
number              = 3,
pages               = { 349-54 },
month               = { Apr },
year                = 1999,
address             = { Department of Radiology, Harbor UCLA Medical Center, Torrance,
                        CA 90502, USA. ernst@afp76.humc.edu },
keywords            = { Adult ; Aged ; Aged, 80 and over ; Blood Flow Velocity/physiology
                        ; Brain/*blood supply ; Female ; Human ; Image Enhancement
                        ; Image Processing, Computer-Assisted ; *Magnetic Resonance
                        Imaging ; Male ; Middle Age ; Reference Values ; Regional
                        Blood Flow/physiology ; Support, Non-U.S. Gov't ; Support,
                        U.S. Gov't, P.H.S. ; *Tomography, Emission-Computed, Single-Photon
                        ; 1999/04/09 02:03 },
abstract            = { The objective of this study was to determine the relationship
                        in regional cerebral blood flow (rCBF) as measured with
                        perfusion magnetic resonance imaging (pMRI) and single photon
                        emission computer tomography (SPECT). rCBF was determined
                        in 26 healthy subjects with pMRI and SPECT. After co-registration
                        of pMRI with SPECT, rCBF was determined in 10 brain regions
                        relative to the whole slice value. pMRI was evaluated with
                        and without elimination of large vessels. rCBF from pMRI
                        correlates significantly with rCBF from SPECT (r = 0.69
                        with and r = 0.59 without elimination of large vessels;
                        p < 0.0001 for both). Elimination of large vessels reduced
                        the interindividual variance of the pMRI measurements in
                        most regions. rCBF from pMRI shows good correlation with
                        rCBF from SPECT. Because pMRI is sensitive to flow in large
                        vessels while SPECT is not, elimination of large vessels
                        in pMRI reduces the interindividual variability of pMRI
                        and improves the correlation between the two methods. pMRI
                        is a reliable noninvasive method for rCBF measurements. },
type                = { med;fmri;mip },
file                = { http://iLab.usc.edu/publications/doc/Ernst_etal99mri.pdf },
if                  = {1998 impact factor: 1.208}
}

@inproceedings{Itti_etal99nips,
author              = { L. Itti and J. Braun and D. K. Lee and C. Koch },
title               = { Attentional Modulation of Human Pattern Discrimination Psychophysics
                        Reproduced by a Quantitative Model },
booktitle           = { Advances in Neural Information Processing Systems (NIPS*1998), Vol.
                        11 },
editor              = { M. S. Kearns and S. A. Solla and D. A. Cohn },
publisher           = { MIT Press },
address             = { Cambridge, MA },
pages               = { 789-795 },
month               = { Aug },
year                = 1999,
abstract            = { We previously proposed a quantitative model of early visual
                        processing in primates, based on non-linearly interacting
                        visual filters and statistically efficient decision. We
                        now use this model to interpret the observed modulation
                        of a range of human psychophysical thresholds with and without
                        focal visual attention. Our model -- calibrated by an automatic
                        fitting procedure -- simultaneously reproduces thresholds
                        for four classical pattern discrimination tasks, performed
                        while attention was engaged by another concurrent task.
                        Our model then predicts that the seemingly complex improvements
                        of certain thresholds, which we observed when attention
                        was fully available for the discrimination tasks, can best
                        be explained by a strengthening of competition among early
                        visual filters. },
type                = { mod;td;psy },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal99nips.pdf },
url                 = { http://klab.caltech.edu/~itti/topdown/98_NIPS/ },
review              = {full/conf},
if                  = {1998 acceptance rate: 31\%}
}

@inproceedings{Itti_etal99eilat,
author              = { L. Itti and J. Braun and D. K. Lee and C. Koch },
title               = { Attentional Modulation of Human Pattern Discrimination Psychophysics
                        Reproduced by a Quantitative Model },
booktitle           = { Proc. Joint Symposium on Frontiers in Computational Neuroscience,
                        Eilat, Israel },
pages               = 41,
month               = { Mar },
year                = 1999,
type                = { td;mod },
review              = {abs/wkshp}
}

@inproceedings{Itti_Koch99spie,
author              = { L. Itti and C. Koch },
title               = { Comparison of Feature Combination Strategies for Saliency-Based
                        Visual Attention Systems },
booktitle           = { Proc. SPIE Human Vision and Electronic Imaging IV (HVEI'99),
                        San Jose, CA },
publisher           = {SPIE Press},
address             = {Bellingham, WA},
volume              = 3644,
pages               = { 473-82 },
month               = { Jan },
year                = 1999,
abstract            = { Bottom-up or saliency-based visual attention allows primates
                        to detect non-specific conspicuous targets in cluttered
                        scenes. A classical metaphor, derived from electrophysiological
                        and psychophysical studies, describes attention as a rapidly
                        shiftable 'spotlight'. The model described here reproduces
                        the attentional scanpaths of this spotlight: Simple multi-scale
                        'feature maps' detect local spatial discontinuities in intensity,
                        color, orientation or optical flow, and are combined into
                        a unique 'master' or 'saliency' map. The saliency map is
                        sequentially scanned, in order of decreasing saliency, by
                        the focus of attention. We study the problem of combining
                        feature maps, from different visual modalities and with
                        unrelated dynamic ranges, into a unique saliency map. Four
                        combination strategies are compared using three databases
                        of natural color images: (1) Simple normalized summation,
                        (2) linear combination with learned weights, (3) global
                        non-linear normalization followed by summation, and (4)
                        local non-linear competition between salient locations.
                        Performance was measured as the number of false detections
                        before the most salient target was found. Strategy (1) always
                        yielded poorest performance and (2) best performance, with
                        a 3- to 8-fold improvement in time to find a salient target.
                        However, (2) yielded specialized systems with poor generalization.
                        Interestingly, strategy (4) and its simplified, computationally
                        efficient approximation (3) yielded significantly better
                        performance than (1), with up to 4-fold improvement, while
                        preserving generality. },
type                = { mod;bu;cv },
file                = { http://iLab.usc.edu/publications/doc/Itti_Koch99spie.pdf },
review              = {abs/conf}
}

@article{Itti_etal98pami,
author              = { L. Itti and C. Koch and E. Niebur },
title               = { A Model of Saliency-Based Visual Attention for Rapid Scene
                        Analysis },
journal             = { IEEE Transactions on Pattern Analysis and Machine Intelligence },
volume              = 20,
number              = 11,
pages               = { 1254-1259 },
month               = { Nov },
year                = 1998,
keywords            = { Visual attention ; target detection ; saliency ; image understanding },
abstract            = { A trainable visual attention system, inspired by the behavior
                        and the neuronal architecture of the early primate visual
                        system, is presented. Multiscale image features are combined
                        into a single topographical saliency map. A dynamical neural
                        network then selects attended locations in order of decreasing
                        saliency. The system breaks down the complex problem of
                        scene understanding by rapidly selecting, in a computationally
                        efficient manner, conspicuous locations to be analyzed in
                        detail. },
type                = { mod;bu;cv },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal98pami.pdf },
if                  = {1998 impact factor: 1.417}
}

@inproceedings{Braun_etal98sfn,
author              = { J. Braun and D. K. Lee and L. Itti and C. Koch },
title               = { Parallels Between Psychophysical and Neuronal Models of
                        Orientation Tuning },
booktitle           = { Proc. Society for Neuroscience Annual Meeting (SFN'98) },
pages               = 767,
month               = { Nov },
year                = 1998,
type                = { mod;psy },
review              = {abs/conf}
}

@inproceedings{Itti_etal98nips,
author              = { L. Itti and J. Braun and D. K. Lee and C. Koch },
title               = { A Model of Early Visual Processing },
booktitle           = { Advances in Neural Information Processing Systems (NIPS*1997), Vol.
                        10 },
editor              = { M. I. Jordan and M. J. Kearns and S. A. Solla },
publisher           = { MIT Press },
address             = { Cambridge, MA },
pages               = { 173-179 },
month               = { Aug },
year                = 1998,
abstract            = { We propose a model for early visual processing in primates.
                        The model consists of a population of linear spatial filters
                        which interact through non-linear excitatory and inhibitory
                        pooling. Statistical estimation theory is then used to derive
                        human psychophysical thresholds from the responses of the
                        entire population of units. The model is able to reproduce
                        human thresholds for contrast and orientation discrimination
                        tasks, and to predict contrast thresholds in the presence
                        of masks of varying orientation and spatial frequency. },
type                = { mod;psy },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal98nips.pdf },
review              = {full/conf},
if                  = {1997 oral acceptance rate: 4\%}
}

@patent{Ernst_etal98pat,
author              = { T. Ernst and L. Chang and L. Itti },
title               = { An Automated Scanning Apparatus for Tomographic Images },
organization        = { Harbor-UCLA Research and Education Institute, Torrance,
                        CA, USA },
note                = { U.S. Patent number 09/272,436. },
year                = 1998,
keywords            = { automated ; scan acquisition ; scanner ; human ; medical
                        ; image processing },
abstract            = { The invention describes an apparatus and computer algorithm
                        which allows for fast and accurate acquisition of tomographic
                        scans, always in the same frame of reference irrespectively
                        of patient positioning, using conventional scanners. The
                        invention not only significantly reduces total scan time
                        but also ensures reproducibility of the scanning process,
                        which is useful for follow-up studies. },
type                = {mip}
}

@patent{Itti_etal98copy,
author              = { L. Itti and L. Chang and T. Ernst },
title               = { Coregistration for Neuroimaging Systems (C.N.S.) },
organization        = { Harbor-UCLA Research and Education Institute },
address             = { Harbor-UCLA REI - 1124 W. Carson St. - Torrance, CA 90502 },
year                = 1998,
keywords            = { Medical Imaging ; Image Processing ; MRI ; SPECT ; Coregistration
                        ; Software },
abstract            = { C.N.S. is a medical image processing software suite consisting
                        of 300 processing modules and over 250 networks of such
                        modules, developed at Harbor-UCLA Medical Center with Drs.~Chang
                        and Ernst, as well as through a number of collaborations.
                        The software includes, among others, custom automated algorithms
                        for skull stripping, surface-based coregistration, segmentation
                        of CSF, correction for partial volume effects in SPECT/PET/pMRI,
                        coregistration of MRS to MRI allowing accurate MRS localization
                        on follow-up scans, MRI morphometry and drawing of regions
                        of interest, extraction of white-matter lesions, correction
                        for MRI intensity decay with surface coils, correction for
                        geometric distortions and patient motion in EPI-fMRI, sparse
                        fusion of fMRI time series across sessions, computation
                        of fMRI activation, computation of blood flow in Gd-pMRI,
                        elimination of large vessels in pMRI, computation of diffusion
                        tensor in dMRI, 3D surface mesh reconstruction and optimization,
                        and various 2D and 3D visualization tools, classical image
                        processing tools and image conversion tools. <P> The system
                        can be simultaneously built and executed by an unrestricted
                        number of users on various hardware platforms, can automatically
                        generate restricted distributions and updates from the master
                        system, and automatically generates LaTeX and HTML manuals
                        from the online help pages. Copyright by Harbor-UCLA Research
                        and Education Institute. <P> In addition to being a key
                        component of our research at the Dept. of Neurology, Harbor-UCLA
                        Medical Center, our system or parts of it is being used
                        at The National Institutes of Health, the University of
                        California, San Francisco (UCSF), the University of California,
                        San Diego (UCSD), the University of California, Los Angeles,
                        Veterans Affairs Medical Center (UCLA-VAMC), Harbor-UCLA
                        Medical Center, Dept. of Nuclear Medicine, and the Weill
                        Medical College of Cornell University. },
type                = { mip;fmri },
url                 = { http://iLab.usc.edu/cns/ }
}

@inproceedings{Itti_etal98ismrm1,
author              = { L. Itti and T. Ernst and J. Braun and L. Chang },
title               = { Fusion of fMRI Time-Series Across Sessions },
booktitle           = { Proc. 6th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1998), Sydney, Australia },
pages               = 1489,
month               = { Apr },
year                = 1998,
type                = { mip;fmri },
file                = { http://ilab.usc.edu/publications/doc/Itti_etal98ismrm1.pdf },
review              = {full/conf}
}

% ISMRM 1998 meeting entry. Type codes: mip = Medical Image Processing, fmri = Functional Neuroimaging.
@inproceedings{Itti_etal98ismrm2,
author              = { L. Itti and L. Chang and T. Ernst },
title               = { Manual and Automatic Extraction of White-Matter Lesions
                        in FLAIR Images },
booktitle           = { Proc. 6th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1998), Sydney, Australia },
year                = 1998,
month               = { Apr },
pages               = 2073,
type                = { mip;fmri },
file                = { http://ilab.usc.edu/publications/doc/Itti_etal98ismrm2.pdf },
review              = {full/conf}
}

% ARVO 1998 meeting entry. Type codes: td = Model of Top-Down Attentional Modulation,
% mod = Computational Modeling, psy = Human Psychophysics.
@inproceedings{Itti_etal98arvo,
author              = { L. Itti and C. Koch and J. Braun },
title               = { A Model for the Attentional Modulation of Spatial Vision,
                        Continued },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        1998) },
year                = 1998,
month               = { Mar },
volume              = 39,
number              = 4,
pages               = 2934,
type                = { td;mod;psy },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal98arvo.pdf },
review              = {abs/conf}
}

% ARVO 1998 meeting entry. Type code: psy = Human Psychophysics.
@inproceedings{Lee_etal98arvo,
author              = { D. K. Lee and C. Koch and L. Itti and J. Braun },
title               = { Attentional Modulation of Contrast Masking, Continued },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        1998) },
year                = 1998,
month               = { Mar },
volume              = 39,
number              = 4,
pages               = 2938,
type                = {psy},
review              = {abs/conf}
}

% Society for Neuroscience 1997 meeting entry. Type code: fmri = Functional Neuroimaging.
@inproceedings{Braun_etal97sfn,
author              = { J. Braun and T. Ernst and R. Wang and L. Chang and L. Itti
                        and P. Ledden and C. Koch },
title               = { A fMRI Study of Cortical Activity During Visual Segmentation },
booktitle           = { Proc. Society for Neuroscience Annual Meeting (SFN'97) },
year                = 1997,
month               = { Oct },
pages               = { 755.12 },
type                = {fmri},
review              = {abs/conf}
}

% Journal article (Human Brain Mapping, 1997). Type code: mip = Medical Image Processing.
@article{Itti_etal97hbm2,
author              = { L. Itti and L. Chang and T. Ernst and F. S. Mishkin },
title               = { Improved 3D correction for partial volume effects in brain
                        SPECT },
journal             = { Human Brain Mapping },
year                = 1997,
volume              = 5,
number              = 5,
pages               = { 379-388 },
keywords            = { partial volume effects ; MRI ; SPECT ; brain atrophy ; image
                        processing },
abstract            = { An improved method for correction of partial volume effects
                        (PVE) in brain SPECT is proposed. It is fully three-dimensional,
                        does not require particular patient positioning, and works
                        with scans only partially covering the brain. The location
                        of functionally inactive brain regions (primarily cerebrospinal
                        fluid) is extracted from high-resolution MRI. An automatic
                        3D registration algorithm then determines the geometric
                        transformation between MRI and SPECT. Correction consists
                        of: 1) counting the volumetric active/inactive ratio in
                        each volume element of the functional scan using the measured
                        SPECT point spread function; 2) correcting the functional
                        measures according to these ratios; 3) fusing functional
                        and anatomical information at the resolution of MRI. Quantitative
                        validation was performed using a phantom containing a test
                        region in which multiple parallel acrylic plates thinner
                        than SPECT resolution created high PVE, as well as a large
                        reference region not suffering from PVE. Reference activity
                        was recovered in the test region with an accuracy of 1-3
                        percent. The method was applied to clinical images demonstrating
                        a combination of hypoperfusion and cortical atrophy. The
                        composite anatomical-functional corrected images, in which
                        the main sulci are visible, yield better differentiation
                        between decreased function and focal atrophy. },
type                = {mip},
file                = { http://iLab.usc.edu/publications/doc/Itti_etal97hbm2.pdf },
if                  = {1998 impact factor: 4.738}
}

% ISMRM 1997 meeting entry. Type code: mip = Medical Image Processing.
@inproceedings{Itti_etal97ismrm,
author              = { L. Itti and L. Chang and D. Osborn and T. Ernst },
title               = { Automated Extraction of White Matter Lesions in FLAIR Images },
booktitle           = { Proc. 5th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1997), Vancouver,
                        Canada },
year                = 1997,
month               = { Apr },
pages               = 418,
type                = {mip},
review              = {full/conf}
}

% ISMRM 1997 meeting entry. Type code: mip = Medical Image Processing.
@inproceedings{Ernst_etal97ismrm,
author              = { T. Ernst and L. Chang and L. Itti },
title               = { Elimination of Large Vessels Improves Reproducibility of
                        Perfusion MRI },
booktitle           = { Proc. 5th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1997), Vancouver,
                        Canada },
year                = 1997,
month               = { Apr },
pages               = 1791,
type                = {mip},
review              = {full/conf}
}

% ARVO 1997 meeting entry. Type codes: td = Model of Top-Down Attentional Modulation,
% mod = Computational Modeling, psy = Human Psychophysics.
% The abstract contains LaTeX math; control sequences need their backslashes to typeset.
@inproceedings{Itti_etal97arvo,
author              = { L. Itti and C. Koch and J. Braun },
title               = { A Model for the Attentional Modulation of Spatial Vision },
booktitle           = { Investigative Ophthalmology and Visual Science (Proc. ARVO
                        1997) },
year                = 1997,
month               = { Mar },
volume              = 38,
number              = 4,
pages               = 5461,
abstract            = { Purpose: We present a computational model of interactions
                        between visual spatial filters in humans. The model accounts
                        for a range of visual spatial thresholds and their modulation
                        by attention (Wen et al., ARVO'97). Methods: The model assumes
                        that each visual location is analyzed by a set of spatial
                        filters tuned for different spatial frequencies $(\lambda)$
                        and orientations $(\theta)$. Pairs of filters in quadrature
                        phase produce an energy response, $E_{\lambda,\theta}$, which
                        is normalized by the responses $E_{\lambda',\theta'}$ of a
                        subset of filters, centered around $(\lambda,\theta)$ and
                        weighted by coefficients $w_{\lambda,\theta}(\lambda',\theta')$,
                        yielding the pooled energy: $${\cal E}_{\lambda,\theta} = {{E_{\lambda,\theta}^{\gamma}}\over
                        {\sigma_{\lambda}^{\delta}+\sum_{\lambda',\theta'} w_{\lambda,\theta}(\lambda',\theta')
                        E_{\lambda',\theta'}^{\delta}}} \qquad (\gamma \approx 2; \delta
                        \approx 1.5, \mbox{without attention})$$ Psychophysical performance
                        is derived by assuming that each filter exhibits constant
                        background noise plus Poisson noise and by selecting the
                        most sensitive filter for each task (ROC analysis). Results:
                        The model is similar to models of contrast gain control
                        in cat simple cells (Heeger, Vis. Neurosci. 9:181-97, 1992)
                        and produces, without additional assumptions, the ``dip-shaped''
                        non-linearity characteristic of human threshold vision (Wilson
                        et al., Vis. Res. 23:873--82, 1983). Our model differs from
                        others in that only a relatively narrow normalization pool
                        (halfwidth of weight function $\approx 1$ octave and $\approx
                        15^\circ$) accounts for human vision. The model successfully
                        reproduces human contrast, contrast increment, and orientation
                        discrimination thresholds, as well as threshold elevation
                        by contrast masking (Wen et al., ARVO'96 and '97). Increasing
                        both exponents $\gamma$ and $\delta$ accounts for the effect
                        of attention. Conclusion: The results suggest that attention
                        increases the gain and sharpens the tuning of visual spatial
                        filters, by strengthening normalizing interactions between
                        such filters. },
type                = { td;mod;psy },
review              = {abs/conf}
}

% Journal article (Neurology, 1997). Type code: med = Medical Research.
@article{Kaufer_etal97,
author              = { D. I. Kaufer and B. L. Miller and L. Itti and L. A. Fairbanks
                        and J. Li and J. Fishman and J. Kushi and J. L. Cummings },
title               = { Midline cerebral morphometry distinguishes frontotemporal
                        dementia and Alzheimer's disease },
journal             = {Neurology},
year                = 1997,
month               = { Apr },
volume              = 48,
number              = 4,
pages               = { 978-85 },
keywords            = { Aged ; Aged, 80 and over ; Algorithms ; Alzheimer Disease/*diagnosis/psychology
                        ; Brain/*pathology ; Dementia/*diagnosis/psychology ; Diagnosis,
                        Differential ; Discriminant Analysis ; Female ; *Frontal
                        Lobe ; Human ; Magnetic Resonance Imaging ; Male ; Middle
                        Age ; Psychiatric Status Rating Scales ; Reference Values
                        ; Support, Non-U.S. Gov't ; Support, U.S. Gov't, Non-P.H.S.
                        ; Support, U.S. Gov't, P.H.S. ; *Temporal Lobe ; 1997/04/01
                        00:00 },
abstract            = { We investigated and contrasted midline cerebral structures
                        in frontotemporal dementia (FTD) and Alzheimer's disease
                        (AD). FTD and AD may be difficult to distinguish clinically.
                        FTD typically affects frontal and anterior temporal regions,
                        whereas AD tends to involve more posterior temporal and
                        parietal areas. We hypothesized that disease-specific cerebral
                        alterations would be differentially reflected in corresponding
                        regions of the corpus callosum (CC), pericallosal CSF space
                        (PCS), or their ratio (CC:PCS). Regions-of-interest (ROIs)
                        from midsagittal MRIs in 17 AD, 16 FTD, and 12 elderly control
                        (EC) subjects were analyzed. ROIs were divided into four
                        regions using an anatomic landmark-based computer algorithm
                        and were adjusted for head size variation. FTD subjects
                        had a much smaller anterior CC region and significantly
                        larger PCS area, particularly in anterior regions. AD and
                        EC subjects did not differ significantly in any total or
                        regional ROI measure. Total and anterior CC:PCS ratios were
                        markedly lower in FTD patients. Across groups, total CC:PCS
                        correlated significantly with midsagittal cerebral area
                        and was similarly associated with Mini-Mental State Examination
                        score. Anterior CC (AD) and PCS (FTD) regions exhibited
                        disease-specific relationships to these variables. A discriminant
                        model using two ROI variables correctly classified 91\% of
                        AD and FTD patients, comparing favorably with blind clinical
                        MRI diagnostic ratings. Midline cerebral structural alterations
                        reflect differential patterns of cerebral degeneration in
                        AD and FTD, yielding morphometric indices that may facilitate
                        the study of brain-behavior relationships and differential
                        diagnosis of dementia. },
address             = { Department of Psychiatry, University of Pittsburgh School
                        of Medicine, PA, USA. },
type                = {med},
if                  = {1998 impact factor: 4.972}
}

% Cognitive Neuroscience Society 1997 meeting entry. Type code: psy = Human Psychophysics.
@inproceedings{Lee_etal97cns,
author              = { D. K. Lee and L. Itti and C. Koch and J. Braun },
title               = { Attentional Modulation of Spatial Vision },
booktitle           = { Proc. Cognitive Neuroscience Society, 4th Annual Meeting
                        (CNS'97) },
year                = 1997,
month               = { Mar },
pages               = 125,
type                = {psy},
review              = {abs/conf}
}

% Journal article (Human Brain Mapping, 1997). Type code: mip = Medical Image Processing.
@article{Itti_etal97hbm1,
author              = { L. Itti and L. Chang and J. F. Mangin and J. Darcourt and
                        T. Ernst },
title               = { Robust multimodality registration for brain mapping },
journal             = { Human Brain Mapping },
year                = 1997,
volume              = 5,
number              = 1,
pages               = { 3-17 },
keywords            = { multimodality registration ; neuroimaging ; brain ; chamfer
                        matching ; brain surface ; image processing },
abstract            = { We present a robust intrasubject registration method for
                        the synergistic use of multiple neuroimaging modalities,
                        with applications to magnetic resonance imaging (MRI), functional
                        MRI, perfusion MRI, MR spectroscopy, and single-photon emission
                        computed tomography (SPECT). This method allows user-friendly
                        processing of difficult examinations (low spatial resolution,
                        advanced pathology, motion during acquisition, and large
                        areas of focal activation). Registration of three-dimensional
                        (3D) brain scans is initially estimated by first-order moment
                        matching, followed by iterative anisotropic chamfer matching
                        of brain surfaces. Automatic brain surface extraction is
                        performed in all imaging modalities. A new generalized distance
                        definition and new specific methodologies allow registration
                        of scans that cover only a limited range of brain surface.
                        A new semiautomated supervision scheme allows fast and intuitive
                        corrections of possible false automatic registration results.
                        The accuracy of the MRI/SPECT anatomical-functional correspondence
                        obtained was evaluated using simulations and two difficult
                        clinical populations (tumors and degenerative brain disorders).
                        The average discrimination capability of SPECT (12.4 mm
                        in-plane resolution, 20 mm slice thickness) was found to
                        be better than 5 mm after registration with MRI (5 mm slice
                        thickness). Registration accuracy was always better than
                        imaging resolution. Complete 3D MRI and SPECT registration
                        time ranged between 6-11 min, in which surface matching
                        represented 2-3 min. No registration failure occurred. In
                        conclusion, the application of several new image processing
                        techniques allowed efficient and robust registration. },
type                = {mip},
file                = { http://iLab.usc.edu/publications/doc/Itti_etal97hbm1.pdf },
if                  = {1998 impact factor: 4.738}
}

% Journal article (Archives of Neurology, 1996). Type code: med = Medical Research.
@article{Craig_etal96,
author              = { A. H. Craig and J. L. Cummings and L. A. Fairbanks and L. Itti
                        and B. L. Miller and J. Li and I. Mena },
title               = { Cerebral blood flow correlates of apathy in Alzheimer disease },
journal             = { Archives of Neurology },
year                = 1996,
month               = { Nov },
volume              = 53,
number              = 11,
pages               = { 1116-20 },
keywords            = { Aged ; Aged, 80 and over ; Alzheimer Disease/*physiopathology/psychology/radionuclide
                        imaging ; Cerebrovascular Circulation/*physiology ; Female
                        ; Frontal Lobe/physiopathology ; Human ; Male ; Middle Age
                        ; Psychiatric Status Rating Scales ; Support, U.S. Gov't,
                        Non-P.H.S. ; Support, U.S. Gov't, P.H.S. ; Temporal Lobe/physiopathology
                        ; Tomography, Emission-Computed, Single-Photon ; 1996/11/01
                        00:00 },
abstract            = { BACKGROUND: Apathy is a pervasive noncognitive neuropsychiatric
                        disturbance in Alzheimer disease, which causes significant
                        caregiver distress. The neuroanatomical substrate of apathy
                        is not well understood. OBJECTIVE: To study the relationship
                        between regional cerebral blood flow and the presence and
                        severity of the personality disturbance, apathy, in individuals
                        with Alzheimer disease. DESIGN: Analysis of the relationship
                        between regional cerebral blood flow as measured by single
                        photon emission computed tomography and severity of apathy
                        as measured by the Neuropsychiatric Inventory using an analysis
                        of variance design. We examined regional cerebral perfusion
                        alterations as measured by xenon 133Xe-calibrated technetium
                        Tc 99m hexamethyl-propyleneamine-oxime single photon emission
                        computed tomography in relation to the presence and severity
                        of apathy. SETTING: The neurology clinics of the University
                        of California, Los Angeles, UCLA School of Medicine, and
                        Harbor-UCLA Medical Center. PARTICIPANTS: Thirty-one community-dwelling
                        patients fulfilling National Institute of Neurological and
                        Communicative Disorders and Stroke-Alzheimer's Disease and
                        Related Disorders Association diagnostic criteria for probable
                        Alzheimer disease who had a single photon computed tomographic
                        scan performed within 3 months of administration of the
                        Neuropsychiatric Inventory. RESULTS: The presence of apathy
                        was associated with more severe prefrontal and anterior
                        temporal dysfunction. These regional cerebral perfusion
                        relationships with apathy were independent of cognitive
                        decline except in the dorsolateral prefrontal cortex. CONCLUSIONS:
                        These results demonstrate the association of apathetic syndromes
                        with prefrontal and anterior temporal regional brain dysfunction
                        and are consistent with similar findings previously reported
                        in other disorders. },
address             = { Department of Neurology, University of California, Los Angeles,
                        USA. },
type                = {med},
if                  = {1998 impact factor: 3.375}
}

% Society for Neuroscience 1996 meeting entry. Type codes:
% bu = Model of Bottom-Up Saliency-Based Visual Attention, mod = Computational Modeling.
@inproceedings{Itti_etal96sfn,
author              = { L. Itti and E. Niebur and J. Braun and C. Koch },
title               = { A Trainable Model of Visual Attention },
booktitle           = { Proc. Society for Neuroscience Annual Meeting (SFN'96) },
year                = 1996,
month               = { Nov },
pages               = 270,
abstract            = { We present a model of bottom-up selective visual attention,
                        developed in accordance with the known physiology of the
                        visual system of macaque monkeys and humans. The model comprises
                        two interacting stages, the first being a fast and parallel
                        pre-attentive extraction of visual features (orientation,
                        intensity and color, at several spatial scales), and the
                        second a slow and sequential focal attention shifting mechanism
                        (Winner-Take-All neural network for the selection of the
                        most conspicuous image location, and inhibition-of-return
                        mechanism to generate attentional shifts). The link between
                        the two stages is a ``saliency map'', which topographically
                        encodes for the local conspicuity in the visual scene, and
                        controls where the focus of attention is currently deployed
                        [Koch and Ullman, Human Neurobiol. 1985;4:219-227]. Supervised
                        learning can be introduced to bias the relative weights
                        of the features in the construction of the saliency map
                        and achieve some degree of specialization towards target
                        detection tasks. Despite its simplicity, this model has
                        demonstrated interesting performances in reproducing human
                        stimulus-driven task independent attention. Results with
                        the model are comparable to humans' on simple psychophysical
                        tasks (e.g. A. Treisman's pop-out and conjunctive search).
                        Good performance was also obtained in the detection of salient
                        targets in natural color images, despite high noise, large
                        variations in color and illumination, shadows, reflections
                        and strong textures, which are reputed problematic for artificial
                        vision systems (an interactive demonstration may be found
                        at http://www.klab.caltech.edu/~itti/). },
type                = { bu;mod },
file                = { http://iLab.usc.edu/publications/doc/Itti_etal96sfn.pdf },
review              = {abs/conf}
}

% CIAR Workshop on Visual Attention (Toronto, 1996) entry. Type code: psy = Human Psychophysics.
@inproceedings{Braun_etal96to,
author              = { J. Braun and J. J. Wen and L. Itti and C. Koch },
title               = { Concurrent-Task Studies of Attention },
booktitle           = { Proc. Canadian Institute for Advanced Research, Artificial
                        Intelligence and Robotics Program, Workshop on Visual Attention:
                        Focus on Modeling, Toronto, Canada },
year                = 1996,
type                = {psy},
review              = {abs/wkshp}
}

% CIAR Workshop on Visual Attention (Toronto, 1996) entry. Type codes:
% bu = Model of Bottom-Up Saliency-Based Visual Attention, mod = Computational Modeling.
@inproceedings{Niebur_etal96to,
author              = { E. Niebur and L. Itti and C. Koch },
title               = { A Neural Model for the ``Where'' Pathway },
booktitle           = { Proc. Canadian Institute for Advanced Research, Artificial
                        Intelligence and Robotics Program, Workshop on Visual Attention:
                        Focus on Modeling, Toronto, Canada },
year                = 1996,
type                = { bu;mod },
review              = {abs/wkshp}
}

% Neurology Annual Meeting 1996 entry. Type codes: med = Medical Research, mip = Medical Image Processing.
@inproceedings{Goldberg_etal96,
author              = { A. Goldberg and L. Chang and L. Itti and I. Mena and B.
                        L. Miller },
title               = { Visual Agnosia in Alzheimer Disease },
booktitle           = { Neurology Annual Meeting },
year                = 1996,
volume              = 46,
pages               = {A358},
type                = { med;mip },
review              = {abs/conf}
}

% Neurology Annual Meeting 1996 entry. Type codes: med = Medical Research, mip = Medical Image Processing.
@inproceedings{Kaufer_etal96,
author              = { D. I. Kaufer and L. Itti and B. L. Miller and L. A. Fairbanks and
                        J. Li and J. Fishman and J. L. Cummings },
title               = { Midline Cerebral Morphometry Discriminates Frontotemporal
                        Dementia and Alzheimer's Disease },
booktitle           = { Neurology Annual Meeting },
year                = 1996,
volume              = 46,
pages               = 2061,
type                = { med;mip },
review              = {abs/conf}
}

% Journal article (Neurology, 1996). Type code: med = Medical Research.
@article{Benson_etal96,
author              = { D. F. Benson and A. Djenderedjian and B. L. Miller and N.
                        A. Pachana and L. Chang and L. Itti and I. Mena },
title               = { Neural basis of confabulation },
journal             = {Neurology},
year                = 1996,
month               = { May },
volume              = 46,
number              = 5,
pages               = { 1239-43 },
keywords            = { Adult ; Alcohol Amnestic Disorder/*physiopathology/psychology
                        ; Case Report ; Cerebrovascular Circulation ; Cognition
                        Disorders/etiology/physiopathology ; Diencephalon/physiopathology/radionuclide
                        imaging ; Female ; Frontal Lobe/physiopathology/radionuclide
                        imaging ; Human ; Learning Disorders ; Neuropsychological
                        Tests ; Organotechnetium Compounds/diagnostic use ; Oximes/diagnostic
                        use ; Regional Blood Flow ; Tomography, Emission-Computed,
                        Single-Photon ; 1996/05/01 00:00 },
abstract            = { We present a case of acute alcohol-induced Korsakoff amnesia.
                        A severe amnestic-confabulatory syndrome characterized the
                        early clinical status. The initial neuropsychological tests
                        demonstrated severe learning deficits plus impaired performance
                        on many, but not all, tests of frontal lobe function. Single-photon
                        emission CT (SPECT) at this stage showed hypoperfusion in
                        the orbital and medial frontal regions and the medial diencephalic
                        area. Four months later, the patient's amnesia remained
                        but there was no confabulation. Repeat neuropsychological
                        tests confirmed an ongoing severe amnesia, but performance
                        on the frontal lobe tests now was normal. Repeat SPECT showed
                        a return to normal perfusion in the frontal brain areas
                        but little improvement in the medial diencephalic region.
                        These findings along with data from the clinical literature
                        suggest that confabulation results from dysfunction of orbital
                        and a medial frontal cortex. },
address             = { Department of Neurology, UCLA School of Medicine, USA. },
type                = {med},
if                  = {1998 impact factor: 4.972}
}

% ISMRM 1996 meeting entry. Type code: mip = Medical Image Processing.
@inproceedings{Itti_etal96ismrm,
author              = { L. Itti and L. Chang and T. Ernst },
title               = { Robust Multimodality Registration for Neuroimaging },
booktitle           = { Proc. 4th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1996), New York, NY },
year                = 1996,
month               = { Apr },
pages               = 35,
type                = {mip},
review              = {full/conf}
}

% ISMRM 1996 meeting entry. Type codes: mip = Medical Image Processing, med = Medical Research.
@inproceedings{Ernst_etal96ismrm,
author              = { T. Ernst and L. Chang and L. Itti },
title               = { Correlation of Perfusion MRI and SPECT in Normal Subjects },
booktitle           = { Proc. 4th Annual Meeting of the International Society for
                        Magnetic Resonance in Medicine (ISMRM'1996), New York, NY },
year                = 1996,
month               = { Apr },
pages               = 1304,
type                = { mip;med },
review              = {full/conf}
}

% Journal article (Facts and Research in Gerontology, 1995).
% Type codes: mip = Medical Image Processing, med = Medical Research.
@article{Miller_etal95,
author              = { B. L. Miller and L. Itti and J. Li and A. L. Darby and R.
                        Booth and L. Chang and I. Mena },
title               = { Atrophy-Corrected Cerebral Blood Flow in Fronto-Temporal
                        Dementia },
journal             = { Facts and Research in Gerontology },
year                = 1995,
number              = {S},
pages               = { 93-103 },
type                = { mip;med }
}

% Neurology Annual Meeting 1995 entry. Type codes: med = Medical Research, mip = Medical Image Processing.
@inproceedings{Benson_etal95,
author              = { D. F. Benson and B. L. Miller and L. Chang and N. A. Pachana
                        and I. Mena and A. Djenderedjian and L. Itti },
title               = { The Anatomical Substrate of Confabulation },
booktitle           = { Neurology Annual Meeting },
year                = 1995,
volume              = 45,
pages               = {A389},
type                = { med;mip },
review              = {abs/conf}
}

% Human Brain Mapping conference (Paris, 1995) entry. Type code: mip = Medical Image Processing.
@inproceedings{Itti_etal95hbm,
author              = { L. Itti and L. Chang and T. Ernst and B. L. Miller and I.
                        Mena },
title               = { High-Resolution Atrophy Correction in Brain SPECT },
booktitle           = { Human Brain Mapping (Proc. First International Conference
                        on Functional Mapping of the Human Brain, Paris, France) },
year                = 1995,
month               = { Jun },
volume              = {S1},
pages               = 132,
type                = {mip},
review              = {abs/conf}
}

% Neuroimaging in Dementia meeting (Nice, 1995) entry. Type code: mip = Medical Image Processing.
@inproceedings{Itti_etal95nid,
author              = { L. Itti and L. Chang and T. Ernst and B. L. Miller and I.
                        Mena },
title               = { High-Resolution Atrophy Correction in Brain SPECT },
booktitle           = { Proc. Neuroimaging in Dementia, Nice, France },
year                = 1995,
month               = { Mar },
type                = {mip},
review              = {abs/wkshp}
}

% Neuroimaging in Dementia meeting (Nice, 1995) entry. Type code: med = Medical Research.
@inproceedings{Goldberg_etal95nid,
author              = { A. Goldberg and L. Chang and L. Itti and B. L. Miller and
                        I. Mena },
title               = { Visual Agnosia in Alzheimer Disease },
booktitle           = { Proc. Neuroimaging in Dementia, Nice, France },
year                = 1995,
month               = { Mar },
type                = {med},
review              = {abs/wkshp}
}

% Annual NIH-GCRC Meeting (San Diego, 1995) entry. Type code: mip = Medical Image Processing.
@inproceedings{Itti_etal95nih,
author              = { L. Itti and L. Chang and P. Morales and B. L. Miller },
title               = { New computer methods in neurological image processing },
booktitle           = { Proc. Annual NIH-GCRC Meeting, San Diego, CA },
year                = 1995,
month               = { Mar },
type                = {mip},
review              = {abs/wkshp}
}

% Annual NIH-GCRC Meeting (San Diego, 1995) entry. Type code: med = Medical Research.
@inproceedings{Chang_etal95nih,
author              = { L. Chang and P. Lutchmansingh and R. E. Poland and B. Palmer
                        and K. B. Boone and R. Melchor and T. Ernst and L. Itti and
                        I. Mena },
title               = { Cerebral Blood Flow, Biochemical Changes, Cognitive Deficits
                        and Sleep Apnea in Myotonic Dystrophy },
booktitle           = { Proc. Annual NIH-GCRC Meeting, San Diego, CA },
year                = 1995,
month               = { Mar },
type                = {med},
review              = {abs/wkshp}
}

% American Neurological Association 1994 meeting entry.
% Type codes: mip = Medical Image Processing, med = Medical Research.
@inproceedings{Anderson_etal94ana,
author              = { P. G. Anderson and N. Ortego and L. Chang and B. L. Miller
                        and R. Melchor and L. Itti and E. Singer and H. Myers and
                        P. Satz and I. Mena and B. Palmer },
title               = { Co-registration of Single Photon Emission Computed Tomography
                        and Magnetic Resonance Imaging in HIV-1-associated Dementia
                        Complex },
booktitle           = { Proc. 119th Annual Meeting of the American Neurological
                        Association, San Francisco, CA },
year                = 1994,
month               = { Oct },
type                = { mip;med },
review              = {abs/conf}
}

% Itti 1994, master's thesis (written in French, see the note field).
% Topic class: mip (Medical Image Processing).
% The granting institution is given in `school`, which standard BibTeX styles
% require for this entry type. `organization` is kept with the same value
% for backward compatibility.
% NOTE(review): presumably the site's own tooling reads `organization`;
% confirm before dropping either field.
@mastersthesis{Itti94ms,
  author       = { L. Itti },
  title        = { Mise en Correspondance d'Atlas et d'Images Anatomiques 3D
                   pour la Cartographie Fonctionnelle Cerebrale },
  school       = { Ecole Nationale Superieure des Telecommunications, Paris },
  organization = { Ecole Nationale Superieure des Telecommunications, Paris },
  month        = { Jun },
  year         = 1994,
  note         = {in French},
  keywords     = { Computerized brain atlas },
  abstract     = { Les etudes actuelles visant a etablir une cartographie fonctionnelle
                   du cerveau humain necessitent la comparaison d'informations
                   fonctionnelles (tomographie d'emission de positons, magneto-encephalographie...)
                   provenant de plusieurs individus. Cette comparaison repose
                   sur la mise en correspondance prealable des anatomies individuelles
                   (images IRM) avec un atlas. Le but du memoire est dans un
                   premier temps de faire le point sur les methodes relativement
                   frustes actuellement utilisees pour realiser cette mise
                   en correspondance, puis d'envisager de nouvelles approches
                   utilisant les outils performants recemment developpes dans
                   le monde du traitement des images (contours actifs, extraction
                   de lignes de courbure maximale sur des surfaces, topologie
                   discrete, champs de Markov...). Les approches proposees
                   devront inclure la representation des donnees symboliques
                   et (ou) numeriques envisagees pour manipuler l'atlas. },
  type         = {mip}
}

% Darcourt et al. 1994, Journal of Nuclear Medicine 35(5), pp. P43-P44
% (Proc. Annual Meeting of the Society of Nuclear Medicine).
% Topic classes: mip (Medical Image Processing) and med (Medical Research);
% review class: abs/conf.
@inproceedings{Darcourt_etal94snm,
  author    = { J. Darcourt and L. Itti and L. Chang and J. C. Cauvin and
                B. L. Miller and I. Mena },
  title     = { Tl-201 and Tc-99-MIBI SPECT for Brain Tumor Detection: Comparison
                using MRI Coregistration },
  booktitle = { Journal of Nuclear Medicine (Proc. Annual Meeting of the
                Society of Nuclear Medicine) },
  volume    = 35,
  number    = 5,
  pages     = { P43-P44 },
  month     = { Jun },
  year      = 1994,
  type      = { mip;med },
  review    = {abs/conf}
}

% Damien et al. 1992, Proc. 14th Annual International Conference of the
% IEEE Engineering in Medicine and Biology Society (Paris, France), pp. 994-995.
% Topic class: mip (Medical Image Processing); review class: full/conf.
% The `file` field points at the PDF hosted on iLab.usc.edu.
@inproceedings{Damien_etal92,
  author    = { J. Damien and L. Itti and P. Egroizard and R. Itti },
  title     = { Left Ventricle Detection in Radionuclide Ventriculography
                by a Model of Neural Network },
  booktitle = { Proc. 14th Annual International Conference of the IEEE Engineering
                in Medicine and Biology Society (IEEE-EMBS), Paris, France },
  pages     = {994-995},
  month     = { Oct },
  year      = 1992,
  abstract  = {A 2-layer neural network was applied to determine the LV in
               radionuclide ventriculography. After learning by back-propagation, the
               correlations between computed pictures and learning data-set outputs,
               and between learning data-set outputs and other pictures were
               excellent (r=0.92 and r=0.83 respectively).},
  type      = {mip},
  file      = { http://iLab.usc.edu/publications/doc/Damien_etal92embs.pdf },
  review    = {full/conf}
}

% Itti et al. 1991, Proc. 2nd International Symposium on Computer
% Applications in Nuclear Medicine and Cardiac Magnetic Resonance Imaging
% (Rotterdam, The Netherlands).
% Topic class: mip (Medical Image Processing); review class: abs/conf.
@inproceedings{Itti_etal91,
  author    = { L. Itti and R. Itti and J. Damien },
  title     = { Development of a Micro-Computer Based System for Scintigraphic
                Image Display and Processing },
  booktitle = { Proc. 2nd International Symposium on Computer Applications
                in Nuclear Medicine and Cardiac Magnetic Resonance Imaging,
                Rotterdam, The Netherlands },
  month     = { Mar },
  year      = 1991,
  type      = {mip},
  review    = {abs/conf}
}

% Itti and Itti 1990, Proc. SYNBIO Techniques Avancees pour les Sciences
% de la Vie (Lyon, France); French-language title.
% Topic class: mip (Medical Image Processing); review class: abs/conf.
@inproceedings{Itti_Itti90,
  author    = { L. Itti and R. Itti },
  title     = { Traitement d'Images Radioisotopiques Cardiaques sur Micro-Ordinateur },
  booktitle = { Proc. SYNBIO Techniques Avancees pour les Sciences de la
                Vie, Lyon, France },
  month     = { Oct },
  year      = 1990,
  type      = {mip},
  review    = {abs/conf}
}

% Itti et al. 1989, Journal of Nuclear Medicine 30(5), p. 1046
% (Proc. 36th Annual Meeting of the Society of Nuclear Medicine).
% Topic class: mip (Medical Image Processing); review class: abs/conf.
% Venue state is MO (Missouri); MS would denote Mississippi.
@inproceedings{Itti_etal89,
  author    = { L. Itti and A. Dougangi and D. Casset-Senon and R. Itti },
  title     = { Scintigraphic Display and Processing Software Developed
                on a Personal Computer },
  booktitle = { Journal of Nuclear Medicine (Proc. 36th Annual Meeting of
                the Society of Nuclear Medicine, Saint Louis, MO) },
  volume    = 30,
  number    = 5,
  pages     = 1046,
  month     = { May },
  year      = 1989,
  type      = {mip},
  review    = {abs/conf}
}




