%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for Maarten de Rijke at 2022-07-20 11:15:57 +0200


%% Saved with string encoding Unicode (UTF-8)



@book{wassermann-2004-all,
	author = {Wasserman, Larry A.},
	date-added = {2022-07-20 11:12:53 +0200},
	date-modified = {2022-07-20 11:12:53 +0200},
	publisher = {Springer},
	title = {All of Statistics},
	year = {2004}}

@article{van-hasselt-2018-deep,
	author = {van Hasselt, Hado and Doron, Yotam and Strub, Florian and Hessel, Matteo and Sonnerat, Nicolas and Modayil, Joseph},
	date-added = {2022-07-20 11:12:45 +0200},
	date-modified = {2022-07-20 11:12:45 +0200},
	journal = {arXiv preprint arXiv:1812.02648},
	title = {Deep Reinforcement Learning and the Deadly Triad},
	year = {2018}}

@book{sutton-2018-reinforcement,
	author = {Sutton, Richard S. and Barto, Andrew G.},
	date-added = {2022-07-20 11:12:32 +0200},
	date-modified = {2022-07-20 11:13:11 +0200},
	publisher = {MIT Press},
	title = {Reinforcement Learning: An Introduction},
	year = {2018}}

@misc{silver-2022-lecture,
	author = {Silver, David},
	date-added = {2022-07-20 11:12:23 +0200},
	date-modified = {2022-07-20 11:12:23 +0200},
	howpublished = {\url{http://youtube.com}},
	title = {{RL Course} -- {Lecture} 9: Exploitation and Exploration},
	year = {2022}}

@inproceedings{sarvi-2022-understanding,
	author = {Sarvi, Fatemeh and Heuss, Maria and Aliannejadi, Mohammad and Schelter, Sebastian and de Rijke, Maarten},
	booktitle = {WSDM 2022: The Fifteenth International Conference on Web Search and Data Mining},
	date-added = {2022-07-20 11:12:17 +0200},
	date-modified = {2022-07-20 11:12:17 +0200},
	month = {February},
	publisher = {ACM},
	title = {Understanding and Mitigating the Effect of Outliers in Fair Ranking},
	year = {2022}}

@inproceedings{saito-2021-counterfactual,
	abstract = {Counterfactual estimators enable the use of existing log data to estimate how some new target recommendation policy would have performed, if it had been used instead of the policy that logged the data. We say that those estimators work ``off-policy'', since the policy that logged the data is different from the target policy. In this way, counterfactual estimators enable Off-policy Evaluation (OPE) akin to an unbiased offline A/B test, as well as learning new recommendation policies through Off-policy Learning (OPL). The goal of this tutorial is to summarize Foundations, Implementations, and Recent Advances of OPE/OPL. Specifically, we will introduce the fundamentals of OPE/OPL and provide theoretical and empirical comparisons of conventional methods. Then, we will cover emerging practical challenges such as how to take into account combinatorial actions, distributional shift, fairness of exposure, and two-sided market structures. We will then present Open Bandit Pipeline, an open-source package for OPE/OPL, and how it can be used for both research and practical purposes. We will conclude the tutorial by presenting real-world case studies and future directions.},
	author = {Saito, Yuta and Joachims, Thorsten},
	booktitle = {Fifteenth ACM Conference on Recommender Systems},
	date-added = {2022-07-20 11:12:10 +0200},
	date-modified = {2022-07-20 11:12:10 +0200},
	pages = {828--830},
	publisher = {ACM},
	title = {Counterfactual Learning and Evaluation for Recommender Systems: Foundations, Implementations, and Recent Advances},
	year = {2021},
	bdsk-url-1 = {https://doi.org/10.1145/3460231.3473320}}

@inproceedings{riquelme-2018-deep,
	author = {Riquelme, Carlos and Tucker, George and Snoek, Jasper},
	booktitle = {ICLR 2018: International Conference on Learning Representations},
	date-added = {2022-07-20 11:12:00 +0200},
	date-modified = {2022-07-20 11:12:00 +0200},
	title = {Deep Bayesian Bandits Showdown: An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling},
	year = {2018}}

@inproceedings{precup-2000-eligibility,
	address = {San Francisco, CA, USA},
	author = {Precup, Doina and Sutton, Richard S. and Singh, Satinder P.},
	booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning},
	date-added = {2022-07-20 11:11:54 +0200},
	date-modified = {2022-07-20 11:11:54 +0200},
	pages = {759--766},
	publisher = {Morgan Kaufmann Publishers Inc.},
	series = {ICML '00},
	title = {Eligibility Traces for Off-Policy Policy Evaluation},
	year = {2000}}

@inproceedings{oosterhuis-2020-unbiased,
	author = {Oosterhuis, Harrie and Jagerman, Rolf and de Rijke, Maarten},
	booktitle = {Companion Proceedings of the Web Conference 2020},
	date-added = {2022-07-20 11:11:45 +0200},
	date-modified = {2022-07-20 11:11:45 +0200},
	month = {April},
	pages = {299--300},
	publisher = {ACM},
	title = {Unbiased Learning to Rank: Counterfactual and Online Approaches},
	year = {2020}}

@inproceedings{oosterhuis-2021-unifying,
	author = {Oosterhuis, Harrie and de Rijke, Maarten},
	booktitle = {WSDM 2021: 14th International Conference on Web Search and Data Mining},
	date-added = {2022-07-20 11:11:30 +0200},
	date-modified = {2022-07-20 11:11:30 +0200},
	month = {March},
	publisher = {ACM},
	title = {Unifying Online and Counterfactual Learning to Rank},
	year = {2021}}

@inproceedings{oosterhuis-2018-differentiable,
	author = {Oosterhuis, Harrie and de Rijke, Maarten},
	booktitle = {CIKM 2018: International Conference on Information and Knowledge Management},
	date-added = {2022-07-20 11:11:21 +0200},
	date-modified = {2022-07-20 11:13:25 +0200},
	month = {October},
	pages = {1293--1302},
	publisher = {ACM},
	title = {Differentiable Unbiased Online Learning to Rank},
	year = {2018}}

@article{oosterhuis-2022-reaching,
	author = {Oosterhuis, Harrie},
	date-added = {2022-07-20 11:11:15 +0200},
	date-modified = {2022-07-20 11:11:15 +0200},
	journal = {arXiv preprint arXiv:2206.12204},
	title = {Reaching the End of Unbiasedness: Uncovering Implicit Limitations of Click-Based Learning to Rank},
	year = {2022}}

@inproceedings{mcinerney-2020-counterfactual,
	author = {McInerney, James and Brost, Brian and Chandar, Praveen and Mehrotra, Rishabh and Carterette, Benjamin A.},
	booktitle = {{KDD} '20: The 26th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining, Virtual Event, CA, USA, August 23-27, 2020},
	date-added = {2022-07-20 11:11:05 +0200},
	date-modified = {2022-07-20 11:11:05 +0200},
	pages = {1779--1788},
	publisher = {ACM},
	title = {Counterfactual Evaluation of Slate Recommendations with Sequential Reward Interactions},
	year = {2020}}

@misc{levine-2021-gentle,
	author = {Levine, Sergey},
	date-added = {2022-07-20 11:10:57 +0200},
	date-modified = {2022-07-20 11:10:57 +0200},
	howpublished = {\url{https://www.youtube.com/watch?v=tW-BNW1ApN8}},
	title = {A Gentle Introduction to Offline Reinforcement Learning},
	year = {2021}}

@article{joachims-2007-evaluating,
	abstract = {This article examines the reliability of implicit feedback generated from clickthrough data and query reformulations in World Wide Web (WWW) search. Analyzing the users' decision process using eyetracking and comparing implicit feedback against manual relevance judgments, we conclude that clicks are informative but biased. While this makes the interpretation of clicks as absolute relevance judgments difficult, we show that relative preferences derived from clicks are reasonably accurate on average. We find that such relative preferences are accurate not only between results from an individual query, but across multiple sets of results within chains of query reformulations.},
	author = {Joachims, Thorsten and Granka, Laura and Pan, Bing and Hembrooke, Helene and Radlinski, Filip and Gay, Geri},
	date-added = {2022-07-20 11:10:49 +0200},
	date-modified = {2022-07-20 11:10:49 +0200},
	journal = {ACM Trans. Inf. Syst.},
	number = {2},
	pages = {7--es},
	publisher = {ACM},
	title = {Evaluating the Accuracy of Implicit Feedback from Clicks and Query Reformulations in Web Search},
	volume = {25},
	year = {2007},
	bdsk-url-1 = {https://doi.org/10.1145/1229179.1229181}}

@inproceedings{jeunen-2021-pessimistic,
	abstract = {Methods for bandit learning from user interactions often require a model of the reward a certain context-action pair will yield -- for example, the probability of a click on a recommendation. This common machine learning task is highly non-trivial, as the data-generating process for contexts and actions is often skewed by the recommender system itself. Indeed, when the deployed recommendation policy at data collection time does not pick its actions uniformly-at-random, this leads to a selection bias that can impede effective reward modelling. This in turn makes off-policy learning -- the typical setup in industry -- particularly challenging. In this work, we propose and validate a general pessimistic reward modelling approach for off-policy learning in recommendation. Bayesian uncertainty estimates allow us to express scepticism about our own reward model, which can in turn be used to generate a conservative decision rule. We show how it alleviates a well-known decision making phenomenon known as the Optimiser's Curse, and draw parallels with existing work on pessimistic policy learning. Leveraging the available closed-form expressions for both the posterior mean and variance when a ridge regressor models the reward, we show how to apply pessimism effectively and efficiently to an off-policy recommendation use-case. Empirical observations in a wide range of environments show that being conservative in decision-making leads to a significant and robust increase in recommendation performance. The merits of our approach are most outspoken in realistic settings with limited logging randomisation, limited training samples, and larger action spaces.},
	author = {Jeunen, Olivier and Goethals, Bart},
	booktitle = {Fifteenth ACM Conference on Recommender Systems},
	date-added = {2022-07-20 11:10:42 +0200},
	date-modified = {2022-07-20 11:10:42 +0200},
	pages = {63--74},
	publisher = {ACM},
	title = {Pessimistic Reward Models for Off-Policy Learning in Recommendation},
	year = {2021},
	bdsk-url-1 = {https://doi.org/10.1145/3460231.3474247}}

@book{chuklin-2015-click,
	author = {Chuklin, Aleksandr and Markov, Ilya and de Rijke, Maarten},
	date-added = {2022-07-20 11:10:32 +0200},
	date-modified = {2022-07-20 11:10:32 +0200},
	month = {August},
	publisher = {Morgan \& Claypool Publishers},
	series = {Synthesis Lectures on Information Concepts, Retrieval, and Services},
	title = {Click Models for Web Search},
	year = {2015},
	bdsk-url-1 = {http://clickmodels.weebly.com}}

@inproceedings{chen-2021-decision,
	author = {Chen, Lili and Lu, Kevin and Rajeswaran, Aravind and Lee, Kimin and Grover, Aditya and Laskin, Michael and Abbeel, Pieter and Srinivas, Aravind and Mordatch, Igor},
	booktitle = {NeurIPS 2021: Conference on Neural Information Processing Systems},
	date-added = {2022-07-20 11:10:25 +0200},
	date-modified = {2022-07-20 11:10:25 +0200},
	title = {Decision Transformer: Reinforcement Learning via Sequence Modeling},
	year = {2021}}

@inproceedings{chapelle-2011-empirical,
	abstract = {Thompson sampling is one of the oldest heuristics to address the exploration/exploitation trade-off, but it is surprisingly unpopular in the literature. We present here some empirical results using Thompson sampling on simulated and real data, and show that it is highly competitive. And since this heuristic is very easy to implement, we argue that it should be part of the standard baselines to compare against.},
	author = {Chapelle, Olivier and Li, Lihong},
	booktitle = {Proceedings of the 24th International Conference on Neural Information Processing Systems},
	date-added = {2022-07-20 11:10:14 +0200},
	date-modified = {2022-07-20 11:10:14 +0200},
	pages = {2249--2257},
	publisher = {Curran Associates Inc.},
	title = {An Empirical Evaluation of Thompson Sampling},
	year = {2011}}

@inproceedings{agrawal-2013-thompson,
	abstract = {Thompson Sampling is one of the oldest heuristics for multi-armed bandit problems. It is a randomized algorithm based on Bayesian ideas, and has recently generated significant interest after several studies demonstrated it to have better empirical performance compared to the state-of-the-art methods. However, many questions regarding its theoretical performance remained open. In this paper, we design and analyze a generalization of Thompson Sampling algorithm for the stochastic contextual multi-armed bandit problem with linear payoff functions, when the contexts are provided by an adaptive adversary. This is among the most important and widely studied version of the contextual bandits problem. We prove a high probability regret bound of $O(d^{2}/\epsilon \sqrt{T^{1+\epsilon}})$ in time $T$ for any $0 < \epsilon < 1$, where $d$ is the dimension of each context vector and $\epsilon$ is a parameter used by the algorithm. Our results provide the first theoretical guarantees for the contextual version of Thompson Sampling, and are close to the lower bound of $\Omega(d\sqrt{T})$ for this problem. This essentially solves a COLT open problem of Chapelle and Li [COLT 2012].},
	author = {Agrawal, Shipra and Goyal, Navin},
	booktitle = {Proceedings of the 30th International Conference on International Conference on Machine Learning - Volume 28},
	date-added = {2022-07-20 11:10:07 +0200},
	date-modified = {2022-07-20 11:10:07 +0200},
	pages = {III-1220--III-1228},
	publisher = {JMLR.org},
	series = {ICML'13},
	title = {Thompson Sampling for Contextual Bandits with Linear Payoffs},
	year = {2013}}

@article{agrawal-2017-near-optimal,
	abstract = {Thompson Sampling (TS) is one of the oldest heuristics for multiarmed bandit problems. It is a randomized algorithm based on Bayesian ideas and has recently generated significant interest after several studies demonstrated that it has favorable empirical performance compared to the state-of-the-art methods. In this article, a novel and almost tight martingale-based regret analysis for Thompson Sampling is presented. Our technique simultaneously yields both problem-dependent and problem-independent bounds: (1) the first near-optimal problem-independent bound of $O(\sqrt{NT \ln T})$ on the expected regret and (2) the optimal problem-dependent bound of $(1+\epsilon) \sum_{i} \frac{\ln T}{d(\mu_{i}, \mu_{1})} + O(\frac{N}{\epsilon^{2}})$ on the expected regret (this bound was first proven by Kaufmann et al. (2012b)). Our technique is conceptually simple and easily extends to distributions other than the Beta distribution used in the original TS algorithm. For the version of TS that uses Gaussian priors, we prove a problem-independent bound of $O(\sqrt{NT \ln N})$ on the expected regret and show the optimality of this bound by providing a matching lower bound. This is the first lower bound on the performance of a natural version of Thompson Sampling that is away from the general lower bound of $\Omega(\sqrt{NT})$ for the multiarmed bandit problem.},
	author = {Agrawal, Shipra and Goyal, Navin},
	date-added = {2022-07-20 11:10:02 +0200},
	date-modified = {2022-07-20 11:10:02 +0200},
	journal = {J. ACM},
	number = {5},
	publisher = {ACM},
	title = {Near-Optimal Regret Bounds for Thompson Sampling},
	volume = {64},
	year = {2017},
	bdsk-url-1 = {https://doi.org/10.1145/3088510}}