From de0df95a2de4577344fe89f226a2be4deef693cc Mon Sep 17 00:00:00 2001
From: yanirs <yanirs@users.noreply.github.com>
Date: Mon, 15 Jan 2024 23:57:04 +0000
Subject: [PATCH] deploy: 3b84b2473445ad2b33cf5af0969d2a249c798164

---
 2014/08/17/datas-hierarchy-of-needs/index.html                | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2014/10/23/what-is-data-science/index.html                    | 2 +-
 2014/11/05/bcrecommender-traction-update/index.html           | 2 +-
 .../index.html                                                | 2 +-
 2014/12/15/seo-mostly-about-showing-up/index.html             | 2 +-
 2015/01/15/automating-parse-com-bulk-data-imports/index.html  | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2015/03/22/the-long-road-to-a-lifestyle-business/index.html   | 2 +-
 2015/04/24/my-divestment-from-fossil-fuels/index.html         | 2 +-
 .../index.html                                                | 2 +-
 2015/06/06/hopping-on-the-deep-learning-bandwagon/index.html  | 2 +-
 .../index.html                                                | 2 +-
 2015/07/31/goodbye-parse-com/index.html                       | 2 +-
 2015/08/24/you-dont-need-a-data-scientist-yet/index.html      | 2 +-
 .../02/the-wonderful-world-of-recommender-systems/index.html  | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2015/11/23/the-hardest-parts-of-data-science/index.html       | 2 +-
 .../08/this-holiday-season-give-me-real-insights/index.html   | 2 +-
 2016/01/24/the-joys-of-offline-data-collection/index.html     | 2 +-
 .../index.html                                                | 2 +-
 2016/03/20/the-rise-of-greedy-robots/index.html               | 2 +-
 .../index.html                                                | 2 +-
 .../19/making-bayesian-ab-testing-more-accessible/index.html  | 2 +-
 2016/08/04/is-data-scientist-a-useless-job-title/index.html   | 2 +-
 .../08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2017/09/02/state-of-bandcamp-recommender/index.html           | 2 +-
 .../index.html                                                | 2 +-
 2018/07/22/defining-data-science-in-2018/index.html           | 2 +-
 2018/11/03/reflections-on-remote-data-science-work/index.html | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2019/10/06/bootstrapping-the-right-way/index.html             | 2 +-
 .../a-day-in-the-life-of-a-remote-data-scientist/index.html   | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2021/04/05/some-highlights-from-2020/index.html               | 2 +-
 2021/10/07/my-work-with-automattic/index.html                 | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 .../analysis-strategies-in-online-a-b-experiments/index.html  | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 .../12/causal-machine-learning-book-draft-review/index.html   | 2 +-
 2022/12/11/chatgpt-is-transformative-ai/index.html            | 2 +-
 .../remaining-relevant-as-a-small-language-model/index.html   | 2 +-
 .../how-hackable-are-automated-coding-assessments/index.html  | 2 +-
 .../index.html                                                | 2 +-
 .../index.html                                                | 2 +-
 2023/10/25/lessons-from-reluctant-data-engineering/index.html | 2 +-
 .../index.html                                                | 2 +-
 about/index.html                                              | 2 +-
 consult/index.html                                            | 2 +-
 kaggle/index.html                                             | 2 +-
 phd-work/index.html                                           | 2 +-
 sitemap.xml                                                   | 2 +-
 talks/index.html                                              | 4 ++--
 65 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/2014/08/17/datas-hierarchy-of-needs/index.html b/2014/08/17/datas-hierarchy-of-needs/index.html
index 753f24ea5..5b22377c3 100644
--- a/2014/08/17/datas-hierarchy-of-needs/index.html
+++ b/2014/08/17/datas-hierarchy-of-needs/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Data’s hierarchy of needs | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="business,data business,data science"><meta name=description content="Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Data’s hierarchy of needs"><meta property="og:description" content="Discussing the hierarchy of needs proposed by Jay Kreps. 
Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"><meta property="og:image" content="https://yanirseroussi.com/datas-hierarchy-of-needs.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-08-17T13:09:30+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/datas-hierarchy-of-needs.jpg"><meta name=twitter:title content="Data’s hierarchy of needs"><meta name=twitter:description content="Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Data’s hierarchy of needs","item":"https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Data’s hierarchy of needs","name":"Data’s hierarchy of needs","description":"Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms \u0026amp; insights can only be as good as the underlying data.","keywords":["business","data business","data science"],"articleBody":"One of my favourite blog posts in recent times is The Log: What every software engineer should know about real-time data’s unifying abstraction by Jay Kreps. 
That post comprehensively describes how abstracting all the data produced by LinkedIn’s various components into a single log pipeline greatly simplified their architecture and enabled advanced data-driven applications. Among the various technical details there are some beautifully-articulated business insights. My favourite one defines data’s hierarchy of needs:\nEffective use of data follows a kind of Maslow’s hierarchy of needs. The base of the pyramid involves capturing all the relevant data, being able to put it together in an applicable processing environment (be that a fancy real-time query system or just text files and python scripts). This data needs to be modeled in a uniform way to make it easy to read and process. Once these basic needs of capturing data in a uniform way are taken care of it is reasonable to work on infrastructure to process this data in various ways—MapReduce, real-time query systems, etc.\nIt’s worth noting the obvious: without a reliable and complete data flow, a Hadoop cluster is little more than a very expensive and difficult to assemble space heater. Once data and processing are available, one can move concern on to more refined problems of good data models and consistent well understood semantics. Finally, concentration can shift to more sophisticated processing—better visualization, reporting, and algorithmic processing and prediction.\nIn my experience, most organizations have huge holes in the base of this pyramid—they lack reliable complete data flow—but want to jump directly to advanced data modeling techniques. This is completely backwards. [emphasis mine]\nVisually, it looks something like this:\nIn addition, before starting to build a data pipeline, one needs to ensure that the tracked system works as expected. For example, a buggy website is likely to produce weird metrics, which in turn would make the data processing, reporting and predictions unreliable. 
I completely agree with Jay’s point about needing to get the basis of the pyramid right before setting out to do “something with data” (which seems to be the desire of every company nowadays).\nThe general point is that it’s important to have realistic expectations about what can be obtained by data-driven algorithms and insights. These can only be as good as the underlying data, with the results always depending to a large degree on having a solid infrastructure. Not everything has to be perfect from the start (most things never will be), but some degree of robustness is required to avoid spending too many resources on things that would never work. Trying to apply the latest predictive models without a reliable data infrastructure is like driving a fancy car on broken roads – you’re unlikely to get very far.\n","wordCount":"462","inLanguage":"en","image":"https://yanirseroussi.com/datas-hierarchy-of-needs.jpg","datePublished":"2014-08-17T13:09:30Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 
24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Data’s hierarchy of needs</h1><div class=post-meta><span title='2014-08-17 13:09:30 +0000 UTC'>August 17, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-08-17-datas-hierarchy-of-needs/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg 794w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg alt width=794 height=582></figure><div class=post-content><p>One of my favourite blog posts in recent times is <a href=http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying>The Log: What every software engineer should know about real-time data&rsquo;s unifying abstraction</a> by Jay Kreps. That post comprehensively describes how abstracting all the data produced by LinkedIn&rsquo;s various components into a single log pipeline greatly simplified their architecture and enabled advanced data-driven applications. Among the various technical details there are some beautifully-articulated business insights. My favourite one defines data&rsquo;s hierarchy of needs:</p><blockquote><p>Effective use of data follows a kind of <a href=http://en.wikipedia.org/wiki/Maslow%27s_hierarchy_of_needs target=_blank rel=noopener>Maslow&rsquo;s hierarchy of needs</a>. The base of the pyramid involves capturing all the relevant data, being able to put it together in an applicable processing environment (be that a fancy real-time query system or just text files and python scripts). This data needs to be modeled in a uniform way to make it easy to read and process. 
Once these basic needs of capturing data in a uniform way are taken care of it is reasonable to work on infrastructure to process this data in various ways—MapReduce, real-time query systems, etc.</p><p>It&rsquo;s worth noting the obvious: without a reliable and complete data flow, a Hadoop cluster is little more than a very expensive and difficult to assemble space heater. Once data and processing are available, one can move concern on to more refined problems of good data models and consistent well understood semantics. Finally, concentration can shift to more sophisticated processing—better visualization, reporting, and algorithmic processing and prediction.</p><p><strong>In my experience, most organizations have huge holes in the base of this pyramid—they lack reliable complete data flow—but want to jump directly to advanced data modeling techniques. This is completely backwards.</strong> [emphasis mine]</p></blockquote><p>Visually, it looks something like this:</p><figure><a href=datas-hierarchy-of-needs.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="business,data business,data science"><meta name=description content="Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Data’s hierarchy of needs"><meta property="og:description" content="Discussing the hierarchy of needs proposed by Jay Kreps. 
Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"><meta property="og:image" content="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-08-17T13:09:30+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg"><meta name=twitter:title content="Data’s hierarchy of needs"><meta name=twitter:description content="Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms & insights can only be as good as the underlying data."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Data’s hierarchy of needs","item":"https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Data’s hierarchy of needs","name":"Data’s hierarchy of needs","description":"Discussing the hierarchy of needs proposed by Jay Kreps. Key takeaway: Data-driven algorithms \u0026amp; insights can only be as good as the underlying data.","keywords":["business","data business","data science"],"articleBody":"One of my favourite blog posts in recent times is The Log: What every software engineer should know about real-time data’s unifying abstraction by Jay Kreps. 
That post comprehensively describes how abstracting all the data produced by LinkedIn’s various components into a single log pipeline greatly simplified their architecture and enabled advanced data-driven applications. Among the various technical details there are some beautifully-articulated business insights. My favourite one defines data’s hierarchy of needs:\nEffective use of data follows a kind of Maslow’s hierarchy of needs. The base of the pyramid involves capturing all the relevant data, being able to put it together in an applicable processing environment (be that a fancy real-time query system or just text files and python scripts). This data needs to be modeled in a uniform way to make it easy to read and process. Once these basic needs of capturing data in a uniform way are taken care of it is reasonable to work on infrastructure to process this data in various ways—MapReduce, real-time query systems, etc.\nIt’s worth noting the obvious: without a reliable and complete data flow, a Hadoop cluster is little more than a very expensive and difficult to assemble space heater. Once data and processing are available, one can move concern on to more refined problems of good data models and consistent well understood semantics. Finally, concentration can shift to more sophisticated processing—better visualization, reporting, and algorithmic processing and prediction.\nIn my experience, most organizations have huge holes in the base of this pyramid—they lack reliable complete data flow—but want to jump directly to advanced data modeling techniques. This is completely backwards. [emphasis mine]\nVisually, it looks something like this:\nIn addition, before starting to build a data pipeline, one needs to ensure that the tracked system works as expected. For example, a buggy website is likely to produce weird metrics, which in turn would make the data processing, reporting and predictions unreliable. 
I completely agree with Jay’s point about needing to get the basis of the pyramid right before setting out to do “something with data” (which seems to be the desire of every company nowadays).\nThe general point is that it’s important to have realistic expectations about what can be obtained by data-driven algorithms and insights. These can only be as good as the underlying data, with the results always depending to a large degree on having a solid infrastructure. Not everything has to be perfect from the start (most things never will be), but some degree of robustness is required to avoid spending too many resources on things that would never work. Trying to apply the latest predictive models without a reliable data infrastructure is like driving a fancy car on broken roads – you’re unlikely to get very far.\n","wordCount":"462","inLanguage":"en","image":"https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg","datePublished":"2014-08-17T13:09:30Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" 
width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Data’s hierarchy of needs</h1><div class=post-meta><span title='2014-08-17 13:09:30 +0000 UTC'>August 17, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-08-17-datas-hierarchy-of-needs/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg 794w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs.jpg alt width=794 height=582></figure><div class=post-content><p>One of my favourite blog posts in recent times is <a href=http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying>The Log: What every software engineer should know about real-time data&rsquo;s unifying abstraction</a> by Jay Kreps. That post comprehensively describes how abstracting all the data produced by LinkedIn&rsquo;s various components into a single log pipeline greatly simplified their architecture and enabled advanced data-driven applications. Among the various technical details there are some beautifully-articulated business insights. My favourite one defines data&rsquo;s hierarchy of needs:</p><blockquote><p>Effective use of data follows a kind of <a href=http://en.wikipedia.org/wiki/Maslow%27s_hierarchy_of_needs target=_blank rel=noopener>Maslow&rsquo;s hierarchy of needs</a>. The base of the pyramid involves capturing all the relevant data, being able to put it together in an applicable processing environment (be that a fancy real-time query system or just text files and python scripts). This data needs to be modeled in a uniform way to make it easy to read and process. 
Once these basic needs of capturing data in a uniform way are taken care of it is reasonable to work on infrastructure to process this data in various ways—MapReduce, real-time query systems, etc.</p><p>It&rsquo;s worth noting the obvious: without a reliable and complete data flow, a Hadoop cluster is little more than a very expensive and difficult to assemble space heater. Once data and processing are available, one can move concern on to more refined problems of good data models and consistent well understood semantics. Finally, concentration can shift to more sophisticated processing—better visualization, reporting, and algorithmic processing and prediction.</p><p><strong>In my experience, most organizations have huge holes in the base of this pyramid—they lack reliable complete data flow—but want to jump directly to advanced data modeling techniques. This is completely backwards.</strong> [emphasis mine]</p></blockquote><p>Visually, it looks something like this:</p><figure><a href=datas-hierarchy-of-needs.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_360x0_resize_q75_box.jpg 360w,
 https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_480x0_resize_q75_box.jpg 480w,
 https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/datas-hierarchy-of-needs_huf4ee54c31df2e5d065ff2a11dd219264_126270_720x0_resize_q75_box.jpg 720w,
diff --git a/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html b/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html
index b81ac2915..ed9aab98c 100644
--- a/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html
+++ b/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Building a Bandcamp recommender system (part 1 – motivation) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Bandcamp,BCRecommender,music,music industry,recommender systems"><meta name=description content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building a Bandcamp recommender system (part 1 – motivation)"><meta property="og:description" content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"><meta property="og:image" content="https://yanirseroussi.com/bcrecommender-screenshot.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-08-30T08:11:38+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bcrecommender-screenshot.png"><meta name=twitter:title content="Building a Bandcamp recommender system (part 1 – motivation)"><meta name=twitter:description content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building a Bandcamp recommender system (part 1 – motivation)","item":"https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building a Bandcamp recommender system (part 1 – motivation)","name":"Building a Bandcamp recommender system (part 1 – motivation)","description":"My motivation behind building BCRecommender, a free recommendation \u0026amp; discovery service for Bandcamp music.","keywords":["Bandcamp","BCRecommender","music","music industry","recommender systems"],"articleBody":"I’ve been a Bandcamp user for a few years now. I love the fact that they pay out a significant share of the revenue directly to the artists, unlike other services. 
In addition, despite the fact that fans may stream all the music for free and even easily rip it, almost $80M were paid out to artists through Bandcamp to date (including almost $3M in the last month) – serving as strong evidence that the traditional music industry’s fight against piracy is a waste of resources and time.\nOne thing I’ve been struggling with since starting to use Bandcamp is the discovery of new music. Originally (in 2011), I used the browse-by-tag feature, but it is often too broad to find music that I like. A newer feature is the Discoverinator, which is meant to emulate the experience of browsing through covers at a record store – sadly, I could never find much stuff I liked using that method. Last year, Bandcamp announced Bandcamp for fans, which includes the ability to wishlist items and discover new music by stalking/following other fans. In addition, they released a mobile app, which made the music purchased on Bandcamp much easier to access.\nAll these new features definitely increased my engagement and helped me find more stuff to listen to, but I still feel that Bandcamp music discovery could be much better. Specifically, I would love to be served personalised recommendations and be able to browse music that is similar to specific tracks and albums that I like. Rather than waiting for Bandcamp to implement these features, I decided to do it myself. Visit BCRecommender – Bandcamp recommendations based on your fan account to see where this effort stands at the moment.\nWhile BCRecommender has already helped me discover new music to add to my collection, building it gave me many more ideas on how it can be improved, so it’s definitely a work in progress. I’ll probably tinker with the underlying algorithms as I go, so recommendations may occasionally seem weird (but this always seems to be the case with recommender systems in the real world). 
In subsequent posts I’ll discuss some of the technical details and where I’d like to take this project.\nIt’s probably worth noting that BCRecommender is not associated with or endorsed by Bandcamp, but I doubt they would mind since it was built using publicly-available information, and is full of links to buy the music back on their site.\n","wordCount":"411","inLanguage":"en","image":"https://yanirseroussi.com/bcrecommender-screenshot.png","datePublished":"2014-08-30T08:11:38Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" 
x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building a Bandcamp recommender system (part 1 – motivation)</h1><div class=post-meta><span title='2014-08-30 08:11:38 +0000 UTC'>August 30, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-08-30-building-a-bandcamp-recommender-system-part-1-motivation/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_1080x0_resize_box_3.png 1080w 
,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png 1581w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png alt width=1581 height=821></figure><div class=post-content><p>I&rsquo;ve been a <a href=http://bandcamp.com target=_blank rel=noopener>Bandcamp</a> user for a few years now. I love the fact that they pay out a <a href=https://bandcamp.com/pricing target=_blank rel=noopener>significant share of the revenue</a> directly to the artists, unlike <a href=https://en.wikipedia.org/wiki/Spotify#Criticism target=_blank rel=noopener>other services</a>. In addition, despite the fact that fans may stream all the music for free and even <a href=https://bandcamp.com/help/audio_basics#steal target=_blank rel=noopener>easily rip it</a>, almost $80M were paid out to artists through Bandcamp to date (including almost $3M in the last month) – serving as strong evidence that the traditional music industry&rsquo;s fight against piracy is a waste of resources and time.</p><p>One thing I&rsquo;ve been struggling with since starting to use Bandcamp is the discovery of new music. Originally (in 2011), I used the <a href=https://bandcamp.com/tags target=_blank rel=noopener>browse-by-tag</a> feature, but it is often too broad to find music that I like. 
A newer feature is the <a href=https://bandcamp.com/discover target=_blank rel=noopener>Discoverinator</a>, which is meant to emulate the experience of <a href=http://blog.bandcamp.com/2012/06/07/behold-the-glory-of-the-discoverinator/ target=_blank rel=noopener>browsing through covers at a record store</a> – sadly, I could never find much stuff I liked using that method. Last year, Bandcamp announced <a href=http://blog.bandcamp.com/2013/01/10/bandcamp-for-fans/ target=_blank rel=noopener>Bandcamp for fans</a>, which includes the ability to wishlist items and discover new music by stalking/following other fans. In addition, they released a <a href=http://blog.bandcamp.com/2013/10/25/its-over/ target=_blank rel=noopener>mobile app</a>, which made the music purchased on Bandcamp much easier to access.</p><p>All these new features definitely increased my engagement and helped me find more stuff to listen to, but I still feel that Bandcamp music discovery could be much better. Specifically, I would love to be served personalised recommendations and be able to browse music that is similar to specific tracks and albums that I like. Rather than waiting for Bandcamp to implement these features, I decided to do it myself. Visit <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – Bandcamp recommendations based on your fan account</a> to see where this effort stands at the moment.</p><p>While BCRecommender has already helped me discover new music to add to <a href=https://bandcamp.com/yanir target=_blank rel=noopener>my collection</a>, building it gave me many more ideas on how it can be improved, so it&rsquo;s definitely a work in progress. I&rsquo;ll probably tinker with the underlying algorithms as I go, so recommendations may occasionally seem weird (but this always seems to be the case with recommender systems in the real world). 
In subsequent posts I&rsquo;ll discuss some of the technical details and where I&rsquo;d like to take this project.</p><p><small><br>It&rsquo;s probably worth noting that BCRecommender is not associated with or endorsed by Bandcamp, but I doubt they would mind since it was built using publicly-available information, and is full of links to buy the music back on their site.<br></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/music/>music</a></li><li><a href=https://yanirseroussi.com/tags/music-industry/>music industry</a></li><li><a href=https://yanirseroussi.com/tags/recommender-systems/>recommender systems</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on x" href="https://x.com/intent/tweet/?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cmusic%2cmusicindustry%2crecommendersystems"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&amp;title=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;summary=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&title=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 
11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 
111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on whatsapp" href="https://api.whatsapp.com/send?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 
46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on telegram" href="https://telegram.me/share/url?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on ycombinator" href="https://news.ycombinator.com/submitlink?t=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 
292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Bandcamp,BCRecommender,music,music industry,recommender systems"><meta name=description content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building a Bandcamp recommender system (part 1 – motivation)"><meta property="og:description" content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"><meta property="og:image" content="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-08-30T08:11:38+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png"><meta name=twitter:title content="Building a Bandcamp recommender system (part 1 – motivation)"><meta name=twitter:description content="My motivation behind building BCRecommender, a free recommendation & discovery service for Bandcamp music."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building a Bandcamp recommender system (part 1 – motivation)","item":"https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building a Bandcamp recommender system (part 1 – motivation)","name":"Building a Bandcamp recommender system (part 1 – motivation)","description":"My motivation behind building BCRecommender, a free recommendation \u0026amp; discovery service for Bandcamp music.","keywords":["Bandcamp","BCRecommender","music","music industry","recommender systems"],"articleBody":"I’ve been a Bandcamp user for a few years now. 
I love the fact that they pay out a significant share of the revenue directly to the artists, unlike other services. In addition, despite the fact that fans may stream all the music for free and even easily rip it, almost $80M were paid out to artists through Bandcamp to date (including almost $3M in the last month) – serving as strong evidence that the traditional music industry’s fight against piracy is a waste of resources and time.\nOne thing I’ve been struggling with since starting to use Bandcamp is the discovery of new music. Originally (in 2011), I used the browse-by-tag feature, but it is often too broad to find music that I like. A newer feature is the Discoverinator, which is meant to emulate the experience of browsing through covers at a record store – sadly, I could never find much stuff I liked using that method. Last year, Bandcamp announced Bandcamp for fans, which includes the ability to wishlist items and discover new music by stalking/following other fans. In addition, they released a mobile app, which made the music purchased on Bandcamp much easier to access.\nAll these new features definitely increased my engagement and helped me find more stuff to listen to, but I still feel that Bandcamp music discovery could be much better. Specifically, I would love to be served personalised recommendations and be able to browse music that is similar to specific tracks and albums that I like. Rather than waiting for Bandcamp to implement these features, I decided to do it myself. Visit BCRecommender – Bandcamp recommendations based on your fan account to see where this effort stands at the moment.\nWhile BCRecommender has already helped me discover new music to add to my collection, building it gave me many more ideas on how it can be improved, so it’s definitely a work in progress. 
I’ll probably tinker with the underlying algorithms as I go, so recommendations may occasionally seem weird (but this always seems to be the case with recommender systems in the real world). In subsequent posts I’ll discuss some of the technical details and where I’d like to take this project.\nIt’s probably worth noting that BCRecommender is not associated with or endorsed by Bandcamp, but I doubt they would mind since it was built using publicly-available information, and is full of links to buy the music back on their site.\n","wordCount":"411","inLanguage":"en","image":"https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png","datePublished":"2014-08-30T08:11:38Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" 
xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building a Bandcamp recommender system (part 1 – motivation)</h1><div class=post-meta><span title='2014-08-30 08:11:38 +0000 UTC'>August 30, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-08-30-building-a-bandcamp-recommender-system-part-1-motivation/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_720x0_resize_box_3.png 720w 
,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot_hu0bc6edb14393435331a10ae51f90dbe8_731004_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png 1581w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/bcrecommender-screenshot.png alt width=1581 height=821></figure><div class=post-content><p>I&rsquo;ve been a <a href=http://bandcamp.com target=_blank rel=noopener>Bandcamp</a> user for a few years now. I love the fact that they pay out a <a href=https://bandcamp.com/pricing target=_blank rel=noopener>significant share of the revenue</a> directly to the artists, unlike <a href=https://en.wikipedia.org/wiki/Spotify#Criticism target=_blank rel=noopener>other services</a>. In addition, despite the fact that fans may stream all the music for free and even <a href=https://bandcamp.com/help/audio_basics#steal target=_blank rel=noopener>easily rip it</a>, almost $80M were paid out to artists through Bandcamp to date (including almost $3M in the last month) – serving as strong evidence that the traditional music industry&rsquo;s fight against piracy is a waste of resources and time.</p><p>One thing I&rsquo;ve been struggling with since starting to use Bandcamp is the discovery of new music. Originally (in 2011), I used the <a href=https://bandcamp.com/tags target=_blank rel=noopener>browse-by-tag</a> feature, but it is often too broad to find music that I like. 
A newer feature is the <a href=https://bandcamp.com/discover target=_blank rel=noopener>Discoverinator</a>, which is meant to emulate the experience of <a href=http://blog.bandcamp.com/2012/06/07/behold-the-glory-of-the-discoverinator/ target=_blank rel=noopener>browsing through covers at a record store</a> – sadly, I could never find much stuff I liked using that method. Last year, Bandcamp announced <a href=http://blog.bandcamp.com/2013/01/10/bandcamp-for-fans/ target=_blank rel=noopener>Bandcamp for fans</a>, which includes the ability to wishlist items and discover new music by stalking/following other fans. In addition, they released a <a href=http://blog.bandcamp.com/2013/10/25/its-over/ target=_blank rel=noopener>mobile app</a>, which made the music purchased on Bandcamp much easier to access.</p><p>All these new features definitely increased my engagement and helped me find more stuff to listen to, but I still feel that Bandcamp music discovery could be much better. Specifically, I would love to be served personalised recommendations and be able to browse music that is similar to specific tracks and albums that I like. Rather than waiting for Bandcamp to implement these features, I decided to do it myself. Visit <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – Bandcamp recommendations based on your fan account</a> to see where this effort stands at the moment.</p><p>While BCRecommender has already helped me discover new music to add to <a href=https://bandcamp.com/yanir target=_blank rel=noopener>my collection</a>, building it gave me many more ideas on how it can be improved, so it&rsquo;s definitely a work in progress. I&rsquo;ll probably tinker with the underlying algorithms as I go, so recommendations may occasionally seem weird (but this always seems to be the case with recommender systems in the real world). 
In subsequent posts I&rsquo;ll discuss some of the technical details and where I&rsquo;d like to take this project.</p><p><small><br>It&rsquo;s probably worth noting that BCRecommender is not associated with or endorsed by Bandcamp, but I doubt they would mind since it was built using publicly-available information, and is full of links to buy the music back on their site.<br></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/music/>music</a></li><li><a href=https://yanirseroussi.com/tags/music-industry/>music industry</a></li><li><a href=https://yanirseroussi.com/tags/recommender-systems/>recommender systems</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on x" href="https://x.com/intent/tweet/?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cmusic%2cmusicindustry%2crecommendersystems"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&amp;title=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;summary=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f&title=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 
11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 
111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on whatsapp" href="https://api.whatsapp.com/send?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 
46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on telegram" href="https://telegram.me/share/url?text=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a Bandcamp recommender system (part 1 – motivation) on ycombinator" href="https://news.ycombinator.com/submitlink?t=Building%20a%20Bandcamp%20recommender%20system%20%28part%201%20%e2%80%93%20motivation%29&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f08%2f30%2fbuilding-a-bandcamp-recommender-system-part-1-motivation%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 
292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-6240><div class=comment-header><a href=#comment-6240><img class=comment-avatar src="https://www.gravatar.com/avatar/fa3df43fa529c8416bfe8b8831fe6bd0?s=50"><p class=comment-info><strong>Clément</strong><br><small>2019-02-08 15:43:50</small></p></a></div><div class="comment-body post-content"><p>Hi!</p><p>I just found these articles a few years after their publication&mldr;
 I saw that the BCRecommender seems not active anymore and that the last post is from 2015.</p><p>Any update?
 I&rsquo;m interested to have your feedback.</p><p>Thanks,</p><p>Clément</p></div></div><div class=comment-level-1 id=comment-6287><div class=comment-header><a href=#comment-6287><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2019-02-08 22:14:39</small></p></a></div><div class="comment-body post-content">Hi Clément, there&rsquo;s an update here: <a href=https://yanirseroussi.com/state-of-bandcamp-recommender-september-2017/>https://yanirseroussi.com/state-of-bandcamp-recommender-september-2017/</a></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
diff --git a/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html b/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html
index 4f2c2be13..e30330208 100644
--- a/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html
+++ b/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Bandcamp,BCRecommender,DevOps,recommender systems,software engineering"><meta name=description content="Iterating on my BCRecommender service with the goal of keeping costs low while providing a valuable music recommendation service."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)"><meta property="og:description" content="Iterating on my BCRecommender service with the goal of keeping 
costs low while providing a valuable music recommendation service."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"><meta property="og:image" content="https://yanirseroussi.com/bcrecommender-architecture.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-09-07T10:48:44+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bcrecommender-architecture.png"><meta name=twitter:title content="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)"><meta name=twitter:description content="Iterating on my BCRecommender service with the goal of keeping costs low while providing a valuable music recommendation service."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","item":"https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","name":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","description":"Iterating on my BCRecommender service with the goal of keeping costs low while providing a valuable music recommendation service.","keywords":["Bandcamp","BCRecommender","DevOps","recommender systems","software 
engineering"],"articleBody":"This is the second part of a series of posts on my BCRecommender – personalised Bandcamp recommendations project. Check out the first part for the general motivation behind this project.\nBCRecommender is a hobby project whose main goal is to help me find music I like on Bandcamp. Its secondary goal is to serve as a testing ground for ideas I have and things I’d like to explore.\nOne question I’ve been wondering about is: how much money does one need to spend on infrastructure for a simple web-based product before it reaches meaningful traffic?\nThe answer is: not much at all. It can easily be done for less than $1 per month.\nThis post discusses my exploration of this question by describing the main components of the BCRecommender system, without getting into the algorithms that drive it (which will be covered in subsequent posts).\nThe general flow of BCRecommender is fairly simple: crawl publicly-available data from Bandcamp (fan collections and tracks/albums = tralbums), generate recommendations based on this data (static lists of tralbums indexed by fan for personalised recommendations and by tralbum for similarity), and present the recommendations to users in a way that’s easy to browse and explore (since we’re dealing with music it must be playable, which is easy to achieve by embedding Bandcamp’s iframes).\nFirst iteration: Django \u0026 AWS The first iteration of the project was implemented as a Django project. Having never built a Django project from scratch, I figured this would be a good way to learn how it’s done properly. One thing I was keen on learning was using the Django ORM with an SQL database (in the past I’ve worked with Django and MongoDB). This ended up working less smoothly than I expected, perhaps because I’m too used to MongoDB, or because SQL forces you to model your data in unnatural ways, or because I insisted on using SQLite for simplicity. 
Whatever it was, I quickly started missing MongoDB, despite its flaws.\nI chose AWS for hosting because my personal account was under the free tier, and using a micro instance is more than enough for serving a website with no traffic. I considered Google App Engine with its indefinite free tier, but after reading the docs I realised I don’t want to jump through so many hoops to use their system – Google’s free tier was likely to cost too much in pain and time.\nWhile an AWS micro instance is enough for serving the recommendations, it’s not enough for generating them. Rather than paying Amazon for another instance, I figured that using spare capacity on my own laptop (quad-core with 16GB of RAM) would be good enough. So the backend worker for BCRecommender ended up being a local virtual machine using one core and 4GB of RAM.\nAfter some coding I had a nice setup in place:\nAWS webserver running Django with SQLite as the database layer and a simple frontend, styled with Bootstrap Local backend worker running Celery under Supervisor to collect the data (with errors reported to a dedicated Gmail account), Dropbox for backups, and Django management commands to generate the recommendations Code and issue tracker hosted on Bitbucket (which provides free private repositories) Fabric scripts for deployments to the AWS webserver and the local backend worker (including database sync as one big SQLite file) Local virtual machine for development (provisioned with Vagrant) This system wasn’t going to scale, but I didn’t care. I just used it to discover new music, and it worked. I didn’t even bother registering a domain name, so it was all running for free.\nSecond iteration: “Django” backend \u0026 Parse A few months ago, Facebook announced that Parse’s free tier will include 30 requests / second. That’s over 2.5 million requests per day, which is quite a lot – probably enough to run the majority of websites on the internet. 
It seemed too good to be true, so I had to try it myself.\nIt took a few hours to convert the Django webserver/frontend code to Parse. This was fairly straightforward, and it had the added advantages of getting rid of some deployment scripts and having a more solid development environment. Parse supplies a command-line tool for deployment that constantly syncs the code to an app that is identical to the production app – much better than the Fabric script I had.\nThe disadvantages of the move to Parse were having to rewrite some of the backend in JavaScript (= less readable than Python), and a more complex data sync command (no longer just copying a big SQLite file). However, I would definitely use it for other projects because of the generous free tier, the availability of APIs for all major platforms, and the elimination of most operational concerns.\nCurrent iteration: Goodbye Django, hello BCRecommender With the Django webserver out of the way, there was little use left for Django in the project. It took a few more hours to get rid of it, replacing the management commands with Commandr, and the SQLite database with MongoDB (wrapped with the excellent MongoEngine, which has matured a lot in recent years). MongoDB has become a more natural choice now, since it is the database used by Parse. I expect this setup of a local Python backend and a Parse frontend to work quite well (and remain virtually free) for the foreseeable future.\nThe only fixed cost I now have comes from registering the bcrecommender.com domain and managing it with Route 53. This wasn’t required when I was running it only for myself, and I could have just kept it under bcrecommender.parseapp.com, but I think it would be useful for other Bandcamp users. I would also like to use it as a training lab to improve my (poor) marketing skills – not having a dedicated domain just looks bad.\nIn summary, it’s definitely possible to build simple projects and host them for free. 
It also looks like my approach would scale way beyond the current BCRecommender volume. The next post in this series will cover some of the algorithms and general considerations of building the recommender system.\n","wordCount":"1021","inLanguage":"en","image":"https://yanirseroussi.com/bcrecommender-architecture.png","datePublished":"2014-09-07T10:48:44Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" 
y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)</h1><div class=post-meta><span title='2014-09-07 10:48:44 +0000 UTC'>September 7, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-09-07-building-a-recommender-system-on-a-shoestring-budget/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png 1176w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png alt width=1176 height=526></figure><div class=post-content><p class=intro-note>This is the second part of a series of posts on my <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – personalised Bandcamp recommendations</a> project. Check out <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the first part</a> for the general motivation behind this project.</p><p><a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender</a> is a hobby project whose main goal is to help me find music I like on <a href=https://bandcamp.com target=_blank rel=noopener>Bandcamp</a>. Its secondary goal is to serve as a testing ground for ideas I have and things I&rsquo;d like to explore.<br>One question I&rsquo;ve been wondering about is: how much money does one need to spend on infrastructure for a simple web-based product before it reaches meaningful traffic?<br>The answer is: not much at all. 
It can easily be done for less than $1 per month.<br>This post discusses my exploration of this question by describing the main components of the BCRecommender system, without getting into the algorithms that drive it (which will be covered in subsequent posts).</p><p>The general flow of BCRecommender is fairly simple: crawl publicly-available data from Bandcamp (fan collections and tracks/albums = tralbums), generate recommendations based on this data (static lists of tralbums indexed by fan for personalised recommendations and by tralbum for similarity), and present the recommendations to users in a way that&rsquo;s easy to browse and explore (since we&rsquo;re dealing with music it must be playable, which is easy to achieve by embedding Bandcamp&rsquo;s iframes).</p><h3 id=first-iteration-django--aws>First iteration: Django & AWS<a hidden class=anchor aria-hidden=true href=#first-iteration-django--aws>#</a></h3><p>The first iteration of the project was implemented as a <a href=https://www.djangoproject.com/ target=_blank rel=noopener>Django</a> project. Having never built a Django project from scratch, I figured this would be a good way to learn how it&rsquo;s done properly. One thing I was keen on learning was using the Django ORM with an SQL database (in the past I&rsquo;ve worked with Django and <a href=https://www.mongodb.org/ target=_blank rel=noopener>MongoDB</a>). This ended up working less smoothly than I expected, perhaps because I&rsquo;m too used to MongoDB, or because SQL forces you to model your data in unnatural ways, or because I insisted on using <a href=https://sqlite.org/ target=_blank rel=noopener>SQLite</a> for simplicity. Whatever it was, I quickly started missing MongoDB, despite its flaws.</p><p>I chose <a href=https://aws.amazon.com/ target=_blank rel=noopener>AWS</a> for hosting because my personal account was under the free tier, and using a micro instance is more than enough for serving a website with no traffic. 
I considered <a href=https://developers.google.com/appengine/ target=_blank rel=noopener>Google App Engine</a> with its indefinite free tier, but after reading the docs I realised I don&rsquo;t want to jump through so many hoops to use their system – Google&rsquo;s free tier was likely to cost too much in pain and time.</p><p>While an AWS micro instance is enough for <em>serving</em> the recommendations, it&rsquo;s not enough for generating them. Rather than paying Amazon for another instance, I figured that using spare capacity on my own laptop (quad-core with 16GB of RAM) would be good enough. So the backend worker for BCRecommender ended up being a local virtual machine using one core and 4GB of RAM.</p><p>After some coding I had a nice setup in place:</p><ul><li>AWS webserver running Django with SQLite as the database layer and a simple frontend, styled with <a href=http://getbootstrap.com/ target=_blank rel=noopener>Bootstrap</a></li><li>Local backend worker running <a href=http://www.celeryproject.org/ target=_blank rel=noopener>Celery</a> under <a href=http://supervisord.org/ target=_blank rel=noopener>Supervisor</a> to collect the data (with errors reported to a dedicated Gmail account), Dropbox for backups, and Django management commands to generate the recommendations</li><li>Code and issue tracker hosted on <a href=https://bitbucket.org/ target=_blank rel=noopener>Bitbucket</a> (which provides free private repositories)</li><li><a href=http://www.fabfile.org/ target=_blank rel=noopener>Fabric</a> scripts for deployments to the AWS webserver and the local backend worker (including database sync as one big SQLite file)</li><li>Local virtual machine for development (provisioned with <a href=http://www.vagrantup.com/ target=_blank rel=noopener>Vagrant</a>)</li></ul><p>This system wasn&rsquo;t going to scale, but I didn&rsquo;t care. I just used it to discover new music, and it worked. 
I didn&rsquo;t even bother registering a domain name, so it was all running for free.</p><h3 id=second-iteration-django-backend--parse>Second iteration: &ldquo;Django&rdquo; backend & Parse<a hidden class=anchor aria-hidden=true href=#second-iteration-django-backend--parse>#</a></h3><p>A few months ago, <a href=http://blog.parse.com/2014/04/30/parse-pricing-now-cheaper-and-simpler/ target=_blank rel=noopener>Facebook announced that Parse&rsquo;s free tier will include 30 requests / second</a>. That&rsquo;s over 2.5 million requests per day, which is quite a lot – probably enough to run the majority of websites on the internet. It seemed too good to be true, so I had to try it myself.</p><p>It took a few hours to convert the Django webserver/frontend code to Parse. This was fairly straightforward, and it had the added advantages of getting rid of some deployment scripts and having a more solid development environment. Parse supplies a command-line tool for deployment that constantly syncs the code to an app that is identical to the production app – much better than the Fabric script I had.</p><p>The disadvantages of the move to Parse were having to rewrite some of the backend in JavaScript (= less readable than Python), and a more complex data sync command (no longer just copying a big SQLite file). However, I would definitely use it for other projects because of the generous free tier, the availability of APIs for all major platforms, and the elimination of most operational concerns.</p><h3 id=current-iteration-goodbye-django-hello-bcrecommender>Current iteration: Goodbye Django, hello BCRecommender<a hidden class=anchor aria-hidden=true href=#current-iteration-goodbye-django-hello-bcrecommender>#</a></h3><p>With the Django webserver out of the way, there was little use left for Django in the project. 
It took a few more hours to get rid of it, replacing the management commands with <a href=https://github.com/tellapart/commandr target=_blank rel=noopener>Commandr</a>, and the SQLite database with MongoDB (wrapped with the excellent <a href=http://mongoengine.org/ target=_blank rel=noopener>MongoEngine</a>, which has matured a lot in recent years). MongoDB has become a more natural choice now, since it is the database used by Parse. I expect this setup of a local Python backend and a Parse frontend to work quite well (and remain virtually free) for the foreseeable future.</p><p>The only fixed cost I now have comes from registering the <a href=http://www.bcrecommender.com target=_blank rel=noopener>bcrecommender.com domain</a> and managing it with Route 53. This wasn&rsquo;t required when I was running it only for myself, and I could have just kept it under bcrecommender.parseapp.com, but I think it would be useful for other Bandcamp users. I would also like to use it as a training lab to improve my (poor) marketing skills – not having a dedicated domain just looks bad.</p><p>In summary, it&rsquo;s definitely possible to build simple projects and host them for free. It also looks like my approach would scale way beyond the current BCRecommender volume. 
The next post in this series will cover some of the algorithms and general considerations of building the recommender system.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/devops/>DevOps</a></li><li><a href=https://yanirseroussi.com/tags/recommender-systems/>recommender systems</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on x" href="https://x.com/intent/tweet/?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cDevOps%2crecommendersystems%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&amp;title=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;summary=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&title=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 
483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on whatsapp" href="https://api.whatsapp.com/send?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 
4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on telegram" href="https://telegram.me/share/url?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on ycombinator" 
href="https://news.ycombinator.com/submitlink?t=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Bandcamp,BCRecommender,DevOps,recommender systems,software engineering"><meta name=description content="Iterating on my BCRecommender service with the goal of keeping costs low while providing a valuable music recommendation service."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)"><meta property="og:description" content="Iterating on my BCRecommender service with the goal of keeping 
costs low while providing a valuable music recommendation service."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"><meta property="og:image" content="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-09-07T10:48:44+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png"><meta name=twitter:title content="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)"><meta name=twitter:description content="Iterating on my BCRecommender service with the goal of keeping costs low while providing a valuable music recommendation service."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","item":"https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","name":"Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)","description":"Iterating on my BCRecommender service with the goal of keeping costs low while providing a 
valuable music recommendation service.","keywords":["Bandcamp","BCRecommender","DevOps","recommender systems","software engineering"],"articleBody":"This is the second part of a series of posts on my BCRecommender – personalised Bandcamp recommendations project. Check out the first part for the general motivation behind this project.\nBCRecommender is a hobby project whose main goal is to help me find music I like on Bandcamp. Its secondary goal is to serve as a testing ground for ideas I have and things I’d like to explore.\nOne question I’ve been wondering about is: how much money does one need to spend on infrastructure for a simple web-based product before it reaches meaningful traffic?\nThe answer is: not much at all. It can easily be done for less than $1 per month.\nThis post discusses my exploration of this question by describing the main components of the BCRecommender system, without getting into the algorithms that drive it (which will be covered in subsequent posts).\nThe general flow of BCRecommender is fairly simple: crawl publicly-available data from Bandcamp (fan collections and tracks/albums = tralbums), generate recommendations based on this data (static lists of tralbums indexed by fan for personalised recommendations and by tralbum for similarity), and present the recommendations to users in a way that’s easy to browse and explore (since we’re dealing with music it must be playable, which is easy to achieve by embedding Bandcamp’s iframes).\nFirst iteration: Django \u0026 AWS The first iteration of the project was implemented as a Django project. Having never built a Django project from scratch, I figured this would be a good way to learn how it’s done properly. One thing I was keen on learning was using the Django ORM with an SQL database (in the past I’ve worked with Django and MongoDB). 
This ended up working less smoothly than I expected, perhaps because I’m too used to MongoDB, or because SQL forces you to model your data in unnatural ways, or because I insisted on using SQLite for simplicity. Whatever it was, I quickly started missing MongoDB, despite its flaws.\nI chose AWS for hosting because my personal account was under the free tier, and using a micro instance is more than enough for serving a website with no traffic. I considered Google App Engine with its indefinite free tier, but after reading the docs I realised I don’t want to jump through so many hoops to use their system – Google’s free tier was likely to cost too much in pain and time.\nWhile an AWS micro instance is enough for serving the recommendations, it’s not enough for generating them. Rather than paying Amazon for another instance, I figured that using spare capacity on my own laptop (quad-core with 16GB of RAM) would be good enough. So the backend worker for BCRecommender ended up being a local virtual machine using one core and 4GB of RAM.\nAfter some coding I had a nice setup in place:\nAWS webserver running Django with SQLite as the database layer and a simple frontend, styled with Bootstrap Local backend worker running Celery under Supervisor to collect the data (with errors reported to a dedicated Gmail account), Dropbox for backups, and Django management commands to generate the recommendations Code and issue tracker hosted on Bitbucket (which provides free private repositories) Fabric scripts for deployments to the AWS webserver and the local backend worker (including database sync as one big SQLite file) Local virtual machine for development (provisioned with Vagrant) This system wasn’t going to scale, but I didn’t care. I just used it to discover new music, and it worked. 
I didn’t even bother registering a domain name, so it was all running for free.\nSecond iteration: “Django” backend \u0026 Parse A few months ago, Facebook announced that Parse’s free tier will include 30 requests / second. That’s over 2.5 million requests per day, which is quite a lot – probably enough to run the majority of websites on the internet. It seemed too good to be true, so I had to try it myself.\nIt took a few hours to convert the Django webserver/frontend code to Parse. This was fairly straightforward, and it had the added advantages of getting rid of some deployment scripts and having a more solid development environment. Parse supplies a command-line tool for deployment that constantly syncs the code to an app that is identical to the production app – much better than the Fabric script I had.\nThe disadvantages of the move to Parse were having to rewrite some of the backend in JavaScript (= less readable than Python), and a more complex data sync command (no longer just copying a big SQLite file). However, I would definitely use it for other projects because of the generous free tier, the availability of APIs for all major platforms, and the elimination of most operational concerns.\nCurrent iteration: Goodbye Django, hello BCRecommender With the Django webserver out of the way, there was little use left for Django in the project. It took a few more hours to get rid of it, replacing the management commands with Commandr, and the SQLite database with MongoDB (wrapped with the excellent MongoEngine, which has matured a lot in recent years). MongoDB has become a more natural choice now, since it is the database used by Parse. I expect this setup of a local Python backend and a Parse frontend to work quite well (and remain virtually free) for the foreseeable future.\nThe only fixed cost I now have comes from registering the bcrecommender.com domain and managing it with Route 53. 
This wasn’t required when I was running it only for myself, and I could have just kept it under bcrecommender.parseapp.com, but I think it would be useful for other Bandcamp users. I would also like to use it as a training lab to improve my (poor) marketing skills – not having a dedicated domain just looks bad.\nIn summary, it’s definitely possible to build simple projects and host them for free. It also looks like my approach would scale way beyond the current BCRecommender volume. The next post in this series will cover some of the algorithms and general considerations of building the recommender system.\n","wordCount":"1021","inLanguage":"en","image":"https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png","datePublished":"2014-09-07T10:48:44Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path 
d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)</h1><div class=post-meta><span title='2014-09-07 10:48:44 +0000 UTC'>September 7, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-09-07-building-a-recommender-system-on-a-shoestring-budget/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_480x0_resize_box_3.png 480w 
,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture_hu30771a1e5f4a580acd5b458b23f57625_45736_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png 1176w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/bcrecommender-architecture.png alt width=1176 height=526></figure><div class=post-content><p class=intro-note>This is the second part of a series of posts on my <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – personalised Bandcamp recommendations</a> project. Check out <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the first part</a> for the general motivation behind this project.</p><p><a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender</a> is a hobby project whose main goal is to help me find music I like on <a href=https://bandcamp.com target=_blank rel=noopener>Bandcamp</a>. Its secondary goal is to serve as a testing ground for ideas I have and things I&rsquo;d like to explore.<br>One question I&rsquo;ve been wondering about is: how much money does one need to spend on infrastructure for a simple web-based product before it reaches meaningful traffic?<br>The answer is: not much at all. 
It can easily be done for less than $1 per month.<br>This post discusses my exploration of this question by describing the main components of the BCRecommender system, without getting into the algorithms that drive it (which will be covered in subsequent posts).</p><p>The general flow of BCRecommender is fairly simple: crawl publicly-available data from Bandcamp (fan collections and tracks/albums = tralbums), generate recommendations based on this data (static lists of tralbums indexed by fan for personalised recommendations and by tralbum for similarity), and present the recommendations to users in a way that&rsquo;s easy to browse and explore (since we&rsquo;re dealing with music it must be playable, which is easy to achieve by embedding Bandcamp&rsquo;s iframes).</p><h3 id=first-iteration-django--aws>First iteration: Django & AWS<a hidden class=anchor aria-hidden=true href=#first-iteration-django--aws>#</a></h3><p>The first iteration of the project was implemented as a <a href=https://www.djangoproject.com/ target=_blank rel=noopener>Django</a> project. Having never built a Django project from scratch, I figured this would be a good way to learn how it&rsquo;s done properly. One thing I was keen on learning was using the Django ORM with an SQL database (in the past I&rsquo;ve worked with Django and <a href=https://www.mongodb.org/ target=_blank rel=noopener>MongoDB</a>). This ended up working less smoothly than I expected, perhaps because I&rsquo;m too used to MongoDB, or because SQL forces you to model your data in unnatural ways, or because I insisted on using <a href=https://sqlite.org/ target=_blank rel=noopener>SQLite</a> for simplicity. Whatever it was, I quickly started missing MongoDB, despite its flaws.</p><p>I chose <a href=https://aws.amazon.com/ target=_blank rel=noopener>AWS</a> for hosting because my personal account was under the free tier, and using a micro instance is more than enough for serving a website with no traffic. 
I considered <a href=https://developers.google.com/appengine/ target=_blank rel=noopener>Google App Engine</a> with its indefinite free tier, but after reading the docs I realised I don&rsquo;t want to jump through so many hoops to use their system – Google&rsquo;s free tier was likely to cost too much in pain and time.</p><p>While an AWS micro instance is enough for <em>serving</em> the recommendations, it&rsquo;s not enough for generating them. Rather than paying Amazon for another instance, I figured that using spare capacity on my own laptop (quad-core with 16GB of RAM) would be good enough. So the backend worker for BCRecommender ended up being a local virtual machine using one core and 4GB of RAM.</p><p>After some coding I had a nice setup in place:</p><ul><li>AWS webserver running Django with SQLite as the database layer and a simple frontend, styled with <a href=http://getbootstrap.com/ target=_blank rel=noopener>Bootstrap</a></li><li>Local backend worker running <a href=http://www.celeryproject.org/ target=_blank rel=noopener>Celery</a> under <a href=http://supervisord.org/ target=_blank rel=noopener>Supervisor</a> to collect the data (with errors reported to a dedicated Gmail account), Dropbox for backups, and Django management commands to generate the recommendations</li><li>Code and issue tracker hosted on <a href=https://bitbucket.org/ target=_blank rel=noopener>Bitbucket</a> (which provides free private repositories)</li><li><a href=http://www.fabfile.org/ target=_blank rel=noopener>Fabric</a> scripts for deployments to the AWS webserver and the local backend worker (including database sync as one big SQLite file)</li><li>Local virtual machine for development (provisioned with <a href=http://www.vagrantup.com/ target=_blank rel=noopener>Vagrant</a>)</li></ul><p>This system wasn&rsquo;t going to scale, but I didn&rsquo;t care. I just used it to discover new music, and it worked. 
I didn&rsquo;t even bother registering a domain name, so it was all running for free.</p><h3 id=second-iteration-django-backend--parse>Second iteration: &ldquo;Django&rdquo; backend & Parse<a hidden class=anchor aria-hidden=true href=#second-iteration-django-backend--parse>#</a></h3><p>A few months ago, <a href=http://blog.parse.com/2014/04/30/parse-pricing-now-cheaper-and-simpler/ target=_blank rel=noopener>Facebook announced that Parse&rsquo;s free tier will include 30 requests / second</a>. That&rsquo;s over 2.5 million requests per day, which is quite a lot – probably enough to run the majority of websites on the internet. It seemed too good to be true, so I had to try it myself.</p><p>It took a few hours to convert the Django webserver/frontend code to Parse. This was fairly straightforward, and it had the added advantages of getting rid of some deployment scripts and having a more solid development environment. Parse supplies a command-line tool for deployment that constantly syncs the code to an app that is identical to the production app – much better than the Fabric script I had.</p><p>The disadvantages of the move to Parse were having to rewrite some of the backend in JavaScript (= less readable than Python), and a more complex data sync command (no longer just copying a big SQLite file). However, I would definitely use it for other projects because of the generous free tier, the availability of APIs for all major platforms, and the elimination of most operational concerns.</p><h3 id=current-iteration-goodbye-django-hello-bcrecommender>Current iteration: Goodbye Django, hello BCRecommender<a hidden class=anchor aria-hidden=true href=#current-iteration-goodbye-django-hello-bcrecommender>#</a></h3><p>With the Django webserver out of the way, there was little use left for Django in the project. 
It took a few more hours to get rid of it, replacing the management commands with <a href=https://github.com/tellapart/commandr target=_blank rel=noopener>Commandr</a>, and the SQLite database with MongoDB (wrapped with the excellent <a href=http://mongoengine.org/ target=_blank rel=noopener>MongoEngine</a>, which has matured a lot in recent years). MongoDB has become a more natural choice now, since it is the database used by Parse. I expect this setup of a local Python backend and a Parse frontend to work quite well (and remain virtually free) for the foreseeable future.</p><p>The only fixed cost I now have comes from registering the <a href=http://www.bcrecommender.com target=_blank rel=noopener>bcrecommender.com domain</a> and managing it with Route 53. This wasn&rsquo;t required when I was running it only for myself, and I could have just kept it under bcrecommender.parseapp.com, but I think it would be useful for other Bandcamp users. I would also like to use it as a training lab to improve my (poor) marketing skills – not having a dedicated domain just looks bad.</p><p>In summary, it&rsquo;s definitely possible to build simple projects and host them for free. It also looks like my approach would scale way beyond the current BCRecommender volume. 
The next post in this series will cover some of the algorithms and general considerations of building the recommender system.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/devops/>DevOps</a></li><li><a href=https://yanirseroussi.com/tags/recommender-systems/>recommender systems</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on x" href="https://x.com/intent/tweet/?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cDevOps%2crecommendersystems%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&amp;title=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;summary=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f&title=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 
483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on whatsapp" href="https://api.whatsapp.com/send?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 
4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on telegram" href="https://telegram.me/share/url?text=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) on ycombinator" 
href="https://news.ycombinator.com/submitlink?t=Building%20a%20recommender%20system%20on%20a%20shoestring%20budget%20%28or%3a%20BCRecommender%20part%202%20%e2%80%93%20general%20system%20layout%29&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f09%2f07%2fbuilding-a-recommender-system-on-a-shoestring-budget%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html b/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html
index 79ea672f0..a410577d1 100644
--- a/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html
+++ b/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Greek Media Monitoring Kaggle competition: My approach | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,Kaggle,Kaggle competition,multi-label classification,predictive modelling"><meta name=description content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Greek Media Monitoring Kaggle competition: My approach"><meta property="og:description" content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 
teams."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"><meta property="og:image" content="https://yanirseroussi.com/wise2014-connected-components.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-10-07T03:21:35+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/wise2014-connected-components.png"><meta name=twitter:title content="Greek Media Monitoring Kaggle competition: My approach"><meta name=twitter:description content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Greek Media Monitoring Kaggle competition: My approach","item":"https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Greek Media Monitoring Kaggle competition: My approach","name":"Greek Media Monitoring Kaggle competition: My approach","description":"Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams.","keywords":["data science","Kaggle","Kaggle competition","multi-label classification","predictive modelling"],"articleBody":"A few months ago I participated in the Kaggle Greek Media Monitoring competition. The goal of the competition was doing multilabel classification of texts scanned from Greek print media. 
Despite not having much time due to travelling and other commitments, I managed to finish 6th (out of 120 teams). This post describes my approach to the problem.\nData \u0026 evaluation The data consists of articles scanned from Greek print media in May-September 2013. Due to copyright issues, the organisers didn’t make the original articles available – competitors only had access to normalised tf-idf representations of the texts. This limited the options for doing feature engineering and made it impossible to consider things like word order, but it made things somewhat simpler as the focus was on modelling due to inability to extract interesting features.\nOverall, there are about 65K texts in the training set and 35K in the test set, where the split is based on chronological ordering (i.e., the training articles were published before the test articles). Each article was manually labelled with one or more labels out of a set of 203 labels. For each test article, the goal is to infer its set of labels. Submissions were ranked using the mean F1 score.\nDespite being manually annotated, the data isn’t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten “articles” with a single word. Five of these articles have the word 68839, but each of these five was given a different label. Such issues are not unusual in Kaggle competitions or in real life, but they do limit the general usefulness of the results since any model built on this data would fit some noise.\nLocal validation setup As mentioned in previous posts (How to (almost) win Kaggle competitions and Kaggle beginner tips) having a solid local validation setup is very important. 
It ensures you don’t waste time on weak submissions, increases confidence in the models, and avoids leaking information about how well you’re doing.\nI used the first 35K training texts for local training and the following 30K texts for validation. While the article publication dates weren’t provided, I hoped that this would mimic the competition setup, where the test dataset consists of articles that were published after the articles in the training dataset. This seemed to work, as my local results were consistent with the leaderboard results. I’m pleased to report that this setup allowed me to have the lowest number of submissions of all the top-10 teams 🙂\nThings that worked I originally wanted to use this competition to play with deep learning through Python packages such as Theano and PyLearn2. However, as this was the first time I worked on a multilabel classification problem, I got sucked into reading a lot of papers on the topic and never got around to doing deep learning. Maybe next time…\nOne of my key discoveries was that if you define a graph where the vertices are labels and there’s an edge between two labels if they appear together in a document’s label set, then there are two main connected components of labels and several small ones with single labels (see figure below). It is possible to train a linear classifier that distinguishes between the components with very high accuracy (over 99%). This allowed me to improve performance by training different classifiers on each connected component.\nMy best submission ended up being a simple weighted linear combination of three models. All these models are hierarchical ensembles, where a linear classifier distinguishes between connected components, and the base models are trained on texts from a single connected component. 
These base models are:\nEnsemble of classifier chains (ECC) with linear classifiers (SGDClassifier from scikit-learn) trained for each label, using hinge loss and L1 penalty Same as 1, but with modified Huber loss A linear classifier with modified Huber loss and L1 penalty that predicts single label probabilities For each test document, each one of these base models yields a score for each label. These scores are weighted and thresholded to yield the final predictions.\nIt was interesting to learn that a relatively-simple model like ECC yields competitive results. The basic idea behind ECC is to combine different classifier chains. Each classifier chain is also an ensemble where each base classifier is trained to predict a single label. The input for each classifier in the chain depends on the output of preceding classifiers, so it encodes dependencies between labels. For example, if label 2 always appears with label 1 and the label 1 classifier precedes the label 2 classifier in the chain, the label 2 classifier is able to use this dependency information directly, which should increase its accuracy (though it is affected by misclassifications by the label 1 classifier). See Read et al.’s paper for a more in-depth explanation.\nAnother notable observation is that L1 penalty worked well, which is not too surprising when considering the fact that the dataset has 300K features and many of them are probably irrelevant to prediction (L1 penalty yields sparse models where many features get zero weight).\nThings that didn’t work As I was travelling, I didn’t have much time to work on this competition over its two final weeks (though this was a good way of passing the time on long flights). One thing that I tried was understanding some of the probabilistic classifier chain (PCC) code out there by porting it to Python, but the results were very disappointing, probably due to bugs in my code. 
I expected PCC to work well, especially with the extension for optimising the F-measure. Figuring out how to run the Java code would have probably been a better use of my time than porting the code to Python.\nI also played with reverse-engineering the features back to counts, but it was problematic since the feature values are normalised. It was disappointing that we weren’t at least given the bag of words representations. I also attempted to reduce the feature representation with latent Dirichlet allocation, but it didn’t perform well – possibly because I couldn’t get the correct word counts.\nConclusion Overall, this was a fun competition. Despite minor issues with the data and not having enough time to do everything I wanted to do, it was a great learning experience. From reading the summaries by the other teams, it appears that other competitors enjoyed it too. As always, I highly recommend Kaggle competitions to beginners who are trying to learn more about the field of data science and predictive modelling, and to more experienced data scientists who want to improve their skills.\n","wordCount":"1114","inLanguage":"en","image":"https://yanirseroussi.com/wise2014-connected-components.png","datePublished":"2014-10-07T03:21:35Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div 
class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Greek Media Monitoring Kaggle competition: My approach</h1><div class=post-meta><span title='2014-10-07 03:21:35 +0000 UTC'>October 7, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-10-07-greek-media-monitoring-kaggle-competition-my-approach/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager 
srcset="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png 769w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png alt width=769 height=527></figure><div class=post-content><p>A few months ago I participated in the <a href=http://www.kaggle.com/c/wise-2014 target=_blank rel=noopener>Kaggle Greek Media Monitoring competition</a>. The goal of the competition was doing <a href=https://en.wikipedia.org/wiki/Multi-label_classification target=_blank rel=noopener>multilabel classification</a> of texts scanned from Greek print media. Despite not having much time due to travelling and other commitments, I managed to finish 6th (out of 120 teams). This post describes my approach to the problem.</p><h3 id=data--evaluation>Data & evaluation<a hidden class=anchor aria-hidden=true href=#data--evaluation>#</a></h3><p>The data consists of articles scanned from Greek print media in May-September 2013. Due to copyright issues, the organisers didn&rsquo;t make the original articles available – competitors only had access to normalised <a href=https://en.wikipedia.org/wiki/Tf%E2%80%93idf target=_blank rel=noopener>tf-idf</a> representations of the texts. 
This limited the options for doing feature engineering and made it impossible to consider things like word order, but it made things somewhat simpler as the focus was on modelling due to inability to extract interesting features.</p><p>Overall, there are about 65K texts in the training set and 35K in the test set, where the split is based on chronological ordering (i.e., the training articles were published before the test articles). Each article was manually labelled with one or more labels out of a set of 203 labels. For each test article, the goal is to infer its set of labels. Submissions were ranked using the <a href=http://www.kaggle.com/c/wise-2014/details/evaluation target=_blank rel=noopener>mean F1 score</a>.</p><p>Despite being manually annotated, the data isn&rsquo;t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten &ldquo;articles&rdquo; with a single word. Five of these articles have the word 68839, but each of these five was given a different label. Such issues are not unusual in Kaggle competitions or in real life, but they do limit the general usefulness of the results since any model built on this data would fit some noise.</p><h3 id=local-validation-setup>Local validation setup<a hidden class=anchor aria-hidden=true href=#local-validation-setup>#</a></h3><p>As mentioned in previous posts (<a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>How to (almost) win Kaggle competitions</a> and <a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/>Kaggle beginner tips</a>) having a solid local validation setup is very important. It ensures you don&rsquo;t waste time on weak submissions, increases confidence in the models, and avoids leaking information about how well you&rsquo;re doing.</p><p>I used the first 35K training texts for local training and the following 30K texts for validation. 
While the article publication dates weren&rsquo;t provided, I hoped that this would mimic the competition setup, where the test dataset consists of articles that were published after the articles in the training dataset. This seemed to work, as my local results were consistent with the leaderboard results. I&rsquo;m pleased to report that this setup allowed me to have the lowest number of submissions of all the top-10 teams 🙂</p><h3 id=things-that-worked>Things that worked<a hidden class=anchor aria-hidden=true href=#things-that-worked>#</a></h3><p>I originally wanted to use this competition to play with deep learning through Python packages such as <a href=http://deeplearning.net/software/theano/ target=_blank rel=noopener>Theano</a> and <a href=http://deeplearning.net/software/pylearn2/ target=_blank rel=noopener>PyLearn2</a>. However, as this was the first time I worked on a multilabel classification problem, I got sucked into reading a lot of papers on the topic and never got around to doing deep learning. Maybe next time&mldr;</p><p>One of my key discoveries was that there if you define a graph where the vertices are labels and there&rsquo;s an edge between two labels if they appear together in a document&rsquo;s label set, then there are two main connected components of labels and several small ones with single labels (see figure below). It is possible to train a linear classifier that distinguishes between the components with very high accuracy (over 99%). This allowed me to improve performance by training different classifiers on each connected component.</p><figure><a href=wise2014-connected-components.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,Kaggle,Kaggle competition,multi-label classification,predictive modelling"><meta name=description content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Greek Media Monitoring Kaggle competition: My approach"><meta property="og:description" content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 
teams."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"><meta property="og:image" content="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-10-07T03:21:35+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png"><meta name=twitter:title content="Greek Media Monitoring Kaggle competition: My approach"><meta name=twitter:description content="Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Greek Media Monitoring Kaggle competition: My approach","item":"https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Greek Media Monitoring Kaggle competition: My approach","name":"Greek Media Monitoring Kaggle competition: My approach","description":"Summary of my approach to the Greek Media Monitoring Kaggle competition, where I finished 6th out of 120 teams.","keywords":["data science","Kaggle","Kaggle competition","multi-label classification","predictive modelling"],"articleBody":"A few months ago I participated in the Kaggle Greek Media Monitoring competition. 
The goal of the competition was doing multilabel classification of texts scanned from Greek print media. Despite not having much time due to travelling and other commitments, I managed to finish 6th (out of 120 teams). This post describes my approach to the problem.\nData \u0026 evaluation The data consists of articles scanned from Greek print media in May-September 2013. Due to copyright issues, the organisers didn’t make the original articles available – competitors only had access to normalised tf-idf representations of the texts. This limited the options for doing feature engineering and made it impossible to consider things like word order, but it made things somewhat simpler as the focus was on modelling due to inability to extract interesting features.\nOverall, there are about 65K texts in the training set and 35K in the test set, where the split is based on chronological ordering (i.e., the training articles were published before the test articles). Each article was manually labelled with one or more labels out of a set of 203 labels. For each test article, the goal is to infer its set of labels. Submissions were ranked using the mean F1 score.\nDespite being manually annotated, the data isn’t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten “articles” with a single word. Five of these articles have the word 68839, but each of these five was given a different label. Such issues are not unusual in Kaggle competitions or in real life, but they do limit the general usefulness of the results since any model built on this data would fit some noise.\nLocal validation setup As mentioned in previous posts (How to (almost) win Kaggle competitions and Kaggle beginner tips) having a solid local validation setup is very important. 
It ensures you don’t waste time on weak submissions, increases confidence in the models, and avoids leaking information about how well you’re doing.\nI used the first 35K training texts for local training and the following 30K texts for validation. While the article publication dates weren’t provided, I hoped that this would mimic the competition setup, where the test dataset consists of articles that were published after the articles in the training dataset. This seemed to work, as my local results were consistent with the leaderboard results. I’m pleased to report that this setup allowed me to have the lowest number of submissions of all the top-10 teams 🙂\nThings that worked I originally wanted to use this competition to play with deep learning through Python packages such as Theano and PyLearn2. However, as this was the first time I worked on a multilabel classification problem, I got sucked into reading a lot of papers on the topic and never got around to doing deep learning. Maybe next time…\nOne of my key discoveries was that there if you define a graph where the vertices are labels and there’s an edge between two labels if they appear together in a document’s label set, then there are two main connected components of labels and several small ones with single labels (see figure below). It is possible to train a linear classifier that distinguishes between the components with very high accuracy (over 99%). This allowed me to improve performance by training different classifiers on each connected component.\nMy best submission ended up being a simple weighted linear combination of three models. All these models are hierarchical ensembles, where a linear classifier distinguishes between connected components, and the base models are trained on texts from a single connected component. 
These base models are:\nEnsemble of classifier chains (ECC) with linear classifiers (SGDClassifier from scikit-learn) trained for each label, using hinge loss and L1 penalty Same as 1, but with modified Huber loss A linear classifier with modified Huber loss and L1 penalty that predicts single label probabilities For each test document, each one of these base models yields a score for each label. These scores are weighted and thresholded to yield the final predictions.\nIt was interesting to learn that a relatively-simple model like ECC yields competitive results. The basic idea behind ECC is to combine different classifier chains. Each classifier chain is also an ensemble where each base classifier is trained to predict a single label. The input for each classifier in the chain depends on the output of preceding classifiers, so it encodes dependencies between labels. For example, if label 2 always appears with label 1 and the label 1 classifier precedes the label 2 classifier in the chain, the label 2 classifier is able to use this dependency information directly, which should increase its accuracy (though it is affected by misclassifications by the label 1 classifier). See Read et al.’s paper for a more in-depth explanation.\nAnother notable observation is that L1 penalty worked well, which is not too surprising when considering the fact that the dataset has 300K features and many of them are probably irrelevant to prediction (L1 penalty yields sparse models where many features get zero weight).\nThings that didn’t work As I was travelling, I didn’t have much time to work on this competition over its two final weeks (though this was a good way of passing the time on long flights). One thing that I tried was understanding some of the probabilistic classifier chain (PCC) code out there by porting it to Python, but the results were very disappointing, probably due to bugs in my code. 
I expected PCC to work well, especially with the extension for optimising the F-measure. Figuring out how to run the Java code would have probably been a better use of my time than porting the code to Python.\nI also played with reverse-engineering the features back to counts, but it was problematic since the feature values are normalised. It was disappointing that we weren’t at least given the bag of words representations. I also attempted to reduce the feature representation with latent Dirichlet allocation, but it didn’t perform well – possibly because I couldn’t get the correct word counts.\nConclusion Overall, this was a fun competition. Despite minor issues with the data and not having enough time to do everything I wanted to do, it was a great learning experience. From reading the summaries by the other teams, it appears that other competitors enjoyed it too. As always, I highly recommend Kaggle competitions to beginners who are trying to learn more about the field of data science and predictive modelling, and to more experienced data scientists who want to improve their skills.\n","wordCount":"1114","inLanguage":"en","image":"https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png","datePublished":"2014-10-07T03:21:35Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: 
dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Greek Media Monitoring Kaggle competition: My approach</h1><div class=post-meta><span title='2014-10-07 03:21:35 +0000 UTC'>October 7, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-10-07-greek-media-monitoring-kaggle-competition-my-approach/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure 
class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png 769w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components.png alt width=769 height=527></figure><div class=post-content><p>A few months ago I participated in the <a href=http://www.kaggle.com/c/wise-2014 target=_blank rel=noopener>Kaggle Greek Media Monitoring competition</a>. The goal of the competition was doing <a href=https://en.wikipedia.org/wiki/Multi-label_classification target=_blank rel=noopener>multilabel classification</a> of texts scanned from Greek print media. Despite not having much time due to travelling and other commitments, I managed to finish 6th (out of 120 teams). This post describes my approach to the problem.</p><h3 id=data--evaluation>Data & evaluation<a hidden class=anchor aria-hidden=true href=#data--evaluation>#</a></h3><p>The data consists of articles scanned from Greek print media in May-September 2013. Due to copyright issues, the organisers didn&rsquo;t make the original articles available – competitors only had access to normalised <a href=https://en.wikipedia.org/wiki/Tf%E2%80%93idf target=_blank rel=noopener>tf-idf</a> representations of the texts. 
This limited the options for doing feature engineering and made it impossible to consider things like word order, but it made things somewhat simpler as the focus was on modelling due to inability to extract interesting features.</p><p>Overall, there are about 65K texts in the training set and 35K in the test set, where the split is based on chronological ordering (i.e., the training articles were published before the test articles). Each article was manually labelled with one or more labels out of a set of 203 labels. For each test article, the goal is to infer its set of labels. Submissions were ranked using the <a href=http://www.kaggle.com/c/wise-2014/details/evaluation target=_blank rel=noopener>mean F1 score</a>.</p><p>Despite being manually annotated, the data isn&rsquo;t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten &ldquo;articles&rdquo; with a single word. Five of these articles have the word 68839, but each of these five was given a different label. Such issues are not unusual in Kaggle competitions or in real life, but they do limit the general usefulness of the results since any model built on this data would fit some noise.</p><h3 id=local-validation-setup>Local validation setup<a hidden class=anchor aria-hidden=true href=#local-validation-setup>#</a></h3><p>As mentioned in previous posts (<a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>How to (almost) win Kaggle competitions</a> and <a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/>Kaggle beginner tips</a>) having a solid local validation setup is very important. It ensures you don&rsquo;t waste time on weak submissions, increases confidence in the models, and avoids leaking information about how well you&rsquo;re doing.</p><p>I used the first 35K training texts for local training and the following 30K texts for validation. 
While the article publication dates weren&rsquo;t provided, I hoped that this would mimic the competition setup, where the test dataset consists of articles that were published after the articles in the training dataset. This seemed to work, as my local results were consistent with the leaderboard results. I&rsquo;m pleased to report that this setup allowed me to have the lowest number of submissions of all the top-10 teams 🙂</p><h3 id=things-that-worked>Things that worked<a hidden class=anchor aria-hidden=true href=#things-that-worked>#</a></h3><p>I originally wanted to use this competition to play with deep learning through Python packages such as <a href=http://deeplearning.net/software/theano/ target=_blank rel=noopener>Theano</a> and <a href=http://deeplearning.net/software/pylearn2/ target=_blank rel=noopener>PyLearn2</a>. However, as this was the first time I worked on a multilabel classification problem, I got sucked into reading a lot of papers on the topic and never got around to doing deep learning. Maybe next time&mldr;</p><p>One of my key discoveries was that there if you define a graph where the vertices are labels and there&rsquo;s an edge between two labels if they appear together in a document&rsquo;s label set, then there are two main connected components of labels and several small ones with single labels (see figure below). It is possible to train a linear classifier that distinguishes between the components with very high accuracy (over 99%). This allowed me to improve performance by training different classifiers on each connected component.</p><figure><a href=wise2014-connected-components.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/wise2014-connected-components_hu4bfbbe3f9a9448d9a431640c78e486b4_93326_720x0_resize_box_3.png 720w,
diff --git a/2014/10/23/what-is-data-science/index.html b/2014/10/23/what-is-data-science/index.html
index 5ae1b28a6..839e3b7a3 100644
--- a/2014/10/23/what-is-data-science/index.html
+++ b/2014/10/23/what-is-data-science/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>What is data science? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,Kaggle,software engineering"><meta name=description content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. This post discusses my favourite definition."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/10/23/what-is-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="What is data science?"><meta property="og:description" content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. 
This post discusses my favourite definition."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/10/23/what-is-data-science/"><meta property="og:image" content="https://yanirseroussi.com/data-skill-continuum.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-10-23T03:22:08+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/data-skill-continuum.png"><meta name=twitter:title content="What is data science?"><meta name=twitter:description content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. This post discusses my favourite definition."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"What is data science?","item":"https://yanirseroussi.com/2014/10/23/what-is-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"What is data science?","name":"What is data science?","description":"Data science has been a hot term in the past few years. Still, there isn\u0026rsquo;t a single definition of the field. This post discusses my favourite definition.","keywords":["data science","Kaggle","software engineering"],"articleBody":"Data science has been a hot term in the past few years. Despite this fact (or perhaps because of it), it still seems like there isn't a single unifying definition of data science. 
This post discusses my favourite definition.\nData Scientist (n.): Person who is better at statistics than any software engineer and better at software engineering than any statistician.\n— Josh Wills (@josh_wills) May 3, 2012\nOne of my reasons for doing a PhD was wanting to do something more interesting than “vanilla” software engineering. When I was in the final stages of my PhD, I started going to meetups to see what’s changed in the world outside academia. Back then, I defined myself as a “software engineer with a research background”, which didn’t mean much to most people. My first post-PhD job ended up being a data scientist at a small startup. As soon as I changed my LinkedIn title to Data Scientist, many offers started flowing. This is probably the reason why so many people call themselves data scientists these days, often diluting the term to a point where it’s so broad it becomes meaningless. This post presents my preferred data science definitions and my opinions on who should or shouldn’t call themselves a data scientist.\nDefining data science I really like the definition quoted above, of data science as the intersection of software engineering and statistics. Ofer Mendelevitch goes into more detail, drawing a continuum of professions that ranges from software engineer on the left to pure statistician (or machine learning researcher) on the right.\nThis continuum contains two additional roles, which are often confused with data scientists:\nData engineer: a software engineer that deals with data plumbing (traditional database setup, Hadoop, Spark and all the rest) Data analyst: a person who digs into data to surface insights, but lacks the skills to do so at scale (e.g., they know how to use Excel, Tableau and SQL but can’t build a web app from scratch) Data science mixes all these roles. Because of this, there are few true data science positions for people with no work experience. 
A successful data scientist needs to be able to “become one with the data” by exploring it and applying rigorous statistical analysis (right-hand side of the continuum). But good data scientists also understand what it takes to deploy production systems, and are ready to get their hands dirty by writing code that cleans up the data or performs core system functionality (left-hand side of the continuum). Gaining all these skills takes time. It is still somewhat rare to find people who are true data scientists according to this definition, which is why Ofer Mendelevitch’s post recommends building teams that consist of people with skills from both sides of the continuum.\nHow is data science different from just science? Data is everywhere. Extracting knowledge from data is an essential part of any science. Hence, the name data science doesn’t really capture what’s new about the field. The way I see it, the novelty of data science comes from the application of software to model any type of data in a way that generalises across domains. So while a physicist may use software to build models based on data, they won’t become a data scientist until they’ve gone and applied these skills to other fields (as many physicists end up doing). As Kaggle shows, data scientists can work on a wide variety of problems – from biology and physics to marketing, text mining and web search personalisation. It’s often the case in Kaggle competitions that the same people apply similar techniques to very different problems, obtaining results that significantly improve on the state of the art.\nHowever, domain experts such as physicists aren’t going to be made redundant any time soon. Contrary to what Kaggle may have you believe, there is much more to data science than predictive modelling on a well-defined problem. 
Data scientists typically spend much of their time working with domain experts to define the problem, and chasing down diverse data sources to extract features that enable predictive modelling (also known as “the fun part”). Despite the existence of these less-glamorous aspects of data science, there’s still a lot of fun to be had working in the area. I highly recommend getting into data science to people who enjoy such challenges.\nGetting started as a data scientist is actually pretty simple: become a software engineer, become a data analyst, learn how to model data using software (e.g., by participating in Kaggle competitions), and find a job as a data scientist. Obviously, it’s not going to happen overnight. It took me around 10 ten years, and I’m still learning.\n","wordCount":"780","inLanguage":"en","image":"https://yanirseroussi.com/data-skill-continuum.png","datePublished":"2014-10-23T03:22:08Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/10/23/what-is-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" 
stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">What is data science?</h1><div class=post-meta><span title='2014-10-23 03:22:08 +0000 UTC'>October 23, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-10-23-what-is-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_720x0_resize_box_3.png 720w 
,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png 981w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png alt width=981 height=100></figure><div class=post-content><p class=intro-note>Data science has been a hot term in the past few years. Despite this fact (or perhaps because of it), it still seems like there isn't a single unifying definition of data science. This post discusses my favourite definition.</p><blockquote><p>Data Scientist (n.): Person who is better at statistics than any software engineer and better at software engineering than any statistician.</p><p>— Josh Wills (@josh_wills) <a href=https://twitter.com/josh_wills/status/198093512149958656 target=_blank rel=noopener>May 3, 2012</a></p></blockquote><p>One of my reasons for doing a PhD was wanting to do something more interesting than &ldquo;vanilla&rdquo; software engineering. When I was in the final stages of my PhD, I started going to meetups to see what&rsquo;s changed in the world outside academia. Back then, I defined myself as a &ldquo;software engineer with a research background&rdquo;, which didn&rsquo;t mean much to most people. My first post-PhD job ended up being a data scientist at a small startup. As soon as I changed my LinkedIn title to Data Scientist, many offers started flowing. This is probably the reason why so many people call themselves data scientists these days, often diluting the term to a point where it&rsquo;s so broad it becomes meaningless. This post presents my preferred data science definitions and my opinions on who should or shouldn&rsquo;t call themselves a data scientist.</p><h3 id=defining-data-science>Defining data science<a hidden class=anchor aria-hidden=true href=#defining-data-science>#</a></h3><p>I really like the definition quoted above, of data science as <em>the intersection of software engineering and statistics</em>. 
<a href=http://hortonworks.com/blog/hortonworks-hadoop-data-science/ target=_blank rel=noopener>Ofer Mendelevitch</a> goes into more detail, drawing a continuum of professions that ranges from software engineer on the left to pure statistician (or machine learning researcher) on the right.</p><figure><a href=data-skill-continuum.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,Kaggle,software engineering"><meta name=description content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. This post discusses my favourite definition."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/10/23/what-is-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="What is data science?"><meta property="og:description" content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. 
This post discusses my favourite definition."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/10/23/what-is-data-science/"><meta property="og:image" content="https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-10-23T03:22:08+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png"><meta name=twitter:title content="What is data science?"><meta name=twitter:description content="Data science has been a hot term in the past few years. Still, there isn&rsquo;t a single definition of the field. This post discusses my favourite definition."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"What is data science?","item":"https://yanirseroussi.com/2014/10/23/what-is-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"What is data science?","name":"What is data science?","description":"Data science has been a hot term in the past few years. Still, there isn\u0026rsquo;t a single definition of the field. This post discusses my favourite definition.","keywords":["data science","Kaggle","software engineering"],"articleBody":"Data science has been a hot term in the past few years. Despite this fact (or perhaps because of it), it still seems like there isn't a single unifying definition of data science. 
This post discusses my favourite definition.\nData Scientist (n.): Person who is better at statistics than any software engineer and better at software engineering than any statistician.\n— Josh Wills (@josh_wills) May 3, 2012\nOne of my reasons for doing a PhD was wanting to do something more interesting than “vanilla” software engineering. When I was in the final stages of my PhD, I started going to meetups to see what’s changed in the world outside academia. Back then, I defined myself as a “software engineer with a research background”, which didn’t mean much to most people. My first post-PhD job ended up being a data scientist at a small startup. As soon as I changed my LinkedIn title to Data Scientist, many offers started flowing. This is probably the reason why so many people call themselves data scientists these days, often diluting the term to a point where it’s so broad it becomes meaningless. This post presents my preferred data science definitions and my opinions on who should or shouldn’t call themselves a data scientist.\nDefining data science I really like the definition quoted above, of data science as the intersection of software engineering and statistics. Ofer Mendelevitch goes into more detail, drawing a continuum of professions that ranges from software engineer on the left to pure statistician (or machine learning researcher) on the right.\nThis continuum contains two additional roles, which are often confused with data scientists:\nData engineer: a software engineer that deals with data plumbing (traditional database setup, Hadoop, Spark and all the rest) Data analyst: a person who digs into data to surface insights, but lacks the skills to do so at scale (e.g., they know how to use Excel, Tableau and SQL but can’t build a web app from scratch) Data science mixes all these roles. Because of this, there are few true data science positions for people with no work experience. 
A successful data scientist needs to be able to “become one with the data” by exploring it and applying rigorous statistical analysis (right-hand side of the continuum). But good data scientists also understand what it takes to deploy production systems, and are ready to get their hands dirty by writing code that cleans up the data or performs core system functionality (left-hand side of the continuum). Gaining all these skills takes time. It is still somewhat rare to find people who are true data scientists according to this definition, which is why Ofer Mendelevitch’s post recommends building teams that consist of people with skills from both sides of the continuum.\nHow is data science different from just science? Data is everywhere. Extracting knowledge from data is an essential part of any science. Hence, the name data science doesn’t really capture what’s new about the field. The way I see it, the novelty of data science comes from the application of software to model any type of data in a way that generalises across domains. So while a physicist may use software to build models based on data, they won’t become a data scientist until they’ve gone and applied these skills to other fields (as many physicists end up doing). As Kaggle shows, data scientists can work on a wide variety of problems – from biology and physics to marketing, text mining and web search personalisation. It’s often the case in Kaggle competitions that the same people apply similar techniques to very different problems, obtaining results that significantly improve on the state of the art.\nHowever, domain experts such as physicists aren’t going to be made redundant any time soon. Contrary to what Kaggle may have you believe, there is much more to data science than predictive modelling on a well-defined problem. 
Data scientists typically spend much of their time working with domain experts to define the problem, and chasing down diverse data sources to extract features that enable predictive modelling (also known as “the fun part”). Despite the existence of these less-glamorous aspects of data science, there’s still a lot of fun to be had working in the area. I highly recommend getting into data science to people who enjoy such challenges.\nGetting started as a data scientist is actually pretty simple: become a software engineer, become a data analyst, learn how to model data using software (e.g., by participating in Kaggle competitions), and find a job as a data scientist. Obviously, it’s not going to happen overnight. It took me around 10 ten years, and I’m still learning.\n","wordCount":"780","inLanguage":"en","image":"https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png","datePublished":"2014-10-23T03:22:08Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/10/23/what-is-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">What is data science?</h1><div class=post-meta><span title='2014-10-23 03:22:08 +0000 UTC'>October 23, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-10-23-what-is-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_720x0_resize_box_3.png 720w 
,https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png 981w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum.png alt width=981 height=100></figure><div class=post-content><p class=intro-note>Data science has been a hot term in the past few years. Despite this fact (or perhaps because of it), it still seems like there isn't a single unifying definition of data science. This post discusses my favourite definition.</p><blockquote><p>Data Scientist (n.): Person who is better at statistics than any software engineer and better at software engineering than any statistician.</p><p>— Josh Wills (@josh_wills) <a href=https://twitter.com/josh_wills/status/198093512149958656 target=_blank rel=noopener>May 3, 2012</a></p></blockquote><p>One of my reasons for doing a PhD was wanting to do something more interesting than &ldquo;vanilla&rdquo; software engineering. When I was in the final stages of my PhD, I started going to meetups to see what&rsquo;s changed in the world outside academia. Back then, I defined myself as a &ldquo;software engineer with a research background&rdquo;, which didn&rsquo;t mean much to most people. My first post-PhD job ended up being a data scientist at a small startup. As soon as I changed my LinkedIn title to Data Scientist, many offers started flowing. This is probably the reason why so many people call themselves data scientists these days, often diluting the term to a point where it&rsquo;s so broad it becomes meaningless. This post presents my preferred data science definitions and my opinions on who should or shouldn&rsquo;t call themselves a data scientist.</p><h3 id=defining-data-science>Defining data science<a hidden class=anchor aria-hidden=true href=#defining-data-science>#</a></h3><p>I really like the definition quoted above, of data science as <em>the intersection of software engineering and statistics</em>. 
<a href=http://hortonworks.com/blog/hortonworks-hadoop-data-science/ target=_blank rel=noopener>Ofer Mendelevitch</a> goes into more detail, drawing a continuum of professions that ranges from software engineer on the left to pure statistician (or machine learning researcher) on the right.</p><figure><a href=data-skill-continuum.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2014/10/23/what-is-data-science/data-skill-continuum_hude8f4ba53ab678a51f562b1a637a59bc_5172_720x0_resize_box_3.png 720w,
diff --git a/2014/11/05/bcrecommender-traction-update/index.html b/2014/11/05/bcrecommender-traction-update/index.html
index 42e360520..5480eb776 100644
--- a/2014/11/05/bcrecommender-traction-update/index.html
+++ b/2014/11/05/bcrecommender-traction-update/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>BCRecommender Traction Update | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Bandcamp,BCRecommender,business,marketing,music,traction book"><meta name=description content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="BCRecommender Traction Update"><meta property="og:description" content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"><meta property="og:image" content="https://yanirseroussi.com/bullseye.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-11-05T02:29:35+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bullseye.png"><meta name=twitter:title content="BCRecommender Traction Update"><meta name=twitter:description content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"BCRecommender Traction Update","item":"https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"BCRecommender Traction Update","name":"BCRecommender Traction Update","description":"Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing.","keywords":["Bandcamp","BCRecommender","business","marketing","music","traction book"],"articleBody":" This is the fifth part of a series of posts on my Bandcamp recommendations (BCRecommender) project. Check out previous posts on the general motivation behind this project, the system’s architecture, the recommendation algorithms, and initial traction planning. In a previous post, I discussed my plans to apply the Bullseye framework from the Traction Book to BCRecommender, my Bandcamp recommendations project. 
In that post, I reviewed the 19 traction channels described in the book, and decided to focus on the three most promising ones: blogger outreach, search engine optimisation (SEO), and content marketing. This post discusses my progress to date.\nGoals My initial traction goals were rather modest: get some feedback from real people, build up steady nonzero traffic to the site, and then increase that traffic to 10+ unique visitors per day. It’s worth noting that I have four other main areas of focus at the moment, so BCRecommender is not getting all the attention I could potentially give it. Nonetheless, I have made good progress on achieving my goals (first two have been obtained, but traffic still fluctuates), and learnt a lot in the process.\nThings that worked Blogger outreach. The most obvious people to contact are existing Bandcamp fans. It was straightforward to generate a list of prolific fans with blogs, as Bandcamp allows people to populate their profile with a short bio and links to their sites. I worked my way through part of the list, sending each fan an email introducing BCRecommender and asking for their feedback. Each email required some manual work, as the vast majority of people don’t have their email address listed on their Bandcamp profile page. I was careful not to be too spammy, which seemed to work: about 50% of the people I contacted visited BCRecommender, 20% responded with positive feedback, and 10% linked to BCRecommender in some form, with the largest volume of traffic coming from my Hypebot guest post. The problem with this approach is that it doesn’t scale, but the most valuable thing I got out of it was that people like the project and that there’s a real need for it.\nTwitter. I’m not sure where Twitter falls as a traction channel. It’s probably somewhere between (micro)blogger outreach and content marketing. However you categorise Twitter, it has been working well as a source of traffic. 
Simply finding people who may be interested in BCRecommender and tweeting related content has proven to be a rather low-effort way of getting attention, which is great at this stage. I have a few ideas for driving more traffic from Twitter, which I will try as I go.\nThings that didn’t work Content marketing. I haven’t really spent time doing serious content marketing apart from the Spotlights pilot. My vision for the spotlights was to generate quality articles automatically and showcase music on Bandcamp in an engaging way that helps people discover new artists, even if they don’t have a fan account. However, full automation of the spotlight feature would require a lot of work, and I think that there are lower-hanging fruits that I should focus on first. For example, finding interesting insights in the data and presenting them in an engaging way may be a better content strategy, as it would be unique to BCRecommender. For the spotlights, partnering with bloggers to write the articles may be a better approach than automation.\nSEO. I expected BCRecommender to rank higher for “bandcamp recommendations” by now, as a result of my blogger outreach efforts. At the moment, it’s still on the second page for this query on Google, though it’s the first result on Bing and DuckDuckGo. Obviously, “bandcamp recommendations” is not the only query worth ranking for, but it’s very relevant to BCRecommender, and not too competitive (half of the first page results are old forum posts). One encouraging outcome from the work done so far is that my Hypebot guest post does appear on the first page. Nonetheless, I’m still interested in getting more search engine traffic. Ranking higher would probably require adding more relevant content on the site and getting more quality links (basically what SEO is all about).\nPoints to improve and next steps I could definitely do better work on all of the above channels. 
Contrary to what’s suggested by the Bullseye framework, I would like to put more effort into the channels that didn’t work well. The reason is that I think they didn’t work well because of lack of attention and weak experiments, rather than due to their unsuitability to BCRecommender.\nAs mentioned above, my main limiting factor is a lack of time to spend on the project. However, there’s no pressing need to hit certain traction milestones by a specific deadline. My stretch goals are to get all Bandcamp fans to check out the project (hundreds of thousands of people), and have a significant portion of them convert by signing up to updates (tens of thousands of people). Getting there will take time. So far I’m finding the process educational and enjoyable, which is a pleasant surprise.\n","wordCount":"843","inLanguage":"en","image":"https://yanirseroussi.com/bullseye.png","datePublished":"2014-11-05T02:29:35Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">BCRecommender Traction Update</h1><div class=post-meta><span title='2014-11-05 02:29:35 +0000 UTC'>November 5, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-11-05-bcrecommender-traction-update/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/bullseye.png alt></figure><div class=post-content><p class=intro-note>This is the fifth part of a series of posts on my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations (BCRecommender)</a> project. 
Check out previous posts on <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the general motivation behind this project</a>, <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>the system’s architecture</a>, <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>the recommendation algorithms</a>, and <a title="Applying the Traction Book’s Bullseye framework to BCRecommender" href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/>initial traction planning</a>.</p><p>In a previous post, I discussed <a href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/ title="Applying the Traction Book’s Bullseye framework to BCRecommender">my plans to apply the Bullseye framework from the Traction Book</a> to BCRecommender, my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations</a> project. In that post, I reviewed the 19 traction channels described in the book, and decided to focus on the three most promising ones: blogger outreach, search engine optimisation (SEO), and content marketing. This post discusses my progress to date.</p><h3 id=goals>Goals<a hidden class=anchor aria-hidden=true href=#goals>#</a></h3><p>My initial traction goals were rather modest: get some feedback from real people, build up steady nonzero traffic to the site, and then increase that traffic to 10+ unique visitors per day. It&rsquo;s worth noting that I have four other main areas of focus at the moment, so BCRecommender is not getting all the attention I could potentially give it. 
Nonetheless, I have made good progress on achieving my goals (first two have been obtained, but traffic still fluctuates), and learnt a lot in the process.</p><h3 id=things-that-worked>Things that worked<a hidden class=anchor aria-hidden=true href=#things-that-worked>#</a></h3><p><strong>Blogger outreach.</strong> The most obvious people to contact are existing Bandcamp fans. It was straightforward to generate a list of prolific fans with blogs, as Bandcamp allows people to populate their profile with a short bio and links to their sites. I worked my way through part of the list, sending each fan an email introducing BCRecommender and asking for their feedback. Each email required some manual work, as the vast majority of people don&rsquo;t have their email address listed on their Bandcamp profile page. I was careful not to be too spammy, which seemed to work: about 50% of the people I contacted visited BCRecommender, 20% responded with positive feedback, and 10% linked to BCRecommender in some form, with the largest volume of traffic coming from my <a href=http://www.hypebot.com/hypebot/2014/10/personalized-bandcamp-recommendations-with-bcrecommender.html target=_blank rel=noopener>Hypebot guest post</a>. The problem with this approach is that it doesn&rsquo;t scale, but the most valuable thing I got out of it was that people like the project and that there&rsquo;s a real need for it.</p><p><strong>Twitter.</strong> I&rsquo;m not sure where Twitter falls as a traction channel. It&rsquo;s probably somewhere between (micro)blogger outreach and content marketing. However you categorise Twitter, it has been working well as a source of traffic. Simply finding people who may be interested in BCRecommender and tweeting related content has proven to be a rather low-effort way of getting attention, which is great at this stage. 
I have a few ideas for driving more traffic from Twitter, which I will try as I go.</p><h3 id=things-that-didnt-work>Things that didn&rsquo;t work<a hidden class=anchor aria-hidden=true href=#things-that-didnt-work>#</a></h3><p><strong>Content marketing.</strong> I haven&rsquo;t really spent time doing serious content marketing apart from the <a href=http://www.bcrecommender.com/spotlights target=_blank rel=noopener>Spotlights</a> pilot. My vision for the spotlights was to generate quality articles automatically and showcase music on Bandcamp in an engaging way that helps people discover new artists, even if they don&rsquo;t have a fan account. However, full automation of the spotlight feature would require a lot of work, and I think that there are lower-hanging fruits that I should focus on first. For example, finding interesting insights in the data and presenting them in an engaging way may be a better content strategy, as it would be unique to BCRecommender. For the spotlights, partnering with bloggers to write the articles may be a better approach than automation.</p><p><strong>SEO.</strong> I expected BCRecommender to rank higher for &ldquo;bandcamp recommendations&rdquo; by now, as a result of my blogger outreach efforts. At the moment, it&rsquo;s still on the second page for this query on Google, though it&rsquo;s the first result on Bing and <a href=http://duckduckgo.com target=_blank rel=noopener>DuckDuckGo</a>. Obviously, &ldquo;bandcamp recommendations&rdquo; is not the only query worth ranking for, but it&rsquo;s very relevant to BCRecommender, and not too competitive (half of the first page results are old forum posts). One encouraging outcome from the work done so far is that <a href=http://www.hypebot.com/hypebot/2014/10/personalized-bandcamp-recommendations-with-bcrecommender.html target=_blank rel=noopener>my Hypebot guest post</a> does appear on the first page. Nonetheless, I&rsquo;m still interested in getting more search engine traffic. 
Ranking higher would probably require adding more relevant content on the site and getting more quality links (basically what SEO is all about).</p><h3 id=points-to-improve-and-next-steps>Points to improve and next steps<a hidden class=anchor aria-hidden=true href=#points-to-improve-and-next-steps>#</a></h3><p>I could definitely do better work on all of the above channels. Contrary to what&rsquo;s suggested by the Bullseye framework, I would like to put more effort into the channels that didn&rsquo;t work well. The reason is that I think they didn&rsquo;t work well because of lack of attention and weak experiments, rather than due to their unsuitability to BCRecommender.</p><p>As mentioned above, my main limiting factor is a lack of time to spend on the project. However, there&rsquo;s no pressing need to hit certain traction milestones by a specific deadline. My stretch goals are to get all Bandcamp fans to check out the project (hundreds of thousands of people), and have a significant portion of them convert by signing up to updates (tens of thousands of people). Getting there will take time. 
So far I&rsquo;m finding the process educational and enjoyable, which is a pleasant surprise.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/music/>music</a></li><li><a href=https://yanirseroussi.com/tags/traction-book/>traction book</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on x" href="https://x.com/intent/tweet/?text=BCRecommender%20Traction%20Update&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cbusiness%2cmarketing%2cmusic%2ctractionbook"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&amp;title=BCRecommender%20Traction%20Update&amp;summary=BCRecommender%20Traction%20Update&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&title=BCRecommender%20Traction%20Update"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 
3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on whatsapp" href="https://api.whatsapp.com/send?text=BCRecommender%20Traction%20Update%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 
179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on telegram" href="https://telegram.me/share/url?text=BCRecommender%20Traction%20Update&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 
23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on ycombinator" href="https://news.ycombinator.com/submitlink?t=BCRecommender%20Traction%20Update&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Bandcamp,BCRecommender,business,marketing,music,traction book"><meta name=description content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="BCRecommender Traction Update"><meta property="og:description" content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"><meta property="og:image" content="https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/bullseye.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-11-05T02:29:35+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/bullseye.png"><meta name=twitter:title content="BCRecommender Traction Update"><meta name=twitter:description content="Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"BCRecommender Traction Update","item":"https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"BCRecommender Traction Update","name":"BCRecommender Traction Update","description":"Update on BCRecommender traction using three channels: blogger outreach, search engine optimisation, and content marketing.","keywords":["Bandcamp","BCRecommender","business","marketing","music","traction book"],"articleBody":" This is the fifth part of a series of posts on my Bandcamp recommendations (BCRecommender) project. Check out previous posts on the general motivation behind this project, the system’s architecture, the recommendation algorithms, and initial traction planning. In a previous post, I discussed my plans to apply the Bullseye framework from the Traction Book to BCRecommender, my Bandcamp recommendations project. 
In that post, I reviewed the 19 traction channels described in the book, and decided to focus on the three most promising ones: blogger outreach, search engine optimisation (SEO), and content marketing. This post discusses my progress to date.\nGoals My initial traction goals were rather modest: get some feedback from real people, build up steady nonzero traffic to the site, and then increase that traffic to 10+ unique visitors per day. It’s worth noting that I have four other main areas of focus at the moment, so BCRecommender is not getting all the attention I could potentially give it. Nonetheless, I have made good progress on achieving my goals (first two have been obtained, but traffic still fluctuates), and learnt a lot in the process.\nThings that worked Blogger outreach. The most obvious people to contact are existing Bandcamp fans. It was straightforward to generate a list of prolific fans with blogs, as Bandcamp allows people to populate their profile with a short bio and links to their sites. I worked my way through part of the list, sending each fan an email introducing BCRecommender and asking for their feedback. Each email required some manual work, as the vast majority of people don’t have their email address listed on their Bandcamp profile page. I was careful not to be too spammy, which seemed to work: about 50% of the people I contacted visited BCRecommender, 20% responded with positive feedback, and 10% linked to BCRecommender in some form, with the largest volume of traffic coming from my Hypebot guest post. The problem with this approach is that it doesn’t scale, but the most valuable thing I got out of it was that people like the project and that there’s a real need for it.\nTwitter. I’m not sure where Twitter falls as a traction channel. It’s probably somewhere between (micro)blogger outreach and content marketing. However you categorise Twitter, it has been working well as a source of traffic. 
Simply finding people who may be interested in BCRecommender and tweeting related content has proven to be a rather low-effort way of getting attention, which is great at this stage. I have a few ideas for driving more traffic from Twitter, which I will try as I go.\nThings that didn’t work Content marketing. I haven’t really spent time doing serious content marketing apart from the Spotlights pilot. My vision for the spotlights was to generate quality articles automatically and showcase music on Bandcamp in an engaging way that helps people discover new artists, even if they don’t have a fan account. However, full automation of the spotlight feature would require a lot of work, and I think that there are lower-hanging fruits that I should focus on first. For example, finding interesting insights in the data and presenting them in an engaging way may be a better content strategy, as it would be unique to BCRecommender. For the spotlights, partnering with bloggers to write the articles may be a better approach than automation.\nSEO. I expected BCRecommender to rank higher for “bandcamp recommendations” by now, as a result of my blogger outreach efforts. At the moment, it’s still on the second page for this query on Google, though it’s the first result on Bing and DuckDuckGo. Obviously, “bandcamp recommendations” is not the only query worth ranking for, but it’s very relevant to BCRecommender, and not too competitive (half of the first page results are old forum posts). One encouraging outcome from the work done so far is that my Hypebot guest post does appear on the first page. Nonetheless, I’m still interested in getting more search engine traffic. Ranking higher would probably require adding more relevant content on the site and getting more quality links (basically what SEO is all about).\nPoints to improve and next steps I could definitely do better work on all of the above channels. 
Contrary to what’s suggested by the Bullseye framework, I would like to put more effort into the channels that didn’t work well. The reason is that I think they didn’t work well because of lack of attention and weak experiments, rather than due to their unsuitability to BCRecommender.\nAs mentioned above, my main limiting factor is a lack of time to spend on the project. However, there’s no pressing need to hit certain traction milestones by a specific deadline. My stretch goals are to get all Bandcamp fans to check out the project (hundreds of thousands of people), and have a significant portion of them convert by signing up to updates (tens of thousands of people). Getting there will take time. So far I’m finding the process educational and enjoyable, which is a pleasant surprise.\n","wordCount":"843","inLanguage":"en","image":"https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/bullseye.png","datePublished":"2014-11-05T02:29:35Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" 
viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">BCRecommender Traction Update</h1><div class=post-meta><span title='2014-11-05 02:29:35 +0000 UTC'>November 5, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-11-05-bcrecommender-traction-update/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/bullseye.png alt></figure><div class=post-content><p class=intro-note>This is the fifth part of a series of posts on my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations (BCRecommender)</a> project. 
Check out previous posts on <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the general motivation behind this project</a>, <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>the system’s architecture</a>, <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>the recommendation algorithms</a>, and <a title="Applying the Traction Book’s Bullseye framework to BCRecommender" href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/>initial traction planning</a>.</p><p>In a previous post, I discussed <a href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/ title="Applying the Traction Book’s Bullseye framework to BCRecommender">my plans to apply the Bullseye framework from the Traction Book</a> to BCRecommender, my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations</a> project. In that post, I reviewed the 19 traction channels described in the book, and decided to focus on the three most promising ones: blogger outreach, search engine optimisation (SEO), and content marketing. This post discusses my progress to date.</p><h3 id=goals>Goals<a hidden class=anchor aria-hidden=true href=#goals>#</a></h3><p>My initial traction goals were rather modest: get some feedback from real people, build up steady nonzero traffic to the site, and then increase that traffic to 10+ unique visitors per day. It&rsquo;s worth noting that I have four other main areas of focus at the moment, so BCRecommender is not getting all the attention I could potentially give it. 
Nonetheless, I have made good progress on achieving my goals (first two have been obtained, but traffic still fluctuates), and learnt a lot in the process.</p><h3 id=things-that-worked>Things that worked<a hidden class=anchor aria-hidden=true href=#things-that-worked>#</a></h3><p><strong>Blogger outreach.</strong> The most obvious people to contact are existing Bandcamp fans. It was straightforward to generate a list of prolific fans with blogs, as Bandcamp allows people to populate their profile with a short bio and links to their sites. I worked my way through part of the list, sending each fan an email introducing BCRecommender and asking for their feedback. Each email required some manual work, as the vast majority of people don&rsquo;t have their email address listed on their Bandcamp profile page. I was careful not to be too spammy, which seemed to work: about 50% of the people I contacted visited BCRecommender, 20% responded with positive feedback, and 10% linked to BCRecommender in some form, with the largest volume of traffic coming from my <a href=http://www.hypebot.com/hypebot/2014/10/personalized-bandcamp-recommendations-with-bcrecommender.html target=_blank rel=noopener>Hypebot guest post</a>. The problem with this approach is that it doesn&rsquo;t scale, but the most valuable thing I got out of it was that people like the project and that there&rsquo;s a real need for it.</p><p><strong>Twitter.</strong> I&rsquo;m not sure where Twitter falls as a traction channel. It&rsquo;s probably somewhere between (micro)blogger outreach and content marketing. However you categorise Twitter, it has been working well as a source of traffic. Simply finding people who may be interested in BCRecommender and tweeting related content has proven to be a rather low-effort way of getting attention, which is great at this stage. 
I have a few ideas for driving more traffic from Twitter, which I will try as I go.</p><h3 id=things-that-didnt-work>Things that didn&rsquo;t work<a hidden class=anchor aria-hidden=true href=#things-that-didnt-work>#</a></h3><p><strong>Content marketing.</strong> I haven&rsquo;t really spent time doing serious content marketing apart from the <a href=http://www.bcrecommender.com/spotlights target=_blank rel=noopener>Spotlights</a> pilot. My vision for the spotlights was to generate quality articles automatically and showcase music on Bandcamp in an engaging way that helps people discover new artists, even if they don&rsquo;t have a fan account. However, full automation of the spotlight feature would require a lot of work, and I think that there are lower-hanging fruits that I should focus on first. For example, finding interesting insights in the data and presenting them in an engaging way may be a better content strategy, as it would be unique to BCRecommender. For the spotlights, partnering with bloggers to write the articles may be a better approach than automation.</p><p><strong>SEO.</strong> I expected BCRecommender to rank higher for &ldquo;bandcamp recommendations&rdquo; by now, as a result of my blogger outreach efforts. At the moment, it&rsquo;s still on the second page for this query on Google, though it&rsquo;s the first result on Bing and <a href=http://duckduckgo.com target=_blank rel=noopener>DuckDuckGo</a>. Obviously, &ldquo;bandcamp recommendations&rdquo; is not the only query worth ranking for, but it&rsquo;s very relevant to BCRecommender, and not too competitive (half of the first page results are old forum posts). One encouraging outcome from the work done so far is that <a href=http://www.hypebot.com/hypebot/2014/10/personalized-bandcamp-recommendations-with-bcrecommender.html target=_blank rel=noopener>my Hypebot guest post</a> does appear on the first page. Nonetheless, I&rsquo;m still interested in getting more search engine traffic. 
Ranking higher would probably require adding more relevant content on the site and getting more quality links (basically what SEO is all about).</p><h3 id=points-to-improve-and-next-steps>Points to improve and next steps<a hidden class=anchor aria-hidden=true href=#points-to-improve-and-next-steps>#</a></h3><p>I could definitely do better work on all of the above channels. Contrary to what&rsquo;s suggested by the Bullseye framework, I would like to put more effort into the channels that didn&rsquo;t work well. The reason is that I think they didn&rsquo;t work well because of lack of attention and weak experiments, rather than due to their unsuitability to BCRecommender.</p><p>As mentioned above, my main limiting factor is a lack of time to spend on the project. However, there&rsquo;s no pressing need to hit certain traction milestones by a specific deadline. My stretch goals are to get all Bandcamp fans to check out the project (hundreds of thousands of people), and have a significant portion of them convert by signing up to updates (tens of thousands of people). Getting there will take time. 
So far I&rsquo;m finding the process educational and enjoyable, which is a pleasant surprise.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/music/>music</a></li><li><a href=https://yanirseroussi.com/tags/traction-book/>traction book</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on x" href="https://x.com/intent/tweet/?text=BCRecommender%20Traction%20Update&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&amp;hashtags=Bandcamp%2cBCRecommender%2cbusiness%2cmarketing%2cmusic%2ctractionbook"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&amp;title=BCRecommender%20Traction%20Update&amp;summary=BCRecommender%20Traction%20Update&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f&title=BCRecommender%20Traction%20Update"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 
3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on whatsapp" href="https://api.whatsapp.com/send?text=BCRecommender%20Traction%20Update%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 
179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on telegram" href="https://telegram.me/share/url?text=BCRecommender%20Traction%20Update&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 
23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share BCRecommender Traction Update on ycombinator" href="https://news.ycombinator.com/submitlink?t=BCRecommender%20Traction%20Update&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f11%2f05%2fbcrecommender-traction-update%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html b/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html
index e6602eba3..6309a3549 100644
--- a/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html
+++ b/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,gradient boosting,Kaggle,Kaggle competition,predictive modelling,price forecasting,scikit-learn"><meta name=description content="Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 teams."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)"><meta property="og:description" content="Summary of a Kaggle competition to 
forecast bulldozer sale price, where I finished 9th out of 476 teams."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"><meta property="og:image" content="https://yanirseroussi.com/noisy-bulldozers.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-11-19T09:17:34+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/noisy-bulldozers.jpg"><meta name=twitter:title content="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)"><meta name=twitter:description content="Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 teams."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","item":"https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","name":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","description":"Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 teams.","keywords":["data science","gradient boosting","Kaggle","Kaggle competition","predictive modelling","price forecasting","scikit-learn"],"articleBody":"Messy data, buggy 
software, but all in all a good learning experience...\nEarly last year, I had some free time on my hands, so I decided to participate in yet another Kaggle competition. Having never done any price forecasting work before, I thought it would be interesting to work on the Blue Book for Bulldozers competition, where the goal was to predict the sale price of auctioned bulldozers. I’ve done alright, finishing 9th out of 476 teams. And the experience did turn out to be interesting, but not for the reasons I expected.\nData and evaluation The competition dataset consists of about 425K historical records of bulldozer sales. The training subset consists of sales from the 1990s through to the end of 2011, with the validation and testing periods being January-April 2012 and May-November 2012 respectively. The goal is to predict the sale price of each bulldozer, given the sale date and venue, and the bulldozer’s features (e.g., model ID, mechanical specifications, and machine-specific data such as machine ID and manufacturing year). Submissions were scored using the RMSLE measure.\nEarly in the competition (before I joined), there were many posts in the forum regarding issues with the data. The organisers responded by posting an appendix to the data, which included the “correct” information. From people’s posts after the competition ended, it seems like using the “correct” data consistently made the results worse. Luckily, I discovered this about a week before the competition ended. Reducing my reliance on the appendix made a huge difference in the performance of my models. 
This discovery was thanks to a forum post, which illustrates the general point on the importance of monitoring the forum in Kaggle competitions.\nMy approach: feature engineering, data splitting, and stochastic gradient boosting Having read the forum discussions on data quality, I assumed that spending time on data cleanup and feature engineering would give me an edge over competitors who focused only on data modelling. It’s well-known that simple models fitted on more/better data tend to yield better results than complex models fitted on less/messy data (aka GIGO – garbage in, garbage out). However, doing data cleaning and feature engineering is less glamorous than building sophisticated models, which is why many people avoid the former.\nSadly, the data was incredibly messy, so most of my cleanup efforts resulted in no improvements. Even intuitive modifications yielded poor results, like transforming each bulldozer’s manufacturing year into its age at the time of sale. Essentially, to do well in this competition, one had to fit the noise rather than remove it. This was rather disappointing, as one of the nice things about Kaggle competitions is being able to work on relatively clean data. Anomalies in data included bulldozers that have been running for hundreds of years and machines that got sold years before they were manufactured (impossible for second-hand bulldozers!). It is obvious that Fast Iron (the company who sponsored the competition) would have obtained more usable models from this competition if they had spent more time cleaning up the data themselves.\nThroughout the competition I went through several iterations of modelling and data cleaning. 
My final submission ended up being a linear combination of four models:\nGradient boosting machine (GBM) regression on the full dataset A linear model on the full dataset An ensemble of GBMs, one for each product group (rationale: different product groups represent different bulldozer classes, like track excavators and motor graders, so their prices are not really comparable) A similar ensemble, where each product group and sale year has a separate GBM, and earlier years get lower weight than more recent years I ended up discarding old training data (before 2000) and the machine IDs (another surprise: even though some machines were sold multiple times, this information was useless). For the GBMs, I treated categorical features as ordinal, which sort of makes sense for many of the features (e.g., model series values are ordered). For the linear model, I just coded them as binary indicators.\nThe most important discovery: stochastic gradient boosting bugs This was the first time I used gradient boosting. Since I was using so many different models, it was hard to reliably tune the number of trees, so I figured I’d use stochastic gradient boosting and rely on out-of-bag (OOB) samples to set the number of trees. This led to me finding a bug in scikit-learn: the OOB scores were actually calculated on in-bag samples.\nI reported the issue to the maintainers of scikit-learn and made an attempt at fixing it by skipping trees to obtain the OOB samples. This yielded better results than the buggy version, and in some cases I replaced a plain GBM with an ensemble of four stochastic GBMs with subsample ratio of 0.5 and a different random seed for each one (averaging their outputs).\nThis wasn’t enough to convince the maintainers of scikit-learn to accept the pull request with my fix, as they didn’t like my idea of skipping trees. This is for a good reason — obtaining better results on a single dataset should be insufficient to convince anyone. 
They ended up fixing the issue by copying the implementation from R’s GBM package, which is known to underestimate the number of required trees/boosting iterations (see Section 3.3 in the GBM guide).\nRecently, I had some time to test my tree skipping idea on the toy dataset used in the scikit-learn documentation. As the following figure shows, a smoothed variant of my tree skipping idea (TSO in the figure) yields superior results to the scikit-learn/R approach (SKO in the figure). The actual loss doesn’t matter — what matters is where it’s minimised. In this case TSO obtains the closest approximation of the number of iterations to the value that minimises the test error, which is a promising result.\nThese results are pretty cool, but this is still just a toy dataset (though repeating the experiment with 100 different random seeds to generate different toy datasets yields similar results). The next steps would be to repeat Ridgeway’s experiments from the GBM guide on multiple datasets to see whether the results generalise well, which will be the topic of a different post. Regardless of the final outcomes, this story illustrates the unexpected paths in which a Kaggle competition can take you. No matter what rank you end up obtaining and regardless of your skill level, there’s always something new to learn.\nUpdate: I ran Ridgway’s experiments. 
The results are discussed in Stochastic Gradient Boosting: Choosing the Best Number of Iterations.\n","wordCount":"1087","inLanguage":"en","image":"https://yanirseroussi.com/noisy-bulldozers.jpg","datePublished":"2014-11-19T09:17:34Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" 
x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)</h1><div class=post-meta><span title='2014-11-19 09:17:34 +0000 UTC'>November 19, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-11-19-fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg 800w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg alt width=800 height=261></figure><div 
class=post-content><p class=intro-note>Messy data, buggy software, but all in all a good learning experience...</p><p>Early last year, I had some free time on my hands, so I decided to participate in yet another Kaggle competition. Having never done any price forecasting work before, I thought it would be interesting to work on the <a href=https://www.kaggle.com/c/bluebook-for-bulldozers target=_blank rel=noopener>Blue Book for Bulldozers competition</a>, where the goal was to predict the sale price of auctioned bulldozers. I&rsquo;ve done alright, finishing 9th out of 476 teams. And the experience did turn out to be interesting, but not for the reasons I expected.</p><h3 id=data-and-evaluation>Data and evaluation<a hidden class=anchor aria-hidden=true href=#data-and-evaluation>#</a></h3><p>The competition dataset consists of about 425K historical records of bulldozer sales. The training subset consists of sales from the 1990s through to the end of 2011, with the validation and testing periods being January-April 2012 and May-November 2012 respectively. The goal is to predict the sale price of each bulldozer, given the sale date and venue, and the bulldozer&rsquo;s features (e.g., model ID, mechanical specifications, and machine-specific data such as machine ID and manufacturing year). Submissions were scored using the <a href=http://www.kaggle.com/wiki/RootMeanSquaredLogarithmicError target=_blank rel=noopener>RMSLE measure</a>.</p><p>Early in the competition (before I joined), there were many posts in the forum regarding issues with the data. The organisers responded by posting an appendix to the data, which included the &ldquo;correct&rdquo; information. From people&rsquo;s posts after the competition ended, it seems like using the &ldquo;correct&rdquo; data consistently made the results <strong>worse</strong>. Luckily, I discovered this about a week before the competition ended. 
Reducing my reliance on the appendix made a huge difference in the performance of my models. This discovery was thanks to a forum post, which illustrates the <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ title="How to (almost) win Kaggle competitions - Tip 9">general point on the importance of monitoring the forum in Kaggle competitions</a>.</p><h3 id=my-approach-feature-engineering-data-splitting-and-stochastic-gradient-boosting>My approach: feature engineering, data splitting, and stochastic gradient boosting<a hidden class=anchor aria-hidden=true href=#my-approach-feature-engineering-data-splitting-and-stochastic-gradient-boosting>#</a></h3><p>Having read the forum discussions on data quality, I assumed that spending time on data cleanup and feature engineering would give me an edge over competitors who focused only on data modelling. It&rsquo;s well-known that simple models fitted on more/better data tend to yield better results than complex models fitted on less/messy data (aka GIGO – garbage in, garbage out). However, doing data cleaning and feature engineering is less glamorous than building sophisticated models, which is why many people avoid the former.</p><p>Sadly, the data was incredibly messy, so most of my cleanup efforts resulted in no improvements. Even intuitive modifications yielded poor results, like transforming each bulldozer&rsquo;s manufacturing year into its age at the time of sale. Essentially, to do well in this competition, one had to fit the noise rather than remove it. This was rather disappointing, as one of the nice things about Kaggle competitions is being able to work on relatively clean data. Anomalies in data included bulldozers that have been running for hundreds of years and machines that got sold years before they were manufactured (impossible for second-hand bulldozers!). 
It is obvious that Fast Iron (the company who sponsored the competition) would have obtained more usable models from this competition if they had spent more time cleaning up the data themselves.</p><p>Throughout the competition I went through several iterations of modelling and data cleaning. My final submission ended up being a linear combination of four models:</p><ul><li><a href=http://scikit-learn.org/stable/modules/ensemble.html#gradient-tree-boosting target=_blank rel=noopener>Gradient boosting machine</a> (GBM) regression on the full dataset</li><li>A linear model on the full dataset</li><li>An ensemble of GBMs, one for each product group (rationale: different product groups represent different bulldozer classes, like track excavators and motor graders, so their prices are not really comparable)</li><li>A similar ensemble, where each product group and sale year has a separate GBM, and earlier years get lower weight than more recent years</li></ul><p>I ended up discarding old training data (before 2000) and the machine IDs (another surprise: even though some machines were sold multiple times, this information was useless). For the GBMs, I treated categorical features as ordinal, which sort of makes sense for many of the features (e.g., model series values are ordered). For the linear model, I just coded them as binary indicators.</p><h3 id=the-most-important-discovery-stochastic-gradient-boosting-bugs>The most important discovery: stochastic gradient boosting bugs<a hidden class=anchor aria-hidden=true href=#the-most-important-discovery-stochastic-gradient-boosting-bugs>#</a></h3><p>This was the first time I used gradient boosting. Since I was using so many different models, it was hard to reliably tune the number of trees, so I figured I&rsquo;d use stochastic gradient boosting and rely on out-of-bag (OOB) samples to set the number of trees. 
This led to me finding a bug in <a href=http://scikit-learn.org target=_blank rel=noopener>scikit-learn</a>: the OOB scores were actually calculated on in-bag samples.</p><p>I <a href=https://github.com/scikit-learn/scikit-learn/issues/1802 target=_blank rel=noopener>reported the issue</a> to the maintainers of scikit-learn and made an attempt at fixing it by skipping trees to obtain the OOB samples. This yielded better results than the buggy version, and in some cases I replaced a plain GBM with an ensemble of four stochastic GBMs with subsample ratio of 0.5 and a different random seed for each one (averaging their outputs).</p><p>This wasn&rsquo;t enough to convince the maintainers of scikit-learn to accept <a href=https://github.com/scikit-learn/scikit-learn/pull/1806 target=_blank rel=noopener>the pull request with my fix</a>, as they didn&rsquo;t like my idea of skipping trees. This is for a good reason — obtaining better results on a single dataset should be insufficient to convince anyone. They ended up fixing the issue by copying the implementation from R&rsquo;s GBM package, which is known to underestimate the number of required trees/boosting iterations (see <a href=http://cran.open-source-solution.org/web/packages/gbm/vignettes/gbm.pdf target=_blank rel=noopener>Section 3.3 in the GBM guide</a>).</p><p>Recently, I had some time to test my tree skipping idea on the toy dataset used in <a href=http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_oob.html target=_blank rel=noopener>the scikit-learn documentation</a>. As the following figure shows, a smoothed variant of my tree skipping idea (TSO in the figure) yields superior results to the scikit-learn/R approach (SKO in the figure). The actual loss doesn&rsquo;t matter — what matters is where it&rsquo;s minimised. 
In this case TSO obtains the closest approximation of the number of iterations to the value that minimises the test error, which is a promising result.</p><figure><a href=gradient-boosting-out-of-bag-experiment-toy-dataset.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,gradient boosting,Kaggle,Kaggle competition,predictive modelling,price forecasting,scikit-learn"><meta name=description content="Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 teams."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)"><meta property="og:description" content="Summary of a Kaggle competition to 
forecast bulldozer sale price, where I finished 9th out of 476 teams."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"><meta property="og:image" content="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-11-19T09:17:34+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg"><meta name=twitter:title content="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)"><meta name=twitter:description content="Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 teams."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","item":"https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","name":"Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)","description":"Summary of a Kaggle competition to forecast bulldozer sale price, where I finished 9th out of 476 
teams.","keywords":["data science","gradient boosting","Kaggle","Kaggle competition","predictive modelling","price forecasting","scikit-learn"],"articleBody":"Messy data, buggy software, but all in all a good learning experience...\nEarly last year, I had some free time on my hands, so I decided to participate in yet another Kaggle competition. Having never done any price forecasting work before, I thought it would be interesting to work on the Blue Book for Bulldozers competition, where the goal was to predict the sale price of auctioned bulldozers. I’ve done alright, finishing 9th out of 476 teams. And the experience did turn out to be interesting, but not for the reasons I expected.\nData and evaluation The competition dataset consists of about 425K historical records of bulldozer sales. The training subset consists of sales from the 1990s through to the end of 2011, with the validation and testing periods being January-April 2012 and May-November 2012 respectively. The goal is to predict the sale price of each bulldozer, given the sale date and venue, and the bulldozer’s features (e.g., model ID, mechanical specifications, and machine-specific data such as machine ID and manufacturing year). Submissions were scored using the RMSLE measure.\nEarly in the competition (before I joined), there were many posts in the forum regarding issues with the data. The organisers responded by posting an appendix to the data, which included the “correct” information. From people’s posts after the competition ended, it seems like using the “correct” data consistently made the results worse. Luckily, I discovered this about a week before the competition ended. Reducing my reliance on the appendix made a huge difference in the performance of my models. 
This discovery was thanks to a forum post, which illustrates the general point on the importance of monitoring the forum in Kaggle competitions.\nMy approach: feature engineering, data splitting, and stochastic gradient boosting Having read the forum discussions on data quality, I assumed that spending time on data cleanup and feature engineering would give me an edge over competitors who focused only on data modelling. It’s well-known that simple models fitted on more/better data tend to yield better results than complex models fitted on less/messy data (aka GIGO – garbage in, garbage out). However, doing data cleaning and feature engineering is less glamorous than building sophisticated models, which is why many people avoid the former.\nSadly, the data was incredibly messy, so most of my cleanup efforts resulted in no improvements. Even intuitive modifications yielded poor results, like transforming each bulldozer’s manufacturing year into its age at the time of sale. Essentially, to do well in this competition, one had to fit the noise rather than remove it. This was rather disappointing, as one of the nice things about Kaggle competitions is being able to work on relatively clean data. Anomalies in data included bulldozers that have been running for hundreds of years and machines that got sold years before they were manufactured (impossible for second-hand bulldozers!). It is obvious that Fast Iron (the company who sponsored the competition) would have obtained more usable models from this competition if they had spent more time cleaning up the data themselves.\nThroughout the competition I went through several iterations of modelling and data cleaning. 
My final submission ended up being a linear combination of four models:\nGradient boosting machine (GBM) regression on the full dataset A linear model on the full dataset An ensemble of GBMs, one for each product group (rationale: different product groups represent different bulldozer classes, like track excavators and motor graders, so their prices are not really comparable) A similar ensemble, where each product group and sale year has a separate GBM, and earlier years get lower weight than more recent years I ended up discarding old training data (before 2000) and the machine IDs (another surprise: even though some machines were sold multiple times, this information was useless). For the GBMs, I treated categorical features as ordinal, which sort of makes sense for many of the features (e.g., model series values are ordered). For the linear model, I just coded them as binary indicators.\nThe most important discovery: stochastic gradient boosting bugs This was the first time I used gradient boosting. Since I was using so many different models, it was hard to reliably tune the number of trees, so I figured I’d use stochastic gradient boosting and rely on out-of-bag (OOB) samples to set the number of trees. This led to me finding a bug in scikit-learn: the OOB scores were actually calculated on in-bag samples.\nI reported the issue to the maintainers of scikit-learn and made an attempt at fixing it by skipping trees to obtain the OOB samples. This yielded better results than the buggy version, and in some cases I replaced a plain GBM with an ensemble of four stochastic GBMs with subsample ratio of 0.5 and a different random seed for each one (averaging their outputs).\nThis wasn’t enough to convince the maintainers of scikit-learn to accept the pull request with my fix, as they didn’t like my idea of skipping trees. This is for a good reason — obtaining better results on a single dataset should be insufficient to convince anyone. 
They ended up fixing the issue by copying the implementation from R’s GBM package, which is known to underestimate the number of required trees/boosting iterations (see Section 3.3 in the GBM guide).\nRecently, I had some time to test my tree skipping idea on the toy dataset used in the scikit-learn documentation. As the following figure shows, a smoothed variant of my tree skipping idea (TSO in the figure) yields superior results to the scikit-learn/R approach (SKO in the figure). The actual loss doesn’t matter — what matters is where it’s minimised. In this case TSO obtains the closest approximation of the number of iterations to the value that minimises the test error, which is a promising result.\nThese results are pretty cool, but this is still just a toy dataset (though repeating the experiment with 100 different random seeds to generate different toy datasets yields similar results). The next steps would be to repeat Ridgeway’s experiments from the GBM guide on multiple datasets to see whether the results generalise well, which will be the topic of a different post. Regardless of the final outcomes, this story illustrates the unexpected paths in which a Kaggle competition can take you. No matter what rank you end up obtaining and regardless of your skill level, there’s always something new to learn.\nUpdate: I ran Ridgway’s experiments. 
The results are discussed in Stochastic Gradient Boosting: Choosing the Best Number of Iterations.\n","wordCount":"1087","inLanguage":"en","image":"https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg","datePublished":"2014-11-19T09:17:34Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" 
y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)</h1><div class=post-meta><span title='2014-11-19 09:17:34 +0000 UTC'>November 19, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-11-19-fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers_hu766b19432f2e7b969d67fa48688a7a26_267258_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg 800w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/noisy-bulldozers.jpg alt width=800 height=261></figure><div class=post-content><p class=intro-note>Messy data, buggy software, but all in all a good learning experience...</p><p>Early last year, I had some free time on my hands, so I decided to participate in yet another Kaggle competition. Having never done any price forecasting work before, I thought it would be interesting to work on the <a href=https://www.kaggle.com/c/bluebook-for-bulldozers target=_blank rel=noopener>Blue Book for Bulldozers competition</a>, where the goal was to predict the sale price of auctioned bulldozers. I&rsquo;ve done alright, finishing 9th out of 476 teams. And the experience did turn out to be interesting, but not for the reasons I expected.</p><h3 id=data-and-evaluation>Data and evaluation<a hidden class=anchor aria-hidden=true href=#data-and-evaluation>#</a></h3><p>The competition dataset consists of about 425K historical records of bulldozer sales. The training subset consists of sales from the 1990s through to the end of 2011, with the validation and testing periods being January-April 2012 and May-November 2012 respectively. The goal is to predict the sale price of each bulldozer, given the sale date and venue, and the bulldozer&rsquo;s features (e.g., model ID, mechanical specifications, and machine-specific data such as machine ID and manufacturing year). Submissions were scored using the <a href=http://www.kaggle.com/wiki/RootMeanSquaredLogarithmicError target=_blank rel=noopener>RMSLE measure</a>.</p><p>Early in the competition (before I joined), there were many posts in the forum regarding issues with the data. The organisers responded by posting an appendix to the data, which included the &ldquo;correct&rdquo; information. 
From people&rsquo;s posts after the competition ended, it seems like using the &ldquo;correct&rdquo; data consistently made the results <strong>worse</strong>. Luckily, I discovered this about a week before the competition ended. Reducing my reliance on the appendix made a huge difference in the performance of my models. This discovery was thanks to a forum post, which illustrates the <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ title="How to (almost) win Kaggle competitions - Tip 9">general point on the importance of monitoring the forum in Kaggle competitions</a>.</p><h3 id=my-approach-feature-engineering-data-splitting-and-stochastic-gradient-boosting>My approach: feature engineering, data splitting, and stochastic gradient boosting<a hidden class=anchor aria-hidden=true href=#my-approach-feature-engineering-data-splitting-and-stochastic-gradient-boosting>#</a></h3><p>Having read the forum discussions on data quality, I assumed that spending time on data cleanup and feature engineering would give me an edge over competitors who focused only on data modelling. It&rsquo;s well-known that simple models fitted on more/better data tend to yield better results than complex models fitted on less/messy data (aka GIGO – garbage in, garbage out). However, doing data cleaning and feature engineering is less glamorous than building sophisticated models, which is why many people avoid the former.</p><p>Sadly, the data was incredibly messy, so most of my cleanup efforts resulted in no improvements. Even intuitive modifications yielded poor results, like transforming each bulldozer&rsquo;s manufacturing year into its age at the time of sale. Essentially, to do well in this competition, one had to fit the noise rather than remove it. This was rather disappointing, as one of the nice things about Kaggle competitions is being able to work on relatively clean data. 
Anomalies in data included bulldozers that have been running for hundreds of years and machines that got sold years before they were manufactured (impossible for second-hand bulldozers!). It is obvious that Fast Iron (the company who sponsored the competition) would have obtained more usable models from this competition if they had spent more time cleaning up the data themselves.</p><p>Throughout the competition I went through several iterations of modelling and data cleaning. My final submission ended up being a linear combination of four models:</p><ul><li><a href=http://scikit-learn.org/stable/modules/ensemble.html#gradient-tree-boosting target=_blank rel=noopener>Gradient boosting machine</a> (GBM) regression on the full dataset</li><li>A linear model on the full dataset</li><li>An ensemble of GBMs, one for each product group (rationale: different product groups represent different bulldozer classes, like track excavators and motor graders, so their prices are not really comparable)</li><li>A similar ensemble, where each product group and sale year has a separate GBM, and earlier years get lower weight than more recent years</li></ul><p>I ended up discarding old training data (before 2000) and the machine IDs (another surprise: even though some machines were sold multiple times, this information was useless). For the GBMs, I treated categorical features as ordinal, which sort of makes sense for many of the features (e.g., model series values are ordered). For the linear model, I just coded them as binary indicators.</p><h3 id=the-most-important-discovery-stochastic-gradient-boosting-bugs>The most important discovery: stochastic gradient boosting bugs<a hidden class=anchor aria-hidden=true href=#the-most-important-discovery-stochastic-gradient-boosting-bugs>#</a></h3><p>This was the first time I used gradient boosting. 
Since I was using so many different models, it was hard to reliably tune the number of trees, so I figured I&rsquo;d use stochastic gradient boosting and rely on out-of-bag (OOB) samples to set the number of trees. This led to me finding a bug in <a href=http://scikit-learn.org target=_blank rel=noopener>scikit-learn</a>: the OOB scores were actually calculated on in-bag samples.</p><p>I <a href=https://github.com/scikit-learn/scikit-learn/issues/1802 target=_blank rel=noopener>reported the issue</a> to the maintainers of scikit-learn and made an attempt at fixing it by skipping trees to obtain the OOB samples. This yielded better results than the buggy version, and in some cases I replaced a plain GBM with an ensemble of four stochastic GBMs with subsample ratio of 0.5 and a different random seed for each one (averaging their outputs).</p><p>This wasn&rsquo;t enough to convince the maintainers of scikit-learn to accept <a href=https://github.com/scikit-learn/scikit-learn/pull/1806 target=_blank rel=noopener>the pull request with my fix</a>, as they didn&rsquo;t like my idea of skipping trees. This is for a good reason — obtaining better results on a single dataset should be insufficient to convince anyone. They ended up fixing the issue by copying the implementation from R&rsquo;s GBM package, which is known to underestimate the number of required trees/boosting iterations (see <a href=http://cran.open-source-solution.org/web/packages/gbm/vignettes/gbm.pdf target=_blank rel=noopener>Section 3.3 in the GBM guide</a>).</p><p>Recently, I had some time to test my tree skipping idea on the toy dataset used in <a href=http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_oob.html target=_blank rel=noopener>the scikit-learn documentation</a>. As the following figure shows, a smoothed variant of my tree skipping idea (TSO in the figure) yields superior results to the scikit-learn/R approach (SKO in the figure). 
The actual loss doesn&rsquo;t matter — what matters is where it&rsquo;s minimised. In this case TSO obtains the closest approximation of the number of iterations to the value that minimises the test error, which is a promising result.</p><figure><a href=gradient-boosting-out-of-bag-experiment-toy-dataset.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/gradient-boosting-out-of-bag-experiment-toy-dataset_hu02dc1ebe47af12a7ec8f5877429b5dec_71277_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/gradient-boosting-out-of-bag-experiment-toy-dataset_hu02dc1ebe47af12a7ec8f5877429b5dec_71277_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/gradient-boosting-out-of-bag-experiment-toy-dataset_hu02dc1ebe47af12a7ec8f5877429b5dec_71277_720x0_resize_box_3.png 720w,
diff --git a/2014/12/15/seo-mostly-about-showing-up/index.html b/2014/12/15/seo-mostly-about-showing-up/index.html
index 0838f682b..58a9c9901 100644
--- a/2014/12/15/seo-mostly-about-showing-up/index.html
+++ b/2014/12/15/seo-mostly-about-showing-up/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>SEO: Mostly about showing up? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="BCRecommender,marketing,search engine optimisation,traction book"><meta name=description content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="SEO: Mostly about showing up?"><meta property="og:description" content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. 
It turns out that thin content is better than no content."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"><meta property="og:image" content="https://yanirseroussi.com/bcrecommender-search-queries.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-12-15T04:25:25+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bcrecommender-search-queries.png"><meta name=twitter:title content="SEO: Mostly about showing up?"><meta name=twitter:description content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"SEO: Mostly about showing up?","item":"https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"SEO: Mostly about showing up?","name":"SEO: Mostly about showing up?","description":"Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content.","keywords":["BCRecommender","marketing","search engine optimisation","traction book"],"articleBody":"In previous posts about getting traction for my Bandcamp recommendations project (BCRecommender), I mentioned search engine optimisation (SEO) as one of the promising traction channels. 
Unfortunately, early efforts yielded negligible traffic – most new visitors came from referrals from blogs and Twitter. It turns out that the problem was not showing up for the SEO game: most of BCRecommender’s pages were blocked for crawling via robots.txt because I was worried that search engines (=Google) would penalise the website for thin/duplicate content.\nRecently, I beefed up most of the pages, created a sitemap, and removed most pages from robots.txt. This resulted in a significant increase in traffic, as illustrated by the above graph. The number of organic impressions went up from less than ten per day to over a thousand. This is expected to go up even further, as only about 10% of pages are indexed. In addition, some traffic went to my staging site because it wasn’t blocked from crawling (I had to set up a new staging site that is password-protected and add a redirect from the old site to the production site – a bit annoying but I couldn’t find a better solution).\nI hope Google won’t suddenly decide that BCRecommender content is not valuable or too thin. The content is automatically generated, which is “bad”, but it doesn’t “consist of paragraphs of random text that make no sense to the reader but which may contain search keywords”. As a (completely unbiased) user, I think it is valuable to find similar albums when searching for an album you like – an example that represents the majority of people that click through to BCRecommender. Judging from the main engagement measure I’m using (time spent on site), a good number of these people are happy with what they find.\nMore updates to come in the future. 
For now, my conclusion is: thin content is better than no content, as long as it’s relevant to what people are searching for and provides real value.\n","wordCount":"333","inLanguage":"en","image":"https://yanirseroussi.com/bcrecommender-search-queries.png","datePublished":"2014-12-15T04:25:25Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" 
x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">SEO: Mostly about showing up?</h1><div class=post-meta><span title='2014-12-15 04:25:25 +0000 UTC'>December 15, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-12-15-seo-mostly-about-showing-up/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/bcrecommender-search-queries.png alt></figure><div class=post-content><p>In previous posts about getting traction for my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations project (BCRecommender)</a>, I mentioned search engine optimisation (SEO) as one of the promising traction channels. Unfortunately, early efforts yielded negligible traffic – <a href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/ title="BCRecommender Traction Update">most new visitors came from referrals from blogs and Twitter</a>. It turns out that the problem was <strong>not showing up for the SEO game</strong>: most of BCRecommender&rsquo;s pages were blocked for crawling via robots.txt because I was worried that search engines (=Google) would penalise the website for <a href="https://support.google.com/webmasters/answer/2604719?hl=en" target=_blank rel=noopener>thin/duplicate content</a>.</p><p>Recently, I beefed up most of the pages, created a sitemap, and removed most pages from robots.txt. 
This resulted in a significant increase in traffic, as illustrated by the above graph. The number of organic impressions went up from less than ten per day to over a thousand. This is expected to go up even further, as only about 10% of pages are indexed. In addition, some traffic went to my staging site because it wasn&rsquo;t blocked from crawling (I had to set up a new staging site that is password-protected and add a redirect from the old site to the production site – a bit annoying but I couldn&rsquo;t find a better solution).</p><p>I hope Google won&rsquo;t suddenly decide that BCRecommender content is not valuable or too thin. The content is automatically generated, which is <a href=https://support.google.com/webmasters/answer/2721306 target=_blank rel=noopener>&ldquo;bad&rdquo;</a>, but it doesn&rsquo;t &ldquo;consist of paragraphs of random text that make no sense to the reader but which may contain search keywords&rdquo;. As a (completely unbiased) user, I think it is valuable to find similar albums when searching for an album you like – an example that represents the majority of people that click through to BCRecommender. Judging from the main engagement measure I&rsquo;m using (time spent on site), a good number of these people are happy with what they find.</p><p>More updates to come in the future. 
For now, my conclusion is: thin content is better than no content, as long as it&rsquo;s relevant to what people are searching for and provides real value.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/search-engine-optimisation/>search engine optimisation</a></li><li><a href=https://yanirseroussi.com/tags/traction-book/>traction book</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on x" href="https://x.com/intent/tweet/?text=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&amp;hashtags=BCRecommender%2cmarketing%2csearchengineoptimisation%2ctractionbook"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&amp;title=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;summary=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&title=SEO%3a%20Mostly%20about%20showing%20up%3f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on whatsapp" href="https://api.whatsapp.com/send?text=SEO%3a%20Mostly%20about%20showing%20up%3f%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 
249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on telegram" href="https://telegram.me/share/url?text=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on ycombinator" href="https://news.ycombinator.com/submitlink?t=SEO%3a%20Mostly%20about%20showing%20up%3f&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="BCRecommender,marketing,search engine optimisation,traction book"><meta name=description content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="SEO: Mostly about showing up?"><meta property="og:description" content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. 
It turns out that thin content is better than no content."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"><meta property="og:image" content="https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/bcrecommender-search-queries.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-12-15T04:25:25+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/bcrecommender-search-queries.png"><meta name=twitter:title content="SEO: Mostly about showing up?"><meta name=twitter:description content="Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"SEO: Mostly about showing up?","item":"https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"SEO: Mostly about showing up?","name":"SEO: Mostly about showing up?","description":"Increasing SEO traffic to BCRecommender by adding content and opening up more pages for crawling. It turns out that thin content is better than no content.","keywords":["BCRecommender","marketing","search engine optimisation","traction book"],"articleBody":"In previous posts about getting traction for my Bandcamp recommendations project (BCRecommender), I mentioned search engine optimisation (SEO) as one of the promising traction channels. 
Unfortunately, early efforts yielded negligible traffic – most new visitors came from referrals from blogs and Twitter. It turns out that the problem was not showing up for the SEO game: most of BCRecommender’s pages were blocked for crawling via robots.txt because I was worried that search engines (=Google) would penalise the website for thin/duplicate content.\nRecently, I beefed up most of the pages, created a sitemap, and removed most pages from robots.txt. This resulted in a significant increase in traffic, as illustrated by the above graph. The number of organic impressions went up from less than ten per day to over a thousand. This is expected to go up even further, as only about 10% of pages are indexed. In addition, some traffic went to my staging site because it wasn’t blocked from crawling (I had to set up a new staging site that is password-protected and add a redirect from the old site to the production site – a bit annoying but I couldn’t find a better solution).\nI hope Google won’t suddenly decide that BCRecommender content is not valuable or too thin. The content is automatically generated, which is “bad”, but it doesn’t “consist of paragraphs of random text that make no sense to the reader but which may contain search keywords”. As a (completely unbiased) user, I think it is valuable to find similar albums when searching for an album you like – an example that represents the majority of people that click through to BCRecommender. Judging from the main engagement measure I’m using (time spent on site), a good number of these people are happy with what they find.\nMore updates to come in the future. 
For now, my conclusion is: thin content is better than no content, as long as it’s relevant to what people are searching for and provides real value.\n","wordCount":"333","inLanguage":"en","image":"https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/bcrecommender-search-queries.png","datePublished":"2014-12-15T04:25:25Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" 
x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">SEO: Mostly about showing up?</h1><div class=post-meta><span title='2014-12-15 04:25:25 +0000 UTC'>December 15, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-12-15-seo-mostly-about-showing-up/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/bcrecommender-search-queries.png alt></figure><div class=post-content><p>In previous posts about getting traction for my <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp recommendations project (BCRecommender)</a>, I mentioned search engine optimisation (SEO) as one of the promising traction channels. Unfortunately, early efforts yielded negligible traffic – <a href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/ title="BCRecommender Traction Update">most new visitors came from referrals from blogs and Twitter</a>. 
It turns out that the problem was <strong>not showing up for the SEO game</strong>: most of BCRecommender&rsquo;s pages were blocked for crawling via robots.txt because I was worried that search engines (=Google) would penalise the website for <a href="https://support.google.com/webmasters/answer/2604719?hl=en" target=_blank rel=noopener>thin/duplicate content</a>.</p><p>Recently, I beefed up most of the pages, created a sitemap, and removed most pages from robots.txt. This resulted in a significant increase in traffic, as illustrated by the above graph. The number of organic impressions went up from less than ten per day to over a thousand. This is expected to go up even further, as only about 10% of pages are indexed. In addition, some traffic went to my staging site because it wasn&rsquo;t blocked from crawling (I had to set up a new staging site that is password-protected and add a redirect from the old site to the production site – a bit annoying but I couldn&rsquo;t find a better solution).</p><p>I hope Google won&rsquo;t suddenly decide that BCRecommender content is not valuable or too thin. The content is automatically generated, which is <a href=https://support.google.com/webmasters/answer/2721306 target=_blank rel=noopener>&ldquo;bad&rdquo;</a>, but it doesn&rsquo;t &ldquo;consist of paragraphs of random text that make no sense to the reader but which may contain search keywords&rdquo;. As a (completely unbiased) user, I think it is valuable to find similar albums when searching for an album you like – an example that represents the majority of people that click through to BCRecommender. Judging from the main engagement measure I&rsquo;m using (time spent on site), a good number of these people are happy with what they find.</p><p>More updates to come in the future. 
For now, my conclusion is: thin content is better than no content, as long as it&rsquo;s relevant to what people are searching for and provides real value.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/search-engine-optimisation/>search engine optimisation</a></li><li><a href=https://yanirseroussi.com/tags/traction-book/>traction book</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on x" href="https://x.com/intent/tweet/?text=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&amp;hashtags=BCRecommender%2cmarketing%2csearchengineoptimisation%2ctractionbook"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&amp;title=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;summary=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;source=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f&title=SEO%3a%20Mostly%20about%20showing%20up%3f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on whatsapp" href="https://api.whatsapp.com/send?text=SEO%3a%20Mostly%20about%20showing%20up%3f%20-%20https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 
249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? on telegram" href="https://telegram.me/share/url?text=SEO%3a%20Mostly%20about%20showing%20up%3f&amp;url=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share SEO: Mostly about showing up? 
on ycombinator" href="https://news.ycombinator.com/submitlink?t=SEO%3a%20Mostly%20about%20showing%20up%3f&u=https%3a%2f%2fyanirseroussi.com%2f2014%2f12%2f15%2fseo-mostly-about-showing-up%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2015/01/15/automating-parse-com-bulk-data-imports/index.html b/2015/01/15/automating-parse-com-bulk-data-imports/index.html
index bcbf31131..dc7264b8f 100644
--- a/2015/01/15/automating-parse-com-bulk-data-imports/index.html
+++ b/2015/01/15/automating-parse-com-bulk-data-imports/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Automating Parse.com bulk data imports | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="DevOps,parse.com,PhantomJS,software engineering"><meta name=description content="A script for importing data into the Parse backend-as-a-service."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Automating Parse.com bulk data imports"><meta property="og:description" content="A script for importing data into the Parse backend-as-a-service."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"><meta 
property="og:image" content="https://yanirseroussi.com/parse-hosting.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-01-15T04:41:16+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/parse-hosting.jpg"><meta name=twitter:title content="Automating Parse.com bulk data imports"><meta name=twitter:description content="A script for importing data into the Parse backend-as-a-service."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Automating Parse.com bulk data imports","item":"https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Automating Parse.com bulk data imports","name":"Automating Parse.com bulk data imports","description":"A script for importing data into the Parse backend-as-a-service.","keywords":["DevOps","parse.com","PhantomJS","software engineering"],"articleBody":"Parse is a great backend-as-a-service (BaaS) product. It removes much of the hassle involved in backend devops with its web hosting service, SDKs for all the major mobile platforms, and a generous free tier. Parse does have its share of flaws, including various reliability issues (which seem to be getting rarer), and limitations on what you can do (which is reasonable price to pay for working within a sandboxed environment). One such limitation is the lack of APIs to perform bulk data imports. This post introduces my workaround for this limitation (tl;dr: it’s a PhantomJS script).\nUpdate: The script no longer works due to changes to Parse’s website. 
I won’t be fixing it since I’ve migrated my projects off the platform. If you fix it, let me know and I’ll post a link to the updated script here.\nI use Parse for two of my projects: BCRecommender and Price Dingo. In both cases, some of the data is generated outside Parse by a Python backend. Doing all the data processing within Parse is not a viable option, so a solution for importing this data into Parse is required.\nMy original solution for data import was using the Parse REST API via ParsePy. The problem with this solution is that Parse billing is done on a requests/second basis. The free tier includes 30 requests/second, so importing BCRecommender’s ~million objects takes about nine hours when operating at maximum capacity. However, operating at maximum capacity causes other client requests to be dropped (i.e., real users suffer). Hence, some sort of rate limiting is required, which makes the sync process take even longer.\nI thought that using batch requests would speed up the process, but it actually slowed it down! This is because batch requests are billed according to the number of sub-requests, so making even one successful batch request per second with the maximum number of sub-requests (50) causes more requests to be dropped. I implemented some code to retry failed requests, but the whole process was just too brittle.\nA few months ago I discovered that Parse supports bulk data import via the web interface (with no API support). This feature comes with the caveat that existing collections can’t be updated: a new collection must be created. This is actually a good thing, as it essentially makes the collections immutable. And immutability makes many things easier.\nBCRecommender data gets updated once a month, so I was happy with manually importing the data via the web interface. As a price comparison engine, Price Dingo’s data changes more frequently, so manual updates are out of the question. 
For Price Dingo to be hosted on Parse, I had to find a way to automate bulk imports. Some people suggest emulating the requests made by the web interface, but this requires relying on hardcoded cookie and CSRF token data, which may change at any time. A more robust solution would be to scriptify the manual actions, but how? PhantomJS, that’s how.\nI ended up implementing a PhantomJS script that logs in as the user and uploads a dump to a given collection. This script is available on GitHub Gist. To run it, simply install PhantomJS and run:\n$ phantomjs --ssl-protocol any \\ import-parse-class.js See the script’s source for a detailed explanation of the command-line arguments.\nIt is worth noting that the script doesn’t do any post-upload verification on the collection. This is done by an extra bit of Python code that verifies that the collection has the expected number of objects, and tries to query the collection sorted by all the keys that are supposed to be indexed (for large collections, it takes Parse a while to index all the fields, which may result in timeouts). Once these conditions are fulfilled, the Parse hosting code is updated to point to the new collection. For security, I added a bot user that has access only to the Parse app that it needs to update. Unlike the root user, this bot user can’t delete the app. As the config file contains the bot’s password, it should be encrypted and stored in a safe place (like the Parse master key).\nThat’s it! I hope that other people would find this solution useful. 
Any suggestions/comments/issues are very welcome.\nImage source: Parse Blog.\n","wordCount":"715","inLanguage":"en","image":"https://yanirseroussi.com/parse-hosting.jpg","datePublished":"2015-01-15T04:41:16Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Automating Parse.com bulk data imports</h1><div class=post-meta><span title='2015-01-15 04:41:16 +0000 UTC'>January 15, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-01-15-automating-parse-com-bulk-data-imports/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg 1080w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg alt width=1080 height=440></figure><div class=post-content><p><a href=http://parse.com target=_blank rel=noopener>Parse</a> is a great backend-as-a-service (BaaS) product. 
It removes much of the hassle involved in backend devops with its web hosting service, SDKs for all the major mobile platforms, and a generous free tier. Parse does have its share of flaws, including various reliability issues (which seem to be getting rarer), and limitations on what you can do (which is reasonable price to pay for working within a sandboxed environment). One such limitation is the lack of APIs to perform bulk data imports. This post introduces my workaround for this limitation (tl;dr: it&rsquo;s a <a href=https://gist.github.com/yanirs/eddedf152f42c1ee02b2 target=_blank rel=noopener>PhantomJS script</a>).</p><p><strong>Update:</strong> The script no longer works due to changes to Parse&rsquo;s website. I won&rsquo;t be fixing it since <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>I&rsquo;ve migrated my projects off the platform</a>. If you fix it, let me know and I&rsquo;ll post a link to the updated script here.</p><p>I use Parse for two of my projects: <a title="Bandcamp recommendations based on your fan profile" href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender</a> and Price Dingo. In both cases, some of the data is <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/ title="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)">generated outside Parse by a Python backend</a>. Doing all the data processing within Parse is not a viable option, so a solution for importing this data into Parse is required.</p><p>My original solution for data import was using the Parse REST API via <a href=https://github.com/dgrtwo/ParsePy target=_blank rel=noopener>ParsePy</a>. The problem with this solution is that Parse billing is done on a requests/second basis. The free tier includes 30 requests/second, so importing BCRecommender&rsquo;s ~million objects takes about nine hours when operating at maximum capacity. 
However, operating at maximum capacity causes other client requests to be dropped (i.e., real users suffer). Hence, some sort of rate limiting is required, which makes the sync process take even longer.</p><p>I thought that using <a href=https://parse.com/docs/rest#objects-batch target=_blank rel=noopener>batch requests</a> would speed up the process, but it actually slowed it down! This is because batch requests are billed according to the number of sub-requests, so making even one successful batch request per second with the maximum number of sub-requests (50) causes more requests to be dropped. I implemented some code to retry failed requests, but the whole process was just too brittle.</p><p>A few months ago I discovered that Parse supports <a href=https://parse.com/docs/data#data-import target=_blank rel=noopener>bulk data import via the web interface</a> (with no API support). This feature comes with the caveat that existing collections can&rsquo;t be updated: a new collection must be created. This is actually a good thing, as it essentially makes the collections immutable. And <a href=http://en.wikipedia.org/wiki/Immutable_object target=_blank rel=noopener>immutability makes many things easier</a>.</p><p>BCRecommender data gets updated once a month, so I was happy with manually importing the data via the web interface. As a price comparison engine, Price Dingo&rsquo;s data changes more frequently, so manual updates are out of the question. For Price Dingo to be hosted on Parse, I had to find a way to automate bulk imports. Some people suggest <a href=https://www.parse.com/questions/programmatically-create-classes-import-json target=_blank rel=noopener>emulating the requests made by the web interface</a>, but this requires relying on hardcoded cookie and CSRF token data, which may change at any time. A more robust solution would be to scriptify the manual actions, but how? 
<a href=http://phantomjs.org/ target=_blank rel=noopener>PhantomJS</a>, that&rsquo;s how.</p><p>I ended up implementing a PhantomJS script that logs in as the user and uploads a dump to a given collection. This script is <a href=https://gist.github.com/yanirs/eddedf152f42c1ee02b2 target=_blank rel=noopener>available on GitHub Gist</a>. To run it, simply install PhantomJS and run:</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-bash data-lang=bash><span style=display:flex><span>$ phantomjs --ssl-protocol any <span style=color:#ae81ff>\
+<meta name=keywords content="DevOps,parse.com,PhantomJS,software engineering"><meta name=description content="A script for importing data into the Parse backend-as-a-service."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Automating Parse.com bulk data imports"><meta property="og:description" content="A script for importing data into the Parse backend-as-a-service."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"><meta 
property="og:image" content="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-01-15T04:41:16+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg"><meta name=twitter:title content="Automating Parse.com bulk data imports"><meta name=twitter:description content="A script for importing data into the Parse backend-as-a-service."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Automating Parse.com bulk data imports","item":"https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Automating Parse.com bulk data imports","name":"Automating Parse.com bulk data imports","description":"A script for importing data into the Parse backend-as-a-service.","keywords":["DevOps","parse.com","PhantomJS","software engineering"],"articleBody":"Parse is a great backend-as-a-service (BaaS) product. It removes much of the hassle involved in backend devops with its web hosting service, SDKs for all the major mobile platforms, and a generous free tier. Parse does have its share of flaws, including various reliability issues (which seem to be getting rarer), and limitations on what you can do (which is reasonable price to pay for working within a sandboxed environment). One such limitation is the lack of APIs to perform bulk data imports. 
This post introduces my workaround for this limitation (tl;dr: it’s a PhantomJS script).\nUpdate: The script no longer works due to changes to Parse’s website. I won’t be fixing it since I’ve migrated my projects off the platform. If you fix it, let me know and I’ll post a link to the updated script here.\nI use Parse for two of my projects: BCRecommender and Price Dingo. In both cases, some of the data is generated outside Parse by a Python backend. Doing all the data processing within Parse is not a viable option, so a solution for importing this data into Parse is required.\nMy original solution for data import was using the Parse REST API via ParsePy. The problem with this solution is that Parse billing is done on a requests/second basis. The free tier includes 30 requests/second, so importing BCRecommender’s ~million objects takes about nine hours when operating at maximum capacity. However, operating at maximum capacity causes other client requests to be dropped (i.e., real users suffer). Hence, some sort of rate limiting is required, which makes the sync process take even longer.\nI thought that using batch requests would speed up the process, but it actually slowed it down! This is because batch requests are billed according to the number of sub-requests, so making even one successful batch request per second with the maximum number of sub-requests (50) causes more requests to be dropped. I implemented some code to retry failed requests, but the whole process was just too brittle.\nA few months ago I discovered that Parse supports bulk data import via the web interface (with no API support). This feature comes with the caveat that existing collections can’t be updated: a new collection must be created. This is actually a good thing, as it essentially makes the collections immutable. And immutability makes many things easier.\nBCRecommender data gets updated once a month, so I was happy with manually importing the data via the web interface. 
As a price comparison engine, Price Dingo’s data changes more frequently, so manual updates are out of the question. For Price Dingo to be hosted on Parse, I had to find a way to automate bulk imports. Some people suggest emulating the requests made by the web interface, but this requires relying on hardcoded cookie and CSRF token data, which may change at any time. A more robust solution would be to scriptify the manual actions, but how? PhantomJS, that’s how.\nI ended up implementing a PhantomJS script that logs in as the user and uploads a dump to a given collection. This script is available on GitHub Gist. To run it, simply install PhantomJS and run:\n$ phantomjs --ssl-protocol any \\ import-parse-class.js See the script’s source for a detailed explanation of the command-line arguments.\nIt is worth noting that the script doesn’t do any post-upload verification on the collection. This is done by an extra bit of Python code that verifies that the collection has the expected number of objects, and tries to query the collection sorted by all the keys that are supposed to be indexed (for large collections, it takes Parse a while to index all the fields, which may result in timeouts). Once these conditions are fulfilled, the Parse hosting code is updated to point to the new collection. For security, I added a bot user that has access only to the Parse app that it needs to update. Unlike the root user, this bot user can’t delete the app. As the config file contains the bot’s password, it should be encrypted and stored in a safe place (like the Parse master key).\nThat’s it! I hope that other people would find this solution useful. 
Any suggestions/comments/issues are very welcome.\nImage source: Parse Blog.\n","wordCount":"715","inLanguage":"en","image":"https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg","datePublished":"2015-01-15T04:41:16Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Automating Parse.com bulk data imports</h1><div class=post-meta><span title='2015-01-15 04:41:16 +0000 UTC'>January 15, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-01-15-automating-parse-com-bulk-data-imports/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting_hu8a5ad3bca5f9ab53f157c3c03f7dc2d6_52832_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg 1080w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/parse-hosting.jpg alt width=1080 height=440></figure><div class=post-content><p><a href=http://parse.com target=_blank rel=noopener>Parse</a> is a great backend-as-a-service (BaaS) product. 
It removes much of the hassle involved in backend devops with its web hosting service, SDKs for all the major mobile platforms, and a generous free tier. Parse does have its share of flaws, including various reliability issues (which seem to be getting rarer), and limitations on what you can do (which is reasonable price to pay for working within a sandboxed environment). One such limitation is the lack of APIs to perform bulk data imports. This post introduces my workaround for this limitation (tl;dr: it&rsquo;s a <a href=https://gist.github.com/yanirs/eddedf152f42c1ee02b2 target=_blank rel=noopener>PhantomJS script</a>).</p><p><strong>Update:</strong> The script no longer works due to changes to Parse&rsquo;s website. I won&rsquo;t be fixing it since <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>I&rsquo;ve migrated my projects off the platform</a>. If you fix it, let me know and I&rsquo;ll post a link to the updated script here.</p><p>I use Parse for two of my projects: <a title="Bandcamp recommendations based on your fan profile" href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender</a> and Price Dingo. In both cases, some of the data is <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/ title="Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)">generated outside Parse by a Python backend</a>. Doing all the data processing within Parse is not a viable option, so a solution for importing this data into Parse is required.</p><p>My original solution for data import was using the Parse REST API via <a href=https://github.com/dgrtwo/ParsePy target=_blank rel=noopener>ParsePy</a>. The problem with this solution is that Parse billing is done on a requests/second basis. The free tier includes 30 requests/second, so importing BCRecommender&rsquo;s ~million objects takes about nine hours when operating at maximum capacity. 
However, operating at maximum capacity causes other client requests to be dropped (i.e., real users suffer). Hence, some sort of rate limiting is required, which makes the sync process take even longer.</p><p>I thought that using <a href=https://parse.com/docs/rest#objects-batch target=_blank rel=noopener>batch requests</a> would speed up the process, but it actually slowed it down! This is because batch requests are billed according to the number of sub-requests, so making even one successful batch request per second with the maximum number of sub-requests (50) causes more requests to be dropped. I implemented some code to retry failed requests, but the whole process was just too brittle.</p><p>A few months ago I discovered that Parse supports <a href=https://parse.com/docs/data#data-import target=_blank rel=noopener>bulk data import via the web interface</a> (with no API support). This feature comes with the caveat that existing collections can&rsquo;t be updated: a new collection must be created. This is actually a good thing, as it essentially makes the collections immutable. And <a href=http://en.wikipedia.org/wiki/Immutable_object target=_blank rel=noopener>immutability makes many things easier</a>.</p><p>BCRecommender data gets updated once a month, so I was happy with manually importing the data via the web interface. As a price comparison engine, Price Dingo&rsquo;s data changes more frequently, so manual updates are out of the question. For Price Dingo to be hosted on Parse, I had to find a way to automate bulk imports. Some people suggest <a href=https://www.parse.com/questions/programmatically-create-classes-import-json target=_blank rel=noopener>emulating the requests made by the web interface</a>, but this requires relying on hardcoded cookie and CSRF token data, which may change at any time. A more robust solution would be to scriptify the manual actions, but how? 
<a href=http://phantomjs.org/ target=_blank rel=noopener>PhantomJS</a>, that&rsquo;s how.</p><p>I ended up implementing a PhantomJS script that logs in as the user and uploads a dump to a given collection. This script is <a href=https://gist.github.com/yanirs/eddedf152f42c1ee02b2 target=_blank rel=noopener>available on GitHub Gist</a>. To run it, simply install PhantomJS and run:</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-bash data-lang=bash><span style=display:flex><span>$ phantomjs --ssl-protocol any <span style=color:#ae81ff>\
 </span></span></span><span style=display:flex><span><span style=color:#ae81ff></span>    import-parse-class.js &lt;configFile&gt; &lt;dumpFile&gt; &lt;collectionName&gt;
 </span></span></code></pre></div><p><a href=https://gist.github.com/yanirs/eddedf152f42c1ee02b2 target=_blank rel=noopener>See the script&rsquo;s source</a> for a detailed explanation of the command-line arguments.</p><p>It is worth noting that the script doesn&rsquo;t do any post-upload verification on the collection. This is done by an extra bit of Python code that verifies that the collection has the expected number of objects, and tries to query the collection sorted by all the keys that are supposed to be indexed (for large collections, it takes Parse a while to index all the fields, which may result in timeouts). Once these conditions are fulfilled, the Parse hosting code is updated to point to the new collection. For security, I added a bot user that has access only to the Parse app that it needs to update. Unlike the root user, this bot user can&rsquo;t delete the app. As the config file contains the bot&rsquo;s password, it should be encrypted and stored in a safe place (<a href=https://parse.com/docs/data#security target=_blank rel=noopener>like the Parse master key</a>).</p><p>That&rsquo;s it! I hope that other people would find this solution useful. 
Any suggestions/comments/issues are very welcome.</p><p><small><br>Image source: <a href=http://blog.parse.com/2013/05/07/goodbye-web-servers-hello-parse-hosting/ target=_blank rel=noopener>Parse Blog</a>.<br></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/devops/>DevOps</a></li><li><a href=https://yanirseroussi.com/tags/parse.com/>parse.com</a></li><li><a href=https://yanirseroussi.com/tags/phantomjs/>PhantomJS</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on x" href="https://x.com/intent/tweet/?text=Automating%20Parse.com%20bulk%20data%20imports&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f&amp;hashtags=DevOps%2cparse.com%2cPhantomJS%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f&amp;title=Automating%20Parse.com%20bulk%20data%20imports&amp;summary=Automating%20Parse.com%20bulk%20data%20imports&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 
512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f&title=Automating%20Parse.com%20bulk%20data%20imports"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 
21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on whatsapp" href="https://api.whatsapp.com/send?text=Automating%20Parse.com%20bulk%20data%20imports%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 
179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on telegram" href="https://telegram.me/share/url?text=Automating%20Parse.com%20bulk%20data%20imports&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 
00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Automating Parse.com bulk data imports on ycombinator" href="https://news.ycombinator.com/submitlink?t=Automating%20Parse.com%20bulk%20data%20imports&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f15%2fautomating-parse-com-bulk-data-imports%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-556><div class=comment-header><a href=#comment-556><img class=comment-avatar src="https://www.gravatar.com/avatar/6fcd5405112d9893195e7c3fa29a5715?s=50"><p class=comment-info><strong>Walter</strong><br><small>2015-07-30 08:22:30</small></p></a></div><div class="comment-body post-content">Hi, very nice trick! Trying to implement this as we speak, does this code still work? I get to the collections page, but I don&rsquo;t think the upload is working. I&rsquo;m new to Phantomjs.
diff --git a/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html b/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html
index 6447eb4a6..13b778823 100644
--- a/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html
+++ b/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,Kaggle,Kaggle competition,machine learning,predictive modelling,search engine optimisation"><meta name=description content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)"><meta property="og:description" content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"><meta property="og:image" content="https://yanirseroussi.com/artificial-intelligence.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-01-29T10:37:39+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/artificial-intelligence.jpg"><meta name=twitter:title content="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)"><meta name=twitter:description content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)","item":"https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)","name":"Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)","description":"Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams).","keywords":["data science","Kaggle","Kaggle competition","machine learning","predictive modelling","search engine optimisation"],"articleBody":"About a year ago, I participated in the Yandex search personalisation Kaggle competition. I started off as a solo competitor, and then added a few Kaggle newbies to the team as part of a program I was running for the Sydney Data Science Meetup. My team hasn’t done too badly, finishing 9th out of 194 teams. As is usually the case with Kaggle competitions, the most valuable part was the lessons learned from the experience. In this case, the lessons go beyond the usual data science skills, and include some insights that are relevant to search engine optimisation (SEO) and privacy. This post describes the competition setup and covers the more general insights. A follow-up post will cover the technical side of our approach.\nThe data Yandex is the leading search engine in Russia. For the competition, they supplied a dataset that consists of log data of search activity from a single large city, which represents one month of search activity (excluding popular queries). In total, the dataset contains about 21M unique queries, 700M unique urls, 6M unique users, and 35M search sessions. 
This is a relatively-big dataset for a Kaggle competition (the training file is about 16GB uncompressed), but it’s really rather small in comparison to Yandex’s overall search volume and tiny compared to what Google handles.\nThe data was anonymised, so a sample looks like this (see full description of the data format – the example and its description are taken from there):\n744899 M 23 123123123 744899 0 Q 0 192902 4857,3847,2939 632428,2384 309585,28374 319567,38724 6547,28744 20264,2332 3094446,34535 90,21 841,231 8344,2342 119571,45767 744899 1403 C 0 632428 These records describe the session (SessionID = 744899) of the user with USERID 123123123, performed on the 23rd day of the dataset. The user submitted the query with QUERYID 192902, which contains terms with TermIDs 4857,3847,2939. The URL with URLID 632428 placed on the domain DomainID 2384 is the top result on the corresponding SERP. 1403 units of time after beginning of the session the user clicked on the result with URLID 632428 (ranked first in the list).\nWhile this may seem daunting at first, the data is actually quite simple. For each search session, we know the user, the queries they’ve made, which URLs and domains were returned in the SERP (search engine result page), which results they’ve clicked, and at what point in time the queries and clicks happened.\nGoal and evaluation The goal of the competition is to rerank the results in each SERP such that the highest-ranking documents are those that the user would find most relevant. As the name of the competition suggests, personalising the results is key, but non-personalised approaches were also welcome (and actually worked quite well).\nOne question that arises is how to tell from this data which results the user finds relevant. In this competition, the results were labelled as either irrelevant (0), relevant (1), or highly relevant (2). 
Relevance is a function of clicks and dwell time, where dwell time is the time spent on the result (determined by the time that passed until the next query or click). Irrelevant results are ones that weren’t clicked, or those for which the dwell time is less than 50 (the time unit is left unspecified). Relevant results are those that were clicked and have dwell time of 50 to 399. Highly relevant results have dwell time of at least 400, or were clicked as the last action in the session (i.e., it is assumed the user finished the session satisfied with the results rather than left because they couldn’t find what they were looking for).\nThis approach to determining relevance has some obvious flaws, but it apparently correlates well with actual user satisfaction with search results.\nGiven the above definition of relevance, one can quantify how well a reranking method improves the relevance of the results. For this competition, the organisers chose the normalised discounted cumulative gain (NDCG) measure, which is a fancy name for a measure that, in the words of Wikipedia, encodes the assumptions that:\nHighly relevant documents are more useful when appearing earlier in a search engine result list (have higher ranks) Highly relevant documents are more useful than marginally relevant documents, which are in turn more useful than irrelevant documents. SEO insights and other thoughts A key insight that is relevant to SEO and privacy, is that even without considering browser-based tracking and tools like Google Analytics (which may or may not be used by Google to rerank search results), search engines can infer a lot about user behaviour on other sites, just based on user interaction with the SERP. 
So if your users bounce quickly because your website is slow to load or ranks highly for irrelevant queries, the search engine can know that, and will probably penalise you accordingly.\nThis works both ways, though, and is evident even on search engines that don’t track personal information. Just try searching for “f” or “fa” or “fac” using DuckDuckGo, Google, Bing, Yahoo, or even Yandex. Facebook will be one of the top results (most often the first one), probably just because people tend to search for or visit Facebook after searching for one of those terms by mistake. So if your website ranks poorly for a term for which it should rank well, and your users behave accordingly (because, for example, they’re searching for your website specifically), you may magically end up with better ranking without any changes to inbound links or to your site.\nAnother thing that is demonstrated by this competition’s dataset is just how much data search engines consider when determining ranking. The dataset is just a sample of logs for one city for one month. I don’t like throwing the words “big data” around, but the full volume of data is pretty big. Too big for anyone to grasp and fully understand how exactly search engines work, and this includes the people who build them. What’s worth keeping in mind is that for all major search engines, the user is the product that they sell to advertisers, so keeping the users happy is key. Any changes made to the underlying algorithms are usually done with the end-user in mind, because not making such changes may kill the search engine (remember AltaVista?). Further, personalisation means that different users see different results for the same query. 
So my feeling is that it’s somewhat futile to do any SEO beyond making the website understandable by search engines, acquiring legitimate links, and just building a website that people would want to visit.\nNext steps With those thoughts out of the way, it’s time to describe the way we addressed the challenge. This is covered in the next post, Learning to rank for personalised search.\n","wordCount":"1126","inLanguage":"en","image":"https://yanirseroussi.com/artificial-intelligence.jpg","datePublished":"2015-01-29T10:37:39Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)</h1><div class=post-meta><span title='2015-01-29 10:37:39 +0000 UTC'>January 29, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-01-29-is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg 1568w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg alt width=1568 height=1051></figure><div class=post-content><p>About a year ago, I participated in the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge target=_blank rel=noopener>Yandex search personalisation Kaggle competition</a>. I started off as a solo competitor, and then added a few Kaggle newbies to the team as part of a program I was running for the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Sydney Data Science Meetup</a>. My team hasn&rsquo;t done too badly, finishing 9th out of 194 teams. As is usually the case with Kaggle competitions, the most valuable part was the lessons learned from the experience. In this case, the lessons go beyond the usual data science skills, and include some insights that are relevant to search engine optimisation (SEO) and privacy. 
This post describes the competition setup and covers the more general insights. A follow-up post will cover the technical side of our approach.</p><h3 id=the-data>The data<a hidden class=anchor aria-hidden=true href=#the-data>#</a></h3><p>Yandex is the leading search engine in Russia. For the competition, they <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/data target=_blank rel=noopener>supplied a dataset</a> that consists of log data of search activity from a single large city, which represents one month of search activity (excluding popular queries). In total, the dataset contains about 21M unique queries, 700M unique urls, 6M unique users, and 35M search sessions. This is a relatively-big dataset for a Kaggle competition (the training file is about 16GB uncompressed), but it&rsquo;s really rather small in comparison to Yandex&rsquo;s overall search volume and tiny compared to what Google handles.</p><p>The data was anonymised, so a sample looks like this (see <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/logs-format target=_blank rel=noopener>full description of the data format</a> – the example and its description are taken from there):</p><blockquote><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-text data-lang=text><span style=display:flex><span>744899 M 23 123123123
+<meta name=keywords content="data science,Kaggle,Kaggle competition,machine learning,predictive modelling,search engine optimisation"><meta name=description content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)"><meta property="og:description" content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"><meta property="og:image" content="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-01-29T10:37:39+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg"><meta name=twitter:title content="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)"><meta name=twitter:description content="Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)","item":"https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)","name":"Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)","description":"Insights on search personalisation and SEO from participating in a Kaggle competition (finished 9th out of 194 teams).","keywords":["data science","Kaggle","Kaggle competition","machine learning","predictive modelling","search engine optimisation"],"articleBody":"About a year ago, I participated in the Yandex search personalisation Kaggle competition. I started off as a solo competitor, and then added a few Kaggle newbies to the team as part of a program I was running for the Sydney Data Science Meetup. My team hasn’t done too badly, finishing 9th out of 194 teams. As is usually the case with Kaggle competitions, the most valuable part was the lessons learned from the experience. In this case, the lessons go beyond the usual data science skills, and include some insights that are relevant to search engine optimisation (SEO) and privacy. This post describes the competition setup and covers the more general insights. A follow-up post will cover the technical side of our approach.\nThe data Yandex is the leading search engine in Russia. For the competition, they supplied a dataset that consists of log data of search activity from a single large city, which represents one month of search activity (excluding popular queries). In total, the dataset contains about 21M unique queries, 700M unique urls, 6M unique users, and 35M search sessions. 
This is a relatively-big dataset for a Kaggle competition (the training file is about 16GB uncompressed), but it’s really rather small in comparison to Yandex’s overall search volume and tiny compared to what Google handles.\nThe data was anonymised, so a sample looks like this (see full description of the data format – the example and its description are taken from there):\n744899 M 23 123123123 744899 0 Q 0 192902 4857,3847,2939 632428,2384 309585,28374 319567,38724 6547,28744 20264,2332 3094446,34535 90,21 841,231 8344,2342 119571,45767 744899 1403 C 0 632428 These records describe the session (SessionID = 744899) of the user with USERID 123123123, performed on the 23rd day of the dataset. The user submitted the query with QUERYID 192902, which contains terms with TermIDs 4857,3847,2939. The URL with URLID 632428 placed on the domain DomainID 2384 is the top result on the corresponding SERP. 1403 units of time after beginning of the session the user clicked on the result with URLID 632428 (ranked first in the list).\nWhile this may seem daunting at first, the data is actually quite simple. For each search session, we know the user, the queries they’ve made, which URLs and domains were returned in the SERP (search engine result page), which results they’ve clicked, and at what point in time the queries and clicks happened.\nGoal and evaluation The goal of the competition is to rerank the results in each SERP such that the highest-ranking documents are those that the user would find most relevant. As the name of the competition suggests, personalising the results is key, but non-personalised approaches were also welcome (and actually worked quite well).\nOne question that arises is how to tell from this data which results the user finds relevant. In this competition, the results were labelled as either irrelevant (0), relevant (1), or highly relevant (2). 
Relevance is a function of clicks and dwell time, where dwell time is the time spent on the result (determined by the time that passed until the next query or click). Irrelevant results are ones that weren’t clicked, or those for which the dwell time is less than 50 (the time unit is left unspecified). Relevant results are those that were clicked and have dwell time of 50 to 399. Highly relevant results have dwell time of at least 400, or were clicked as the last action in the session (i.e., it is assumed the user finished the session satisfied with the results rather than left because they couldn’t find what they were looking for).\nThis approach to determining relevance has some obvious flaws, but it apparently correlates well with actual user satisfaction with search results.\nGiven the above definition of relevance, one can quantify how well a reranking method improves the relevance of the results. For this competition, the organisers chose the normalised discounted cumulative gain (NDCG) measure, which is a fancy name for a measure that, in the words of Wikipedia, encodes the assumptions that:\nHighly relevant documents are more useful when appearing earlier in a search engine result list (have higher ranks) Highly relevant documents are more useful than marginally relevant documents, which are in turn more useful than irrelevant documents. SEO insights and other thoughts A key insight that is relevant to SEO and privacy, is that even without considering browser-based tracking and tools like Google Analytics (which may or may not be used by Google to rerank search results), search engines can infer a lot about user behaviour on other sites, just based on user interaction with the SERP. 
So if your users bounce quickly because your website is slow to load or ranks highly for irrelevant queries, the search engine can know that, and will probably penalise you accordingly.\nThis works both ways, though, and is evident even on search engines that don’t track personal information. Just try searching for “f” or “fa” or “fac” using DuckDuckGo, Google, Bing, Yahoo, or even Yandex. Facebook will be one of the top results (most often the first one), probably just because people tend to search for or visit Facebook after searching for one of those terms by mistake. So if your website ranks poorly for a term for which it should rank well, and your users behave accordingly (because, for example, they’re searching for your website specifically), you may magically end up with better ranking without any changes to inbound links or to your site.\nAnother thing that is demonstrated by this competition’s dataset is just how much data search engines consider when determining ranking. The dataset is just a sample of logs for one city for one month. I don’t like throwing the words “big data” around, but the full volume of data is pretty big. Too big for anyone to grasp and fully understand how exactly search engines work, and this includes the people who build them. What’s worth keeping in mind is that for all major search engines, the user is the product that they sell to advertisers, so keeping the users happy is key. Any changes made to the underlying algorithms are usually done with the end-user in mind, because not making such changes may kill the search engine (remember AltaVista?). Further, personalisation means that different users see different results for the same query. 
So my feeling is that it’s somewhat futile to do any SEO beyond making the website understandable by search engines, acquiring legitimate links, and just building a website that people would want to visit.\nNext steps With those thoughts out of the way, it’s time to describe the way we addressed the challenge. This is covered in the next post, Learning to rank for personalised search.\n","wordCount":"1126","inLanguage":"en","image":"https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg","datePublished":"2015-01-29T10:37:39Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" 
height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)</h1><div class=post-meta><span title='2015-01-29 10:37:39 +0000 UTC'>January 29, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-01-29-is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence_hucf29f125477380947c76df29ad469af8_251570_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg 1568w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/artificial-intelligence.jpg alt width=1568 height=1051></figure><div class=post-content><p>About a year ago, I participated in the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge target=_blank rel=noopener>Yandex search personalisation Kaggle competition</a>. I started off as a solo competitor, and then added a few Kaggle newbies to the team as part of a program I was running for the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Sydney Data Science Meetup</a>. My team hasn&rsquo;t done too badly, finishing 9th out of 194 teams. As is usually the case with Kaggle competitions, the most valuable part was the lessons learned from the experience. In this case, the lessons go beyond the usual data science skills, and include some insights that are relevant to search engine optimisation (SEO) and privacy. 
This post describes the competition setup and covers the more general insights. A follow-up post will cover the technical side of our approach.</p><h3 id=the-data>The data<a hidden class=anchor aria-hidden=true href=#the-data>#</a></h3><p>Yandex is the leading search engine in Russia. For the competition, they <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/data target=_blank rel=noopener>supplied a dataset</a> that consists of log data of search activity from a single large city, which represents one month of search activity (excluding popular queries). In total, the dataset contains about 21M unique queries, 700M unique urls, 6M unique users, and 35M search sessions. This is a relatively-big dataset for a Kaggle competition (the training file is about 16GB uncompressed), but it&rsquo;s really rather small in comparison to Yandex&rsquo;s overall search volume and tiny compared to what Google handles.</p><p>The data was anonymised, so a sample looks like this (see <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/logs-format target=_blank rel=noopener>full description of the data format</a> – the example and its description are taken from there):</p><blockquote><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-text data-lang=text><span style=display:flex><span>744899 M 23 123123123
 </span></span><span style=display:flex><span>744899 0 Q 0 192902 4857,3847,2939 632428,2384 309585,28374 319567,38724 6547,28744 20264,2332 3094446,34535 90,21 841,231 8344,2342 119571,45767
 </span></span><span style=display:flex><span>744899 1403 C 0 632428
 </span></span></code></pre></div><p>These records describe the session (<code>SessionID</code> = 744899) of the user with <code>USERID</code> 123123123, performed on the 23rd day of the dataset. The user submitted the query with <code>QUERYID</code> 192902, which contains terms with <code>TermIDs</code> 4857,3847,2939. The URL with <code>URLID</code> 632428 placed on the domain <code>DomainID</code> 2384 is the top result on the corresponding SERP. 1403 units of time after beginning of the session the user clicked on the result with <code>URLID</code> 632428 (ranked first in the list).</p></blockquote><p>While this may seem daunting at first, the data is actually quite simple. For each search session, we know the user, the queries they&rsquo;ve made, which URLs and domains were returned in the SERP (search engine result page), which results they&rsquo;ve clicked, and at what point in time the queries and clicks happened.</p><h3 id=goal-and-evaluation>Goal and evaluation<a hidden class=anchor aria-hidden=true href=#goal-and-evaluation>#</a></h3><p>The goal of the competition is to rerank the results in each SERP such that the highest-ranking documents are those that the user would find most relevant. As the name of the competition suggests, personalising the results is key, but non-personalised approaches were also welcome (and actually worked quite well).</p><p>One question that arises is how to tell from this data which results the user finds relevant. In this competition, the results were labelled as either irrelevant (0), relevant (1), or highly relevant (2). Relevance is a function of clicks and dwell time, where dwell time is the time spent on the result (determined by the time that passed until the next query or click). Irrelevant results are ones that weren&rsquo;t clicked, or those for which the dwell time is less than 50 (the time unit is left unspecified). Relevant results are those that were clicked and have dwell time of 50 to 399. 
Highly relevant results have dwell time of at least 400, or were clicked as the last action in the session (i.e., it is assumed the user finished the session satisfied with the results rather than left because they couldn&rsquo;t find what they were looking for).</p><p>This approach to determining relevance has some obvious flaws, but <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/evaluation target=_blank rel=noopener>it apparently correlates well with actual user satisfaction with search results</a>.</p><p>Given the above definition of relevance, one can quantify how well a reranking method improves the relevance of the results. For this competition, the organisers chose the <a href=https://en.wikipedia.org/wiki/Discounted_cumulative_gain target=_blank rel=noopener>normalised discounted cumulative gain (NDCG) measure</a>, which is a fancy name for a measure that, in the words of Wikipedia, encodes the assumptions that:</p><ul><li>Highly relevant documents are more useful when appearing earlier in a search engine result list (have higher ranks)</li><li>Highly relevant documents are more useful than marginally relevant documents, which are in turn more useful than irrelevant documents.</li></ul><h3 id=seo-insights-and-other-thoughts>SEO insights and other thoughts<a hidden class=anchor aria-hidden=true href=#seo-insights-and-other-thoughts>#</a></h3><p>A key insight that is relevant to SEO and privacy, is that even without considering browser-based tracking and tools like Google Analytics (which may or may not be used by Google to rerank search results), search engines can infer a lot about user behaviour on other sites, just based on user interaction with the SERP. 
So if your users bounce quickly because your website is slow to load or ranks highly for irrelevant queries, the search engine can know that, and will probably penalise you accordingly.</p><p>This works both ways, though, and is evident even on <a href=http://donttrack.us/ target=_blank rel=noopener>search engines that don&rsquo;t track personal information</a>. Just try searching for &ldquo;f&rdquo; or &ldquo;fa&rdquo; or &ldquo;fac&rdquo; using DuckDuckGo, Google, Bing, Yahoo, or even Yandex. Facebook will be one of the top results (most often the first one), probably just because people tend to search for or visit Facebook after searching for one of those terms by mistake. So if your website ranks poorly for a term for which it should rank well, and your users behave accordingly (because, for example, they&rsquo;re searching for your website specifically), you may magically end up with better ranking without any changes to inbound links or to your site.</p><p>Another thing that is demonstrated by this competition&rsquo;s dataset is just how much data search engines consider when determining ranking. The dataset is just a sample of logs for one city for one month. I don&rsquo;t like throwing the words &ldquo;big data&rdquo; around, but the full volume of data is pretty big. Too big for anyone to grasp and fully understand how exactly search engines work, and this includes the people who build them. What&rsquo;s worth keeping in mind is that for all major search engines, the user is the product that they sell to advertisers, so keeping the users happy is key. Any changes made to the underlying algorithms are usually done with the end-user in mind, because not making such changes may kill the search engine (remember AltaVista?). Further, personalisation means that <a href=http://dontbubble.us/ target=_blank rel=noopener>different users see different results for the same query</a>. 
So my feeling is that it&rsquo;s somewhat futile to do any SEO beyond making the website understandable by search engines, acquiring legitimate links, and just building a website that people would want to visit.</p><h3 id=next-steps>Next steps<a hidden class=anchor aria-hidden=true href=#next-steps>#</a></h3><p>With those thoughts out of the way, it&rsquo;s time to describe the way we addressed the challenge. This is covered in the next post, <a href=https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/ title="Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)">Learning to rank for personalised search</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/kaggle/>Kaggle</a></li><li><a href=https://yanirseroussi.com/tags/kaggle-competition/>Kaggle competition</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li><li><a href=https://yanirseroussi.com/tags/search-engine-optimisation/>search engine optimisation</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1) on x" href="https://x.com/intent/tweet/?text=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f&amp;hashtags=datascience%2cKaggle%2cKagglecompetition%2cmachinelearning%2cpredictivemodelling%2csearchengineoptimisation"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f&amp;title=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29&amp;summary=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f&title=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 
16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1) on whatsapp" href="https://api.whatsapp.com/send?text=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 
3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1) on telegram" href="https://telegram.me/share/url?text=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1) on ycombinator" href="https://news.ycombinator.com/submitlink?t=Is%20thinking%20like%20a%20search%20engine%20possible%3f%20%28Yandex%20search%20personalisation%20%e2%80%93%20Kaggle%20competition%20summary%20%e2%80%93%20part%201%29&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f01%2f29%2fis-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
diff --git a/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html b/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html
index 14133cea7..8b704e606 100644
--- a/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html
+++ b/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,gradient boosting,Kaggle,Kaggle competition,machine learning,predictive modelling,search engine optimisation"><meta name=description content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)"><meta 
property="og:description" content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"><meta property="og:image" content="https://yanirseroussi.com/rating.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-02-11T06:34:17+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/rating.png"><meta name=twitter:title content="Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)"><meta name=twitter:description content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)","item":"https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)","name":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)","description":"My team\u0026rsquo;s solution to the Yandex Search Personalisation competition 
(finished 9th out of 194 teams).","keywords":["data science","gradient boosting","Kaggle","Kaggle competition","machine learning","predictive modelling","search engine optimisation"],"articleBody":"This is the second and last post summarising my team’s solution for the Yandex search personalisation Kaggle competition. See the first post for a summary of the dataset, evaluation approach, and some thoughts about search engine optimisation and privacy. This post discusses the algorithms and features we used.\nTo quickly recap the first post, Yandex released a 16GB dataset of query \u0026 click logs. The goal of the competition was to use this data to rerank query results such that the more relevant results appear before less relevant results. Relevance is determined by time spent on each clicked result (non-clicked results are deemed irrelevant), and overall performance is scored using the normalised discounted cumulative gain (NDCG) measure. No data about the content of sites or queries was given – each query in the dataset is a list of token IDs and each result is a (url ID, domain ID) pair.\nFirst steps: memory-based heuristics My initial approach wasn’t very exciting: it involved iterating through the data, summarising it in one way or another, and assigning new relevance scores to each (user, session, query) combination. In this early stage I also implemented an offline validation framework, which is an important part of every Kaggle competition: in this case I simply set aside the last three days of data for local testing, because the test dataset that was used for the leaderboard consisted of three days of log data.\nSomewhat surprisingly, my heuristics worked quite well and put me in a top-10 position on the leaderboard. It seems like the barrier of entry for this competition was higher than for other Kaggle competitions due to the size of the data and the fact that it wasn’t given as preprocessed feature vectors. 
This was evident from questions on the forum, where people noted that they were having trouble downloading and looking at the data.\nThe heuristic models that worked well included:\nReranking based on mean relevance (this just swapped positions 9 \u0026 10, probably because users are more likely to click the last result) Reranking based on mean relevance for (query, url) and (query, domain) pairs (non-personalised improvements) Downranking urls observed previously in a session Each one of the heuristic models was set to output relevance scores. The models were then ensembled by simply summing the relevance scores.\nThen, I started playing with a collaborative-filtering-inspired matrix factorisation model for predicting relevance, which didn’t work too well. At around that time, I got too busy with other stuff and decided to quit while I’m ahead.\nGetting more serious with some team work and LambdaMART A few weeks after quitting, I somehow volunteered to organise Kaggle teams for newbies at the Sydney Data Science Meetup group. At that point I was joined by my teammates, which served as a good motivation to do more stuff.\nThe first thing we tried was another heuristic model I read about in one of the papers suggested by the organisers: just reranking based on the fact that people often repeat queries as a navigational aid (e.g., search for Facebook and click Facebook). Combined in a simple linear model with the other heuristics, this put us at #4. Too easy 🙂\nWith all the new motivation, it was time to read more papers and start doing things properly. We ended up using Ranklib’s LambdaMART implementation as one of our main models, and also used LambdaMART to combine the various models (the old heuristics still helped the overall score, as did the matrix factorisation model).\nUsing LambdaMART made it possible to directly optimise the NDCG measure, turning the key problem into feature engineering, i.e., finding good features to feed into the model. 
Explaining how LambdaMART works is beyond the scope of this post (see this paper for an in-depth discussion), but the basic idea (which is also shared by other learning to rank algorithms) is that rather than trying to solve the hard problem of predicting relevance (i.e., a regression problem), the algorithm tries to predict the ranking that yields the best results according to a user-chosen measure.\nWe tried many features for the LambdaMART model, but after feature selection (using a method learned from Phil Brierley’s talk) the best features turned out to be:\npercentage_recurrent_term_ids: percentage of term IDs from the test query that appeared previously in the session — indicates if this query refines previous queries query_mean_ndcg: historical NDCG for this query — indicates how satisfied people are with the results of this query. Interestingly, we also tried query click entropy, but it performed worse. Probably because we’re optimising the NDCG rather than click-through rate. query_num_unique_serps: how many different result pages were shown for this query query_mean_result_dwell_time: how much time on average people spend per result for this query user_mean_ndcg: like query_mean_ndcg, but for users — a low NDCG indicates that this user is likely to be dissatisfied with the results. As for query_mean_ndcg, adding this feature yielded better results than using the user’s click entropy. user_num_click_actions_with_relevance_0: over the history of this user, how many of their clicks had relevance 0 (i.e., short dwell time). Interestingly, user_num_click_actions_with_relevance_1 and user_num_click_actions_with_relevance_2 were found to be less useful. 
user_num_query_actions: number of queries performed by the user rank: the original rank, as assigned by Yandex previous_query_url_relevance_in_session: modelling repeated results within a session, e.g., if a (query, url) pair was already found irrelevant in this session, the user may not want to see it again previous_url_relevance_in_session: the same as previous_query_url_relevance_in_session, but for a url regardless of the query user_query_url_relevance_sum: over the entire history of the user, not just the session user_normalised_rank_relevance: how relevant does the user usually find this rank? The idea is that some people are more likely to go through all the results than others query_url_click_probability: estimated simply as num_query_url_clicks / num_query_url_occurrences (across all the users) average_time_on_page: how much time people spend on this url on average Our best submission ended up placing us at the 9th place (out of 194 teams), which is respectable. Things got a bit more interesting towards the end of the competition – if we had used the original heuristic model that put at #4 early on, we would have finished 18th.\nConclusion I really enjoyed this competition. The data was well-organised and well-defined, which is not something you get in every competition (or in “real life”). Its size did present some challenges, but we stuck to using flat files and some preprocessing and other tricks to speed things up (e.g., I got to use Cython for the first time). It was good to learn how learning to rank algorithms work and get some insights on search personalisation. 
As is often the case with Kaggle competitions, this was time well spent.\n","wordCount":"1114","inLanguage":"en","image":"https://yanirseroussi.com/rating.png","datePublished":"2015-02-11T06:34:17Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)</h1><div class=post-meta><span title='2015-02-11 06:34:17 +0000 UTC'>February 11, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-02-11-learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/rating.png alt></figure><div class=post-content><p>This is the second and last post summarising my team&rsquo;s solution for the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge target=_blank rel=noopener>Yandex search personalisation Kaggle competition</a>. <a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)">See the first post</a> for a summary of the dataset, evaluation approach, and some thoughts about search engine optimisation and privacy. 
This post discusses the algorithms and features we used.</p><p>To quickly recap the <a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)">first post</a>, Yandex released a 16GB dataset of query & click logs. The goal of the competition was to use this data to rerank query results such that the more relevant results appear before less relevant results. Relevance is determined by time spent on each clicked result (non-clicked results are deemed irrelevant), and overall performance is scored using the <a href=https://en.wikipedia.org/wiki/Discounted_cumulative_gain target=_blank rel=noopener>normalised discounted cumulative gain (NDCG) measure</a>. No data about the content of sites or queries was given – each query in the dataset is a list of token IDs and each result is a (url ID, domain ID) pair.</p><h3 id=first-steps-memory-based-heuristics>First steps: memory-based heuristics<a hidden class=anchor aria-hidden=true href=#first-steps-memory-based-heuristics>#</a></h3><p>My initial approach wasn&rsquo;t very exciting: it involved iterating through the data, summarising it in one way or another, and assigning new relevance scores to each (user, session, query) combination. In this early stage I also implemented an offline validation framework, <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ title="How to (almost) win Kaggle competitions">which is an important part of every Kaggle competition</a>: in this case I simply set aside the last three days of data for local testing, because the test dataset that was used for the leaderboard consisted of three days of log data.</p><p>Somewhat surprisingly, my heuristics worked quite well and put me in a top-10 position on the leaderboard. 
It seems like the barrier of entry for this competition was higher than for other Kaggle competitions due to the size of the data and the fact that it wasn&rsquo;t given as preprocessed feature vectors. This was evident from questions on the forum, where people noted that they were having trouble downloading and looking at the data.</p><p>The heuristic models that worked well included:</p><ul><li>Reranking based on mean relevance (this just swapped positions 9 & 10, probably because users are more likely to click the last result)</li><li>Reranking based on mean relevance for (query, url) and (query, domain) pairs (non-personalised improvements)</li><li>Downranking urls observed previously in a session</li></ul><p>Each one of the heuristic models was set to output relevance scores. The models were then ensembled by simply summing the relevance scores.</p><p>Then, I started playing with a <a href=https://en.wikipedia.org/wiki/Collaborative_filtering target=_blank rel=noopener>collaborative-filtering</a>-inspired matrix factorisation model for predicting relevance, which didn&rsquo;t work too well. At around that time, I got too busy with other stuff and decided to quit while I&rsquo;m ahead.</p><h3 id=getting-more-serious-with-some-team-work-and-lambdamart>Getting more serious with some team work and LambdaMART<a hidden class=anchor aria-hidden=true href=#getting-more-serious-with-some-team-work-and-lambdamart>#</a></h3><p>A few weeks after quitting, I somehow volunteered to organise Kaggle teams for newbies at the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Sydney Data Science Meetup group</a>. 
At that point I was joined by my teammates, which served as a good motivation to do more stuff.</p><p>The first thing we tried was another heuristic model I read about in one of the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/related-papers target=_blank rel=noopener>papers suggested by the organisers</a>: just reranking based on the fact that people often repeat queries as a navigational aid (e.g., search for Facebook and click Facebook). Combined in a simple linear model with the other heuristics, this put us at #4. Too easy 🙂</p><p>With all the new motivation, it was time to read more papers and start doing things properly. We ended up using <a href=http://sourceforge.net/p/lemur/wiki/RankLib/ target=_blank rel=noopener>Ranklib&rsquo;s LambdaMART implementation</a> as one of our main models, and also used LambdaMART to combine the various models (the old heuristics still helped the overall score, as did the matrix factorisation model).</p><p>Using LambdaMART made it possible to directly optimise the NDCG measure, turning the key problem into feature engineering, i.e., finding good features to feed into the model. 
Explaining how LambdaMART works is beyond the scope of this post (<a href=http://research.microsoft.com/pubs/132652/MSR-TR-2010-82.pdf target=_blank rel=noopener>see this paper for an in-depth discussion</a>), but the basic idea (which is also shared by other <a href=https://en.wikipedia.org/wiki/Learning_to_rank target=_blank rel=noopener>learning to rank</a> algorithms) is that rather than trying to solve the hard problem of predicting relevance (i.e., a regression problem), the algorithm tries to predict the ranking that yields the best results according to a user-chosen measure.</p><p>We tried many features for the LambdaMART model, but after feature selection (using a method learned from <a href=http://anotherdataminingblog.blogspot.com.au/2013/10/techniques-to-improve-accuracy-of-your_17.html target=_blank rel=noopener>Phil Brierley&rsquo;s talk</a>) the best features turned out to be:</p><ul><li>percentage_recurrent_term_ids: percentage of term IDs from the test query that appeared previously in the session — indicates if this query refines previous queries</li><li>query_mean_ndcg: historical NDCG for this query — indicates how satisfied people are with the results of this query. Interestingly, we also tried query click entropy, but it performed worse. Probably because we&rsquo;re optimising the NDCG rather than click-through rate.</li><li>query_num_unique_serps: how many different result pages were shown for this query</li><li>query_mean_result_dwell_time: how much time on average people spend per result for this query</li><li>user_mean_ndcg: like query_mean_ndcg, but for users — a low NDCG indicates that this user is likely to be dissatisfied with the results. As for query_mean_ndcg, adding this feature yielded better results than using the user&rsquo;s click entropy.</li><li>user_num_click_actions_with_relevance_0: over the history of this user, how many of their clicks had relevance 0 (i.e., short dwell time). 
Interestingly, user_num_click_actions_with_relevance_1 and user_num_click_actions_with_relevance_2 were found to be less useful.</li><li>user_num_query_actions: number of queries performed by the user</li><li>rank: the original rank, as assigned by Yandex</li><li>previous_query_url_relevance_in_session: modelling repeated results within a session, e.g., if a (query, url) pair was already found irrelevant in this session, the user may not want to see it again</li><li>previous_url_relevance_in_session: the same as previous_query_url_relevance_in_session, but for a url regardless of the query</li><li>user_query_url_relevance_sum: over the entire history of the user, not just the session</li><li>user_normalised_rank_relevance: how relevant does the user usually find this rank? The idea is that some people are more likely to go through all the results than others</li><li>query_url_click_probability: estimated simply as num_query_url_clicks / num_query_url_occurrences (across all the users)</li><li>average_time_on_page: how much time people spend on this url on average</li></ul><p>Our best submission ended up placing us at the 9th place (out of 194 teams), which is respectable. Things got a bit more interesting towards the end of the competition – if we had used the original heuristic model that put at #4 early on, we would have finished 18th.</p><h3 id=conclusion>Conclusion<a hidden class=anchor aria-hidden=true href=#conclusion>#</a></h3><p>I really enjoyed this competition. The data was well-organised and well-defined, which is not something you get in every competition (or in &ldquo;real life&rdquo;). Its size did present some challenges, but we stuck to using flat files and some preprocessing and other tricks to speed things up (e.g., I got to use <a href=http://cython.org/ target=_blank rel=noopener>Cython</a> for the first time). It was good to learn how learning to rank algorithms work and get some insights on search personalisation. 
As is often the case with Kaggle competitions, this was time well spent.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/gradient-boosting/>gradient boosting</a></li><li><a href=https://yanirseroussi.com/tags/kaggle/>Kaggle</a></li><li><a href=https://yanirseroussi.com/tags/kaggle-competition/>Kaggle competition</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li><li><a href=https://yanirseroussi.com/tags/search-engine-optimisation/>search engine optimisation</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on x" href="https://x.com/intent/tweet/?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&amp;hashtags=datascience%2cgradientboosting%2cKaggle%2cKagglecompetition%2cmachinelearning%2cpredictivemodelling%2csearchengineoptimisation"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search 
(Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&amp;title=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;summary=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on reddit" 
href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&title=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 
29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on whatsapp" href="https://api.whatsapp.com/send?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 
179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on telegram" href="https://telegram.me/share/url?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 
2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on ycombinator" href="https://news.ycombinator.com/submitlink?t=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="data science,gradient boosting,Kaggle,Kaggle competition,machine learning,predictive modelling,search engine optimisation"><meta name=description content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)"><meta 
property="og:description" content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"><meta property="og:image" content="https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/rating.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-02-11T06:34:17+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/rating.png"><meta name=twitter:title content="Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)"><meta name=twitter:description content="My team&rsquo;s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)","item":"https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition 
Summary – Part 2)","name":"Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)","description":"My team’s solution to the Yandex Search Personalisation competition (finished 9th out of 194 teams).","keywords":["data science","gradient boosting","Kaggle","Kaggle competition","machine learning","predictive modelling","search engine optimisation"],"articleBody":"This is the second and last post summarising my team’s solution for the Yandex search personalisation Kaggle competition. See the first post for a summary of the dataset, evaluation approach, and some thoughts about search engine optimisation and privacy. This post discusses the algorithms and features we used.\nTo quickly recap the first post, Yandex released a 16GB dataset of query \u0026 click logs. The goal of the competition was to use this data to rerank query results such that the more relevant results appear before less relevant results. Relevance is determined by time spent on each clicked result (non-clicked results are deemed irrelevant), and overall performance is scored using the normalised discounted cumulative gain (NDCG) measure. No data about the content of sites or queries was given – each query in the dataset is a list of token IDs and each result is a (url ID, domain ID) pair.\nFirst steps: memory-based heuristics My initial approach wasn’t very exciting: it involved iterating through the data, summarising it in one way or another, and assigning new relevance scores to each (user, session, query) combination. In this early stage I also implemented an offline validation framework, which is an important part of every Kaggle competition: in this case I simply set aside the last three days of data for local testing, because the test dataset that was used for the leaderboard consisted of three days of log data.\nSomewhat surprisingly, my heuristics worked quite well and put me in a top-10 position on the leaderboard. 
It seems like the barrier of entry for this competition was higher than for other Kaggle competitions due to the size of the data and the fact that it wasn’t given as preprocessed feature vectors. This was evident from questions on the forum, where people noted that they were having trouble downloading and looking at the data.\nThe heuristic models that worked well included:\nReranking based on mean relevance (this just swapped positions 9 \u0026 10, probably because users are more likely to click the last result) Reranking based on mean relevance for (query, url) and (query, domain) pairs (non-personalised improvements) Downranking urls observed previously in a session Each one of the heuristic models was set to output relevance scores. The models were then ensembled by simply summing the relevance scores.\nThen, I started playing with a collaborative-filtering-inspired matrix factorisation model for predicting relevance, which didn’t work too well. At around that time, I got too busy with other stuff and decided to quit while I’m ahead.\nGetting more serious with some team work and LambdaMART A few weeks after quitting, I somehow volunteered to organise Kaggle teams for newbies at the Sydney Data Science Meetup group. At that point I was joined by my teammates, which served as a good motivation to do more stuff.\nThe first thing we tried was another heuristic model I read about in one of the papers suggested by the organisers: just reranking based on the fact that people often repeat queries as a navigational aid (e.g., search for Facebook and click Facebook). Combined in a simple linear model with the other heuristics, this put us at #4. Too easy 🙂\nWith all the new motivation, it was time to read more papers and start doing things properly. 
We ended up using Ranklib’s LambdaMART implementation as one of our main models, and also used LambdaMART to combine the various models (the old heuristics still helped the overall score, as did the matrix factorisation model).\nUsing LambdaMART made it possible to directly optimise the NDCG measure, turning the key problem into feature engineering, i.e., finding good features to feed into the model. Explaining how LambdaMART works is beyond the scope of this post (see this paper for an in-depth discussion), but the basic idea (which is also shared by other learning to rank algorithms) is that rather than trying to solve the hard problem of predicting relevance (i.e., a regression problem), the algorithm tries to predict the ranking that yields the best results according to a user-chosen measure.\nWe tried many features for the LambdaMART model, but after feature selection (using a method learned from Phil Brierley’s talk) the best features turned out to be:\npercentage_recurrent_term_ids: percentage of term IDs from the test query that appeared previously in the session — indicates if this query refines previous queries query_mean_ndcg: historical NDCG for this query — indicates how satisfied people are with the results of this query. Interestingly, we also tried query click entropy, but it performed worse. Probably because we’re optimising the NDCG rather than click-through rate. query_num_unique_serps: how many different result pages were shown for this query query_mean_result_dwell_time: how much time on average people spend per result for this query user_mean_ndcg: like query_mean_ndcg, but for users — a low NDCG indicates that this user is likely to be dissatisfied with the results. As for query_mean_ndcg, adding this feature yielded better results than using the user’s click entropy. user_num_click_actions_with_relevance_0: over the history of this user, how many of their clicks had relevance 0 (i.e., short dwell time). 
Interestingly, user_num_click_actions_with_relevance_1 and user_num_click_actions_with_relevance_2 were found to be less useful. user_num_query_actions: number of queries performed by the user rank: the original rank, as assigned by Yandex previous_query_url_relevance_in_session: modelling repeated results within a session, e.g., if a (query, url) pair was already found irrelevant in this session, the user may not want to see it again previous_url_relevance_in_session: the same as previous_query_url_relevance_in_session, but for a url regardless of the query user_query_url_relevance_sum: over the entire history of the user, not just the session user_normalised_rank_relevance: how relevant does the user usually find this rank? The idea is that some people are more likely to go through all the results than others query_url_click_probability: estimated simply as num_query_url_clicks / num_query_url_occurrences (across all the users) average_time_on_page: how much time people spend on this url on average Our best submission ended up placing us at the 9th place (out of 194 teams), which is respectable. Things got a bit more interesting towards the end of the competition – if we had used the original heuristic model that put us at #4 early on, we would have finished 18th.\nConclusion I really enjoyed this competition. The data was well-organised and well-defined, which is not something you get in every competition (or in “real life”). Its size did present some challenges, but we stuck to using flat files and some preprocessing and other tricks to speed things up (e.g., I got to use Cython for the first time). It was good to learn how learning to rank algorithms work and get some insights on search personalisation. 
As is often the case with Kaggle competitions, this was time well spent.\n","wordCount":"1114","inLanguage":"en","image":"https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/rating.png","datePublished":"2015-02-11T06:34:17Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line 
x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)</h1><div class=post-meta><span title='2015-02-11 06:34:17 +0000 UTC'>February 11, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-02-11-learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/rating.png alt></figure><div class=post-content><p>This is the second and last post summarising my team&rsquo;s solution for the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge target=_blank rel=noopener>Yandex search personalisation Kaggle competition</a>. <a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)">See the first post</a> for a summary of the dataset, evaluation approach, and some thoughts about search engine optimisation and privacy. 
This post discusses the algorithms and features we used.</p><p>To quickly recap the <a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)">first post</a>, Yandex released a 16GB dataset of query & click logs. The goal of the competition was to use this data to rerank query results such that the more relevant results appear before less relevant results. Relevance is determined by time spent on each clicked result (non-clicked results are deemed irrelevant), and overall performance is scored using the <a href=https://en.wikipedia.org/wiki/Discounted_cumulative_gain target=_blank rel=noopener>normalised discounted cumulative gain (NDCG) measure</a>. No data about the content of sites or queries was given – each query in the dataset is a list of token IDs and each result is a (url ID, domain ID) pair.</p><h3 id=first-steps-memory-based-heuristics>First steps: memory-based heuristics<a hidden class=anchor aria-hidden=true href=#first-steps-memory-based-heuristics>#</a></h3><p>My initial approach wasn&rsquo;t very exciting: it involved iterating through the data, summarising it in one way or another, and assigning new relevance scores to each (user, session, query) combination. In this early stage I also implemented an offline validation framework, <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ title="How to (almost) win Kaggle competitions">which is an important part of every Kaggle competition</a>: in this case I simply set aside the last three days of data for local testing, because the test dataset that was used for the leaderboard consisted of three days of log data.</p><p>Somewhat surprisingly, my heuristics worked quite well and put me in a top-10 position on the leaderboard. 
It seems like the barrier of entry for this competition was higher than for other Kaggle competitions due to the size of the data and the fact that it wasn&rsquo;t given as preprocessed feature vectors. This was evident from questions on the forum, where people noted that they were having trouble downloading and looking at the data.</p><p>The heuristic models that worked well included:</p><ul><li>Reranking based on mean relevance (this just swapped positions 9 & 10, probably because users are more likely to click the last result)</li><li>Reranking based on mean relevance for (query, url) and (query, domain) pairs (non-personalised improvements)</li><li>Downranking urls observed previously in a session</li></ul><p>Each one of the heuristic models was set to output relevance scores. The models were then ensembled by simply summing the relevance scores.</p><p>Then, I started playing with a <a href=https://en.wikipedia.org/wiki/Collaborative_filtering target=_blank rel=noopener>collaborative-filtering</a>-inspired matrix factorisation model for predicting relevance, which didn&rsquo;t work too well. At around that time, I got too busy with other stuff and decided to quit while I&rsquo;m ahead.</p><h3 id=getting-more-serious-with-some-team-work-and-lambdamart>Getting more serious with some team work and LambdaMART<a hidden class=anchor aria-hidden=true href=#getting-more-serious-with-some-team-work-and-lambdamart>#</a></h3><p>A few weeks after quitting, I somehow volunteered to organise Kaggle teams for newbies at the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Sydney Data Science Meetup group</a>. 
At that point I was joined by my teammates, which served as a good motivation to do more stuff.</p><p>The first thing we tried was another heuristic model I read about in one of the <a href=https://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/related-papers target=_blank rel=noopener>papers suggested by the organisers</a>: just reranking based on the fact that people often repeat queries as a navigational aid (e.g., search for Facebook and click Facebook). Combined in a simple linear model with the other heuristics, this put us at #4. Too easy 🙂</p><p>With all the new motivation, it was time to read more papers and start doing things properly. We ended up using <a href=http://sourceforge.net/p/lemur/wiki/RankLib/ target=_blank rel=noopener>Ranklib&rsquo;s LambdaMART implementation</a> as one of our main models, and also used LambdaMART to combine the various models (the old heuristics still helped the overall score, as did the matrix factorisation model).</p><p>Using LambdaMART made it possible to directly optimise the NDCG measure, turning the key problem into feature engineering, i.e., finding good features to feed into the model. 
Explaining how LambdaMART works is beyond the scope of this post (<a href=http://research.microsoft.com/pubs/132652/MSR-TR-2010-82.pdf target=_blank rel=noopener>see this paper for an in-depth discussion</a>), but the basic idea (which is also shared by other <a href=https://en.wikipedia.org/wiki/Learning_to_rank target=_blank rel=noopener>learning to rank</a> algorithms) is that rather than trying to solve the hard problem of predicting relevance (i.e., a regression problem), the algorithm tries to predict the ranking that yields the best results according to a user-chosen measure.</p><p>We tried many features for the LambdaMART model, but after feature selection (using a method learned from <a href=http://anotherdataminingblog.blogspot.com.au/2013/10/techniques-to-improve-accuracy-of-your_17.html target=_blank rel=noopener>Phil Brierley&rsquo;s talk</a>) the best features turned out to be:</p><ul><li>percentage_recurrent_term_ids: percentage of term IDs from the test query that appeared previously in the session — indicates if this query refines previous queries</li><li>query_mean_ndcg: historical NDCG for this query — indicates how satisfied people are with the results of this query. Interestingly, we also tried query click entropy, but it performed worse. Probably because we&rsquo;re optimising the NDCG rather than click-through rate.</li><li>query_num_unique_serps: how many different result pages were shown for this query</li><li>query_mean_result_dwell_time: how much time on average people spend per result for this query</li><li>user_mean_ndcg: like query_mean_ndcg, but for users — a low NDCG indicates that this user is likely to be dissatisfied with the results. As for query_mean_ndcg, adding this feature yielded better results than using the user&rsquo;s click entropy.</li><li>user_num_click_actions_with_relevance_0: over the history of this user, how many of their clicks had relevance 0 (i.e., short dwell time). 
Interestingly, user_num_click_actions_with_relevance_1 and user_num_click_actions_with_relevance_2 were found to be less useful.</li><li>user_num_query_actions: number of queries performed by the user</li><li>rank: the original rank, as assigned by Yandex</li><li>previous_query_url_relevance_in_session: modelling repeated results within a session, e.g., if a (query, url) pair was already found irrelevant in this session, the user may not want to see it again</li><li>previous_url_relevance_in_session: the same as previous_query_url_relevance_in_session, but for a url regardless of the query</li><li>user_query_url_relevance_sum: over the entire history of the user, not just the session</li><li>user_normalised_rank_relevance: how relevant does the user usually find this rank? The idea is that some people are more likely to go through all the results than others</li><li>query_url_click_probability: estimated simply as num_query_url_clicks / num_query_url_occurrences (across all the users)</li><li>average_time_on_page: how much time people spend on this url on average</li></ul><p>Our best submission ended up placing us at the 9th place (out of 194 teams), which is respectable. Things got a bit more interesting towards the end of the competition – if we had used the original heuristic model that put us at #4 early on, we would have finished 18th.</p><h3 id=conclusion>Conclusion<a hidden class=anchor aria-hidden=true href=#conclusion>#</a></h3><p>I really enjoyed this competition. The data was well-organised and well-defined, which is not something you get in every competition (or in &ldquo;real life&rdquo;). Its size did present some challenges, but we stuck to using flat files and some preprocessing and other tricks to speed things up (e.g., I got to use <a href=http://cython.org/ target=_blank rel=noopener>Cython</a> for the first time). It was good to learn how learning to rank algorithms work and get some insights on search personalisation. 
As is often the case with Kaggle competitions, this was time well spent.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/gradient-boosting/>gradient boosting</a></li><li><a href=https://yanirseroussi.com/tags/kaggle/>Kaggle</a></li><li><a href=https://yanirseroussi.com/tags/kaggle-competition/>Kaggle competition</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li><li><a href=https://yanirseroussi.com/tags/search-engine-optimisation/>search engine optimisation</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on x" href="https://x.com/intent/tweet/?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&amp;hashtags=datascience%2cgradientboosting%2cKaggle%2cKagglecompetition%2cmachinelearning%2cpredictivemodelling%2csearchengineoptimisation"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search 
(Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&amp;title=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;summary=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on reddit" 
href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f&title=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 
29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on whatsapp" href="https://api.whatsapp.com/send?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 
179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on telegram" href="https://telegram.me/share/url?text=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 
2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) on ycombinator" href="https://news.ycombinator.com/submitlink?t=Learning%20to%20rank%20for%20personalised%20search%20%28Yandex%20Search%20Personalisation%20%e2%80%93%20Kaggle%20Competition%20Summary%20%e2%80%93%20Part%202%29&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f02%2f11%2flearning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-2052><div class=comment-header><a href=#comment-2052><img class=comment-avatar src="https://www.gravatar.com/avatar/1fc0fb275ad25219b5c017921c5e71ec?s=50"><p class=comment-info><strong>nitin</strong><br><small>2017-12-17 13:02:37</small></p></a></div><div class="comment-body post-content"><p>I do not understand how your featureset helped the model to learn anything.
 For example,
 user_num_query_actions: number of queries performed by the user</p><p>How will it affect the order of search results for a new/test query.</p></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
diff --git a/2015/03/22/the-long-road-to-a-lifestyle-business/index.html b/2015/03/22/the-long-road-to-a-lifestyle-business/index.html
index 72729c03d..9d03695ee 100644
--- a/2015/03/22/the-long-road-to-a-lifestyle-business/index.html
+++ b/2015/03/22/the-long-road-to-a-lifestyle-business/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The long road to a lifestyle business | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="business,data science,personal"><meta name=description content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The long road to a lifestyle business"><meta property="og:description" content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"><meta property="og:image" content="https://yanirseroussi.com/overland-track.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-03-22T09:43:47+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/overland-track.jpg"><meta name=twitter:title content="The long road to a lifestyle business"><meta name=twitter:description content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The long road to a lifestyle business","item":"https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The long road to a lifestyle business","name":"The long road to a lifestyle business","description":"Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects.","keywords":["business","data science","personal"],"articleBody":"Almost a year ago, I left my last full-time job and decided to set on an independent path that includes data science consulting and work on my own projects. The ultimate goal is not to have to sell my time for money by generating enough passive income to live comfortably. 
My five main areas of focus are – in no particular order – personal branding \u0026 networking, data science contracting, Bandcamp Recommender, Price Dingo, and marine conservation. This post summarises what I’ve been doing in each of these five areas, including highlights and lowlights. So far, it’s way better than having a “real” job. I hope this post will help others who are on a similar journey (there seem to be more and more of us – I’d love to hear from you).\nPersonal branding \u0026 networking Finding clients requires considerably more work than finding a full-time job. As with job hunting, the ideal situation is where people come to you for help, rather than you chasing them. To this end, I’ve been networking a lot, giving talks, writing up posts and working on distributing them. It may be harder than getting a full-time job, but it’s also much more interesting.\nHighlights: going viral in China, getting a post featured in KDNuggets\nLowlights: not having enough time to write all the things and meet all the people\nData science contracting My goal with contracting/consulting is to have a steady income stream while working on my own projects. As my projects are small enough to be done only by me (with optional outsourcing to contractors), this means I have infinite runway to pursue them. While this is probably not the best way of building a Silicon Valley-style startup that is going to make the world a better place, many others have applied this approach to building a so-called lifestyle business, which is what I want to achieve.\nEarly on, I realised that doing full-on consulting would be too time consuming, as many clients expect full-time availability. In addition, constantly needing to find new clients means that not much time would be left for work on my own projects. What I really wanted was a stable part-time gig. 
The first one was with GetUp (who reached out to me following a workshop I gave at General Assembly), where I did some work on forecasting engagement and churn. In parallel, I went through the interview process at DuckDuckGo, which included delivering a piece of work to production. DuckDuckGo ended up wanting me to work full-time (like a few other companies), so last month I started a part-time (three days a week) contract at Commonwealth Bank. I joined a team of very strong data scientists – it looks like it’s going to be interesting.\nHighlights: seeing my DuckDuckGo work every time I search for a Python package, the work environment at GetUp\nLowlights: chasing leads that never eventuated\nBandcamp Recommender (BCRecommender) I’ve written a several posts about BCRecommender, my Bandcamp music recommendation project. While I’ve always treated it as a side-project, it’s been useful in learning how to get traction for a product. It now has thousands of monthly users, and is still growing. My goal for BCRecommender has changed from the original one of finding music for myself to growing it enough to be a noticeable source of traffic for Bandcamp, thereby helping artists and fans. Doing it in side-project mode can be a bit challenging at times (because I have so many other things to do and a long list of ideas to make the app better), but I’ve been making gradual progress and discovering a lot of great music in the process.\nHighlights: every time someone gives me positive feedback, every time I listen to music I found using BCRecommender\nLowlights: dealing with Parse issues and random errors\nPrice Dingo The inability to reliably compare prices for many types of products has been bothering me for a while. Unlike general web search, where the main providers rank results by relevance, most Australian price comparison engines still require merchants to pay to even have their products listed. This creates an obvious bias in the results. 
To address this bias, I created Price Dingo – a user-centric price comparison engine. It serves users with results they can trust by not requiring merchants to pay to have their products listed. Just like general web search engines, the main ranking factor is relevancy to the user. This relevancy is also achieved by implementing Price Dingo as a network of independent sites, each focused on a specific product category, with the first category being scuba diving gear.\nImplementing Price Dingo hasn’t been too hard – the main challenge has been finding the time to do it with all the other stuff I’ve been doing. There are still plenty of improvements to be made to the site, but now the main goal is to get enough traction to make ongoing time investment worthwhile. Judging by the experience of Booko’s founder, there is space in the market for niche price comparison sites and apps, so it is just a matter of execution.\nHighlights: being able to finally compare dive gear prices, the joys of integrating Algolia\nLowlights: extracting data from messy websites – I’ve seen some horrible things…\nMarine conservation The first thing I did after leaving my last job was go overseas for five weeks, which included a ten-day visit to Israel (rockets!) and three weeks of conservation diving with New Heaven Dive School in Thailand. Back in Sydney, I joined the Underwater Research Group of NSW, a dive club that’s involved in many marine conservation and research activities, including Reef Life Survey (RLS) and underwater cleanups. With URG, I’ve been diving more than before, and for a change, some of my dives actually do good. I’d love to do this kind of stuff full-time, but there’s a lot less money in getting people to do less stuff (i.e., conservation and sustainability) than in consuming more. 
The compromise for now is that a portion of Price Dingo’s scuba revenue goes to the Australian Marine Conservation Society, and the plan is to expand this to other charities as more categories are added. Update – May 2015: I decided that this compromise isn’t good enough for me, so I shut down Price Dingo to focus on projects that are more aligned with my values.\nHighlights: becoming a certified RLS diver, pretty much every dive\nLowlights: cutting my hand open by falling on rocks on the first day of diving in Thailand\nThe future So far, I’m pretty happy with this not-having-a-job-doing-my-own-thing business. According to The 1000 Day Rule, I still have a long way to go until I get the lifestyle I want. It may even take longer than 1000 days given my decision to not work full-time on a single profitable project, together with my tendency to take more time off than I would if I had a “real” job. But the beauty of this path is that there are no investors breathing down my neck or the feeling of mental rot that comes with a full-time job, so there’s really no rush and I can just enjoy the ride.\n","wordCount":"1202","inLanguage":"en","image":"https://yanirseroussi.com/overland-track.jpg","datePublished":"2015-03-22T09:43:47Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a 
href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The long road to a lifestyle business</h1><div class=post-meta><span title='2015-03-22 09:43:47 +0000 UTC'>March 22, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-03-22-the-long-road-to-a-lifestyle-business/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager 
srcset="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg 3450w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg alt width=3450 height=1730></figure><div class=post-content><p>Almost a year ago, I left my last full-time job and decided to set on an independent path that includes data science consulting and work on my own projects. The ultimate goal is not to <em>have</em> to sell my time for money by generating enough passive income to live comfortably. My five main areas of focus are – in no particular order – personal branding & networking, data science contracting, <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp Recommender</a>, Price Dingo, and marine conservation. This post summarises what I&rsquo;ve been doing in each of these five areas, including highlights and lowlights. So far, it&rsquo;s way better than having a &ldquo;real&rdquo; job. 
I hope this post will help others who are on a similar journey (there seem to be more and more of us – I&rsquo;d love to hear from you).</p><h3 id=personal-branding--networking>Personal branding & networking<a hidden class=anchor aria-hidden=true href=#personal-branding--networking>#</a></h3><p>Finding clients requires considerably more work than finding a full-time job. As with job hunting, the ideal situation is where people come to you for help, rather than you chasing them. To this end, I&rsquo;ve been networking a lot, giving talks, writing up posts and working on distributing them. It may be harder than getting a full-time job, but it&rsquo;s also much more interesting.</p><p><strong>Highlights:</strong> <a href=http://www.weibo.com/1497035431/BDl53rXDk target=_blank rel=noopener>going viral in China</a>, <a href=http://www.kdnuggets.com/2015/03/10-steps-success-kaggle-data-science-competitions.html target=_blank rel=noopener>getting a post featured in KDNuggets</a><br><strong>Lowlights:</strong> not having enough time to write all the things and meet all the people</p><h3 id=data-science-contracting>Data science contracting<a hidden class=anchor aria-hidden=true href=#data-science-contracting>#</a></h3><p>My goal with contracting/consulting is to have a steady income stream while working on my own projects. As my projects are small enough to be done only by me (with optional outsourcing to contractors), this means I have infinite runway to pursue them. While this is probably not the best way of building a Silicon Valley-style startup that is going to <a href="https://www.youtube.com/watch?v=J-GVd_HLlps" target=_blank rel=noopener>make the world a better place</a>, many others have applied this approach to building a so-called lifestyle business, which is what I want to achieve.</p><p>Early on, I realised that doing full-on consulting would be too time consuming, as many clients expect full-time availability. 
In addition, constantly needing to find new clients means that not much time would be left for work on my own projects. What I really wanted was a stable part-time gig. The first one was with <a href=https://www.getup.org.au/ target=_blank rel=noopener>GetUp</a> (who reached out to me following a workshop I gave at <a href=https://generalassemb.ly/education/demystifying-data-an-introduction-to-data-science target=_blank rel=noopener>General Assembly</a>), where I did some work on forecasting engagement and churn. In parallel, I went through the interview process at <a href=https://duckduckgo.com/ target=_blank rel=noopener>DuckDuckGo</a>, which included <a href=https://github.com/duckduckgo/zeroclickinfo-fathead/pull/95 target=_blank rel=noopener>delivering a piece of work to production</a>. DuckDuckGo ended up wanting me to work full-time (like a few other companies), so last month I started a part-time (three days a week) contract at <a href=https://www.commbank.com.au/ target=_blank rel=noopener>Commonwealth Bank</a>. I joined a team of very strong data scientists – it looks like it&rsquo;s going to be interesting.</p><p><strong>Highlights:</strong> seeing my DuckDuckGo work every time I search for a Python package, the work environment at GetUp<br><strong>Lowlights:</strong> chasing leads that never eventuated</p><h3 id=bandcamp-recommender-bcrecommender>Bandcamp Recommender (BCRecommender)<a hidden class=anchor aria-hidden=true href=#bandcamp-recommender-bcrecommender>#</a></h3><p>I&rsquo;ve written a several posts about <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender, my Bandcamp music recommendation project</a>. While I&rsquo;ve always treated it as a side-project, it&rsquo;s been useful in learning how to get traction for a product. It now has thousands of monthly users, and is still growing. 
My goal for BCRecommender has changed from the original one of finding music for myself to growing it enough to be a noticeable source of traffic for Bandcamp, thereby helping artists and fans. Doing it in side-project mode can be a bit challenging at times (because I have so many other things to do and a long list of ideas to make the app better), but I&rsquo;ve been making gradual progress and discovering a lot of great music in the process.</p><p><strong>Highlights:</strong> every time someone gives me positive feedback, every time I listen to music I found using BCRecommender<br><strong>Lowlights:</strong> dealing with <a href=http://parse.com target=_blank rel=noopener>Parse</a> issues and random errors</p><h3 id=price-dingo>Price Dingo<a hidden class=anchor aria-hidden=true href=#price-dingo>#</a></h3><p>The inability to reliably compare prices for many types of products has been bothering me for a while. Unlike general web search, where the main providers rank results by relevance, most Australian price comparison engines still require merchants to pay to even have their products listed. This creates an obvious bias in the results. To address this bias, I created Price Dingo – a user-centric price comparison engine. It serves users with results they can trust by not requiring merchants to pay to have their products listed. Just like general web search engines, the main ranking factor is relevancy to the user. This relevancy is also achieved by implementing Price Dingo as a network of independent sites, each focused on a specific product category, with the first category being scuba diving gear.</p><p>Implementing Price Dingo hasn&rsquo;t been too hard – the main challenge has been finding the time to do it with all the other stuff I&rsquo;ve been doing. There are still plenty of improvements to be made to the site, but now the main goal is to get enough traction to make ongoing time investment worthwhile. 
Judging by the experience of <a href=http://www.booko.com.au target=_blank rel=noopener>Booko&rsquo;s</a> founder, there is space in the market for niche price comparison sites and apps, so it is just a matter of execution.</p><p><strong>Highlights:</strong> being able to finally compare dive gear prices, the joys of integrating <a href=http://www.algolia.com target=_blank rel=noopener>Algolia</a><br><strong>Lowlights:</strong> extracting data from messy websites – I&rsquo;ve seen some horrible things&mldr;</p><h3 id=marine-conservation>Marine conservation<a hidden class=anchor aria-hidden=true href=#marine-conservation>#</a></h3><p>The first thing I did after leaving my last job was go overseas for five weeks, which included a ten-day visit to Israel (rockets!) and three weeks of conservation diving with <a href=http://www.newheavendiveschool.com/marine-conservation-thailand/ target=_blank rel=noopener>New Heaven Dive School in Thailand</a>. Back in Sydney, I joined the <a href=http://www.urgdiveclub.org.au/ target=_blank rel=noopener>Underwater Research Group of NSW</a>, a dive club that&rsquo;s involved in many marine conservation and research activities, including <a href=http://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey (RLS)</a> and <a href=http://www.urgdiveclub.org.au/urg-and-rfa-clean-up-project/ target=_blank rel=noopener>underwater cleanups</a>. With URG, I&rsquo;ve been diving more than before, and for a change, some of my dives actually do good. I&rsquo;d love to do this kind of stuff full-time, but there&rsquo;s a lot less money in getting people to do less stuff (i.e., conservation and sustainability) than in consuming more. The compromise for now is that a portion of Price Dingo&rsquo;s scuba revenue goes to the <a href=http://www.marineconservation.org.au/ target=_blank rel=noopener>Australian Marine Conservation Society</a>, and the plan is to expand this to other charities as more categories are added. 
<strong>Update – May 2015:</strong> I decided that this compromise isn&rsquo;t good enough for me, so I shut down Price Dingo to focus on projects that are more aligned with my values.</p><p><strong>Highlights:</strong> <a href=http://www.urgdiveclub.org.au/reef-life-survey-training-review/ target=_blank rel=noopener>becoming a certified RLS diver</a>, pretty much every dive<br><strong>Lowlights:</strong> cutting my hand open by falling on rocks on the first day of diving in Thailand</p><h3 id=the-future>The future<a hidden class=anchor aria-hidden=true href=#the-future>#</a></h3><p>So far, I&rsquo;m pretty happy with this not-having-a-job-doing-my-own-thing business. According to <a href=http://www.tropicalmba.com/living-the-dream/ target=_blank rel=noopener>The 1000 Day Rule</a>, I still have a long way to go until I get the lifestyle I want. It may even take longer than 1000 days given my decision to not work full-time on a single profitable project, together with my tendency to take more time off than I would if I had a &ldquo;real&rdquo; job. 
But the beauty of this path is that there are no investors breathing down my neck or the feeling of mental rot that comes with a full-time job, so there&rsquo;s really no rush and I can just enjoy the ride.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on x" href="https://x.com/intent/tweet/?text=The%20long%20road%20to%20a%20lifestyle%20business&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&amp;hashtags=business%2cdatascience%2cpersonal"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&amp;title=The%20long%20road%20to%20a%20lifestyle%20business&amp;summary=The%20long%20road%20to%20a%20lifestyle%20business&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&title=The%20long%20road%20to%20a%20lifestyle%20business"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 
21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on whatsapp" href="https://api.whatsapp.com/send?text=The%20long%20road%20to%20a%20lifestyle%20business%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 
31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on telegram" href="https://telegram.me/share/url?text=The%20long%20road%20to%20a%20lifestyle%20business&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 
00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20long%20road%20to%20a%20lifestyle%20business&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="business,data science,personal"><meta name=description content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The long road to a lifestyle business"><meta property="og:description" content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"><meta property="og:image" content="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-03-22T09:43:47+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg"><meta name=twitter:title content="The long road to a lifestyle business"><meta name=twitter:description content="Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The long road to a lifestyle business","item":"https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The long road to a lifestyle business","name":"The long road to a lifestyle business","description":"Progress since leaving my last full-time job and setting on an independent path that includes data science consulting and work on my own projects.","keywords":["business","data science","personal"],"articleBody":"Almost a year ago, I left my last full-time job and decided to set on an independent path that includes data science consulting and work on my own projects. The ultimate goal is not to have to sell my time for money by generating enough passive income to live comfortably. 
My five main areas of focus are – in no particular order – personal branding \u0026 networking, data science contracting, Bandcamp Recommender, Price Dingo, and marine conservation. This post summarises what I’ve been doing in each of these five areas, including highlights and lowlights. So far, it’s way better than having a “real” job. I hope this post will help others who are on a similar journey (there seem to be more and more of us – I’d love to hear from you).\nPersonal branding \u0026 networking Finding clients requires considerably more work than finding a full-time job. As with job hunting, the ideal situation is where people come to you for help, rather than you chasing them. To this end, I’ve been networking a lot, giving talks, writing up posts and working on distributing them. It may be harder than getting a full-time job, but it’s also much more interesting.\nHighlights: going viral in China, getting a post featured in KDNuggets\nLowlights: not having enough time to write all the things and meet all the people\nData science contracting My goal with contracting/consulting is to have a steady income stream while working on my own projects. As my projects are small enough to be done only by me (with optional outsourcing to contractors), this means I have infinite runway to pursue them. While this is probably not the best way of building a Silicon Valley-style startup that is going to make the world a better place, many others have applied this approach to building a so-called lifestyle business, which is what I want to achieve.\nEarly on, I realised that doing full-on consulting would be too time consuming, as many clients expect full-time availability. In addition, constantly needing to find new clients means that not much time would be left for work on my own projects. What I really wanted was a stable part-time gig. 
The first one was with GetUp (who reached out to me following a workshop I gave at General Assembly), where I did some work on forecasting engagement and churn. In parallel, I went through the interview process at DuckDuckGo, which included delivering a piece of work to production. DuckDuckGo ended up wanting me to work full-time (like a few other companies), so last month I started a part-time (three days a week) contract at Commonwealth Bank. I joined a team of very strong data scientists – it looks like it’s going to be interesting.\nHighlights: seeing my DuckDuckGo work every time I search for a Python package, the work environment at GetUp\nLowlights: chasing leads that never eventuated\nBandcamp Recommender (BCRecommender) I’ve written several posts about BCRecommender, my Bandcamp music recommendation project. While I’ve always treated it as a side-project, it’s been useful in learning how to get traction for a product. It now has thousands of monthly users, and is still growing. My goal for BCRecommender has changed from the original one of finding music for myself to growing it enough to be a noticeable source of traffic for Bandcamp, thereby helping artists and fans. Doing it in side-project mode can be a bit challenging at times (because I have so many other things to do and a long list of ideas to make the app better), but I’ve been making gradual progress and discovering a lot of great music in the process.\nHighlights: every time someone gives me positive feedback, every time I listen to music I found using BCRecommender\nLowlights: dealing with Parse issues and random errors\nPrice Dingo The inability to reliably compare prices for many types of products has been bothering me for a while. Unlike general web search, where the main providers rank results by relevance, most Australian price comparison engines still require merchants to pay to even have their products listed. This creates an obvious bias in the results. 
To address this bias, I created Price Dingo – a user-centric price comparison engine. It serves users with results they can trust by not requiring merchants to pay to have their products listed. Just like general web search engines, the main ranking factor is relevancy to the user. This relevancy is also achieved by implementing Price Dingo as a network of independent sites, each focused on a specific product category, with the first category being scuba diving gear.\nImplementing Price Dingo hasn’t been too hard – the main challenge has been finding the time to do it with all the other stuff I’ve been doing. There are still plenty of improvements to be made to the site, but now the main goal is to get enough traction to make ongoing time investment worthwhile. Judging by the experience of Booko’s founder, there is space in the market for niche price comparison sites and apps, so it is just a matter of execution.\nHighlights: being able to finally compare dive gear prices, the joys of integrating Algolia\nLowlights: extracting data from messy websites – I’ve seen some horrible things…\nMarine conservation The first thing I did after leaving my last job was go overseas for five weeks, which included a ten-day visit to Israel (rockets!) and three weeks of conservation diving with New Heaven Dive School in Thailand. Back in Sydney, I joined the Underwater Research Group of NSW, a dive club that’s involved in many marine conservation and research activities, including Reef Life Survey (RLS) and underwater cleanups. With URG, I’ve been diving more than before, and for a change, some of my dives actually do good. I’d love to do this kind of stuff full-time, but there’s a lot less money in getting people to do less stuff (i.e., conservation and sustainability) than in consuming more. 
The compromise for now is that a portion of Price Dingo’s scuba revenue goes to the Australian Marine Conservation Society, and the plan is to expand this to other charities as more categories are added. Update – May 2015: I decided that this compromise isn’t good enough for me, so I shut down Price Dingo to focus on projects that are more aligned with my values.\nHighlights: becoming a certified RLS diver, pretty much every dive\nLowlights: cutting my hand open by falling on rocks on the first day of diving in Thailand\nThe future So far, I’m pretty happy with this not-having-a-job-doing-my-own-thing business. According to The 1000 Day Rule, I still have a long way to go until I get the lifestyle I want. It may even take longer than 1000 days given my decision to not work full-time on a single profitable project, together with my tendency to take more time off than I would if I had a “real” job. But the beauty of this path is that there are no investors breathing down my neck or the feeling of mental rot that comes with a full-time job, so there’s really no rush and I can just enjoy the ride.\n","wordCount":"1202","inLanguage":"en","image":"https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg","datePublished":"2015-03-22T09:43:47Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header 
class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The long road to a lifestyle business</h1><div class=post-meta><span title='2015-03-22 09:43:47 +0000 UTC'>March 22, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-03-22-the-long-road-to-a-lifestyle-business/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager 
srcset="https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track_hu48d06ef732b295416c5a71b75238e67b_1361225_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg 3450w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/overland-track.jpg alt width=3450 height=1730></figure><div class=post-content><p>Almost a year ago, I left my last full-time job and decided to set on an independent path that includes data science consulting and work on my own projects. The ultimate goal is not to <em>have</em> to sell my time for money by generating enough passive income to live comfortably. My five main areas of focus are – in no particular order – personal branding & networking, data science contracting, <a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp Recommender</a>, Price Dingo, and marine conservation. This post summarises what I&rsquo;ve been doing in each of these five areas, including highlights and lowlights. So far, it&rsquo;s way better than having a &ldquo;real&rdquo; job. 
I hope this post will help others who are on a similar journey (there seem to be more and more of us – I&rsquo;d love to hear from you).</p><h3 id=personal-branding--networking>Personal branding & networking<a hidden class=anchor aria-hidden=true href=#personal-branding--networking>#</a></h3><p>Finding clients requires considerably more work than finding a full-time job. As with job hunting, the ideal situation is where people come to you for help, rather than you chasing them. To this end, I&rsquo;ve been networking a lot, giving talks, writing up posts and working on distributing them. It may be harder than getting a full-time job, but it&rsquo;s also much more interesting.</p><p><strong>Highlights:</strong> <a href=http://www.weibo.com/1497035431/BDl53rXDk target=_blank rel=noopener>going viral in China</a>, <a href=http://www.kdnuggets.com/2015/03/10-steps-success-kaggle-data-science-competitions.html target=_blank rel=noopener>getting a post featured in KDNuggets</a><br><strong>Lowlights:</strong> not having enough time to write all the things and meet all the people</p><h3 id=data-science-contracting>Data science contracting<a hidden class=anchor aria-hidden=true href=#data-science-contracting>#</a></h3><p>My goal with contracting/consulting is to have a steady income stream while working on my own projects. As my projects are small enough to be done only by me (with optional outsourcing to contractors), this means I have infinite runway to pursue them. While this is probably not the best way of building a Silicon Valley-style startup that is going to <a href="https://www.youtube.com/watch?v=J-GVd_HLlps" target=_blank rel=noopener>make the world a better place</a>, many others have applied this approach to building a so-called lifestyle business, which is what I want to achieve.</p><p>Early on, I realised that doing full-on consulting would be too time consuming, as many clients expect full-time availability. 
In addition, constantly needing to find new clients means that not much time would be left for work on my own projects. What I really wanted was a stable part-time gig. The first one was with <a href=https://www.getup.org.au/ target=_blank rel=noopener>GetUp</a> (who reached out to me following a workshop I gave at <a href=https://generalassemb.ly/education/demystifying-data-an-introduction-to-data-science target=_blank rel=noopener>General Assembly</a>), where I did some work on forecasting engagement and churn. In parallel, I went through the interview process at <a href=https://duckduckgo.com/ target=_blank rel=noopener>DuckDuckGo</a>, which included <a href=https://github.com/duckduckgo/zeroclickinfo-fathead/pull/95 target=_blank rel=noopener>delivering a piece of work to production</a>. DuckDuckGo ended up wanting me to work full-time (like a few other companies), so last month I started a part-time (three days a week) contract at <a href=https://www.commbank.com.au/ target=_blank rel=noopener>Commonwealth Bank</a>. I joined a team of very strong data scientists – it looks like it&rsquo;s going to be interesting.</p><p><strong>Highlights:</strong> seeing my DuckDuckGo work every time I search for a Python package, the work environment at GetUp<br><strong>Lowlights:</strong> chasing leads that never eventuated</p><h3 id=bandcamp-recommender-bcrecommender>Bandcamp Recommender (BCRecommender)<a hidden class=anchor aria-hidden=true href=#bandcamp-recommender-bcrecommender>#</a></h3><p>I&rsquo;ve written several posts about <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender, my Bandcamp music recommendation project</a>. While I&rsquo;ve always treated it as a side-project, it&rsquo;s been useful in learning how to get traction for a product. It now has thousands of monthly users, and is still growing. 
My goal for BCRecommender has changed from the original one of finding music for myself to growing it enough to be a noticeable source of traffic for Bandcamp, thereby helping artists and fans. Doing it in side-project mode can be a bit challenging at times (because I have so many other things to do and a long list of ideas to make the app better), but I&rsquo;ve been making gradual progress and discovering a lot of great music in the process.</p><p><strong>Highlights:</strong> every time someone gives me positive feedback, every time I listen to music I found using BCRecommender<br><strong>Lowlights:</strong> dealing with <a href=http://parse.com target=_blank rel=noopener>Parse</a> issues and random errors</p><h3 id=price-dingo>Price Dingo<a hidden class=anchor aria-hidden=true href=#price-dingo>#</a></h3><p>The inability to reliably compare prices for many types of products has been bothering me for a while. Unlike general web search, where the main providers rank results by relevance, most Australian price comparison engines still require merchants to pay to even have their products listed. This creates an obvious bias in the results. To address this bias, I created Price Dingo – a user-centric price comparison engine. It serves users with results they can trust by not requiring merchants to pay to have their products listed. Just like general web search engines, the main ranking factor is relevancy to the user. This relevancy is also achieved by implementing Price Dingo as a network of independent sites, each focused on a specific product category, with the first category being scuba diving gear.</p><p>Implementing Price Dingo hasn&rsquo;t been too hard – the main challenge has been finding the time to do it with all the other stuff I&rsquo;ve been doing. There are still plenty of improvements to be made to the site, but now the main goal is to get enough traction to make ongoing time investment worthwhile. 
Judging by the experience of <a href=http://www.booko.com.au target=_blank rel=noopener>Booko&rsquo;s</a> founder, there is space in the market for niche price comparison sites and apps, so it is just a matter of execution.</p><p><strong>Highlights:</strong> being able to finally compare dive gear prices, the joys of integrating <a href=http://www.algolia.com target=_blank rel=noopener>Algolia</a><br><strong>Lowlights:</strong> extracting data from messy websites – I&rsquo;ve seen some horrible things&mldr;</p><h3 id=marine-conservation>Marine conservation<a hidden class=anchor aria-hidden=true href=#marine-conservation>#</a></h3><p>The first thing I did after leaving my last job was go overseas for five weeks, which included a ten-day visit to Israel (rockets!) and three weeks of conservation diving with <a href=http://www.newheavendiveschool.com/marine-conservation-thailand/ target=_blank rel=noopener>New Heaven Dive School in Thailand</a>. Back in Sydney, I joined the <a href=http://www.urgdiveclub.org.au/ target=_blank rel=noopener>Underwater Research Group of NSW</a>, a dive club that&rsquo;s involved in many marine conservation and research activities, including <a href=http://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey (RLS)</a> and <a href=http://www.urgdiveclub.org.au/urg-and-rfa-clean-up-project/ target=_blank rel=noopener>underwater cleanups</a>. With URG, I&rsquo;ve been diving more than before, and for a change, some of my dives actually do good. I&rsquo;d love to do this kind of stuff full-time, but there&rsquo;s a lot less money in getting people to do less stuff (i.e., conservation and sustainability) than in consuming more. The compromise for now is that a portion of Price Dingo&rsquo;s scuba revenue goes to the <a href=http://www.marineconservation.org.au/ target=_blank rel=noopener>Australian Marine Conservation Society</a>, and the plan is to expand this to other charities as more categories are added. 
<strong>Update – May 2015:</strong> I decided that this compromise isn&rsquo;t good enough for me, so I shut down Price Dingo to focus on projects that are more aligned with my values.</p><p><strong>Highlights:</strong> <a href=http://www.urgdiveclub.org.au/reef-life-survey-training-review/ target=_blank rel=noopener>becoming a certified RLS diver</a>, pretty much every dive<br><strong>Lowlights:</strong> cutting my hand open by falling on rocks on the first day of diving in Thailand</p><h3 id=the-future>The future<a hidden class=anchor aria-hidden=true href=#the-future>#</a></h3><p>So far, I&rsquo;m pretty happy with this not-having-a-job-doing-my-own-thing business. According to <a href=http://www.tropicalmba.com/living-the-dream/ target=_blank rel=noopener>The 1000 Day Rule</a>, I still have a long way to go until I get the lifestyle I want. It may even take longer than 1000 days given my decision to not work full-time on a single profitable project, together with my tendency to take more time off than I would if I had a &ldquo;real&rdquo; job. 
But the beauty of this path is that there are no investors breathing down my neck or the feeling of mental rot that comes with a full-time job, so there&rsquo;s really no rush and I can just enjoy the ride.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on x" href="https://x.com/intent/tweet/?text=The%20long%20road%20to%20a%20lifestyle%20business&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&amp;hashtags=business%2cdatascience%2cpersonal"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&amp;title=The%20long%20road%20to%20a%20lifestyle%20business&amp;summary=The%20long%20road%20to%20a%20lifestyle%20business&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f&title=The%20long%20road%20to%20a%20lifestyle%20business"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 
21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on whatsapp" href="https://api.whatsapp.com/send?text=The%20long%20road%20to%20a%20lifestyle%20business%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 
31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on telegram" href="https://telegram.me/share/url?text=The%20long%20road%20to%20a%20lifestyle%20business&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 
00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The long road to a lifestyle business on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20long%20road%20to%20a%20lifestyle%20business&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f03%2f22%2fthe-long-road-to-a-lifestyle-business%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-1049><div class=comment-header><a href=#comment-1049><img class=comment-avatar src="https://www.gravatar.com/avatar/76347f8703adb9394f74bf150edb9b19?s=50"><p class=comment-info><strong>Ralph Haygood</strong><br><small>2016-01-29 06:08:15</small></p></a></div><div class="comment-body post-content">&ldquo;What I really wanted was a stable part-time gig.&rdquo;: They&rsquo;re remarkably hard to find. It&rsquo;s an absurdity of our time that many people are overemployed - selling more of their time than they want for more money than they need - even while many other people are underemployed - unable to sell enough of their time for enough money to live comfortably.</div></div><div class=comment-level-1 id=comment-1052><div class=comment-header><a href=#comment-1052><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2016-01-30 09:27:23</small></p></a></div><div class="comment-body post-content">That&rsquo;s very true. The interesting thing is that it&rsquo;s a problem that is not unique to this century. It was discussed by Thoreau in <a href=https://en.wikipedia.org/wiki/Walden target=_blank rel=nofollow>Walden</a> (1854), Bertrand Russell in <a href=http://www.zpub.com/notes/idle.html target=_blank rel=nofollow>In Praise of Idleness</a> (1932), and David Graeber in <a href=http://strikemag.org/bullshit-jobs/ target=_blank rel=nofollow>On the Phenomenon of Bullshit Jobs</a> (2013), to name a few. 
People seem to be worried about robots taking their jobs, but the scarier thought is that robots will never take our jobs, because we&rsquo;ll keep coming up with ways of staying employed rather than enjoy the affluence afforded by technological advancements.</div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2015/04/24/my-divestment-from-fossil-fuels/index.html b/2015/04/24/my-divestment-from-fossil-fuels/index.html
index fb6a605c4..84e4a2f5f 100644
--- a/2015/04/24/my-divestment-from-fossil-fuels/index.html
+++ b/2015/04/24/my-divestment-from-fossil-fuels/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>My divestment from fossil fuels | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="climate change,divestment,environment,fossil fuels"><meta name=description content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My divestment from fossil fuels"><meta property="og:description" content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally 
applicable lessons."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"><meta property="og:image" content="https://yanirseroussi.com/industry.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-04-24T00:19:36+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/industry.jpg"><meta name=twitter:title content="My divestment from fossil fuels"><meta name=twitter:description content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My divestment from fossil fuels","item":"https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My divestment from fossil fuels","name":"My divestment from fossil fuels","description":"Recent choices I\u0026rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons.","keywords":["climate change","divestment","environment","fossil fuels"],"articleBody":" This post covers recent choices I've made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons. I recently read Naomi Klein’s This Changes Everything, which deeply influenced me. 
The book describes how the world has been dragging its feet when it comes to reducing carbon emissions, and how we are coming very close to a point where climate change is likely to spin out of control. While many of the facts presented in the book can be very depressing, one ray of light is that it is still not too late to act. There are still things we can do to avoid catastrophic climate change.\nOne such thing is divestment from fossil fuels. Fossil fuel companies have committed to extracting (and therefore burning) more than what scientists agree is the safe amount of carbon that can be pumped into the atmosphere. While governments have been rather ineffective in stopping this (the current Australian government is even embarrassingly rolling back emission-reduction measures), divesting your money from such companies can help take away the social licence of these companies to do as they please. Further, this may be a smart investment strategy because the world is moving towards renewable energy. Indeed, according to one index, investors who divested from fossil fuels have had higher returns than conventional investors over the last five years.\nIt’s worth noting that even if you disagree with the scientific consensus that releasing billions of tonnes of greenhouse gases into the atmosphere increases the likelihood of climate change, you should agree that it’d be better to stop breathing all the pollutants that result from burning fossil fuels. Further, the environmental damage that comes with extracting fossil fuels is something worth avoiding. Examples include the Deepwater Horizon oil spill, numerous cases of poisoned water due to fracking, and the potential damage to the Great Barrier Reef due to coal mine expansion. Even climate change deniers would admit that divestment from fossil fuels and a rapid move to clean renewables will prevent such disasters.\nThe rest of this post describes steps I’ve recently taken towards divesting from fossil fuels. 
These are mostly relevant to Australians, though other countries may have similar options.\nSuperannuation In Australia, we have compulsory superannuation (commonly known as super), meaning that most working Australians have some money invested somewhere. As this money is only available at retirement, investors can afford to optimise for long-term returns. Many super funds allow investors to choose what to invest in, and switching funds is relatively straightforward. My super fund is UniSuper. Last week, I switched my plan from Balanced, which includes investments in coal miners Rio Tinto and BHP Billiton, to 75% Sustainable Balanced, which doesn’t directly invest in fossil fuels, and 25% Global Environment Opportunities, which is focused on companies with a green agenda such as Tesla. This switch was very simple – I wish I had done it earlier. If you’re interested in making a similar switch, check out Superswitch’s guide to fossil-free super options.\nEnergy While our previous energy retailer (ClickEnergy) isn’t one of the big three retailers who are actively lobbying the government to reduce the renewable energy target for 2020, my partner and I decided to switch to Powershop, as it appears to be the greenest energy retailer in New South Wales. Powershop supports maintaining the renewable energy target in its current form and provides free carbon offsets for all non-renewable energy. In addition, Powershop allows customers to purchase 100% green power from renewables – an option that we choose to take. With the savings from moving to Powershop and the extra payment for green power, our bill is expected to be more or less the same as before. Everyone wins!\nNote: If you live in New South Wales or Victoria and generally support what GetUp is doing, you can sign up via the links on this page, and GetUp will be paid a referral fee by Powershop.\nBanking There’s been a lot of focus recently on financing provided by the major banks to fossil fuel companies. 
The problem is that – unlike with super and energy – there aren’t many viable alternatives to the big banks. Reading the statements by smaller banks and credit unions, it is clear that they don’t provide financing to polluters just because they’re too small or not focused on commercial lending. Further, some of the smaller banks invest their money with the bigger banks. If the smaller banks were to become big due to the divestment movement, they may end up financing polluters. Unfortunately, changing your bank doesn’t give you more control over how your chosen financial institute uses your money.\nFor now, I think it makes sense to push the banks to become fossil free by putting them on notice or participating in demonstrations. With enough pressure, one of the big banks may make a strong statement against lending to polluters, and then it’ll be time to act on the notices. One thing that the big banks care about is customer satisfaction and public image. Sending a strong message about the connection between financing polluters and satisfaction may be enough to make a difference. I’ll be tracking news in this area and will possibly make a switch in the future, depending on how things evolve.\nTransportation My top transportation choices are cycling and public transport, followed by driving when the former two are highly inconvenient (e.g., when going scuba diving). Every bike ride means less pollution and is a vote against fossil fuels. Further, bike riding is my main form of exercise, so I don’t need to set aside time to go to the gym. Finally, it’s almost free, and it’s also the fastest way of getting to the city from where I live.\nSince January, I’ve been allowing people to borrow my car through Car Next Door. This service, which is currently active in Sydney and Melbourne, allows people to hire their neighbours’ cars, thereby reducing the number of cars on the road. They also carbon offset all the rides taken through the service. 
While making my car available has made using it slightly less convenient (because I need to book it for myself), it’s also saved me money, so far covering the cost of insurance and roadside assistance. With my car sitting idle for 95% of the time before joining Car Next Door, it’s definitely another win-win situation. If you’d like to join Car Next Door as either a borrower or an owner, you can use this link to get $15 credit.\nOther areas and next steps Many of the choices we make every day have the power to reduce energy demand. These choices often make our life better, as seen with the bike riding example above. There’s a lot of material online about these green choices, which I may cover from my angle in another post. In general, I’m planning to be more active in the area of environmentalism. While this may come at the cost of reduced focus on my other activities, I would rather be more a part of the solution than a part of the problem. I’ll update as I go – please subscribe to get notified when updates occur.\n","wordCount":"1209","inLanguage":"en","image":"https://yanirseroussi.com/industry.jpg","datePublished":"2015-04-24T00:19:36Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + 
H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My divestment from fossil fuels</h1><div class=post-meta><span title='2015-04-24 00:19:36 +0000 UTC'>April 24, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-04-24-my-divestment-from-fossil-fuels/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg 1280w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg alt width=1280 height=653></figure><div class=post-content><p class=intro-note>This post covers recent choices I've made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons.</p><p>I recently read <a href=http://thischangeseverything.org/ target=_blank rel=noopener>Naomi Klein&rsquo;s This Changes Everything</a>, which deeply influenced me. The book describes how the world has been dragging its feet when it comes to reducing carbon emissions, and how we are coming very close to a point where climate change is likely to spin out of control. While many of the facts presented in the book can be very depressing, one ray of light is that it is still not too late to act. There are still things we can do to avoid catastrophic climate change.</p><p>One such thing is <a href=http://gofossilfree.org/ target=_blank rel=noopener>divestment from fossil fuels</a>. Fossil fuel companies have committed to extracting (and therefore burning) <a href=https://theconversation.com/unburnable-carbon-why-we-need-to-leave-fossil-fuels-in-the-ground-40467 target=_blank rel=noopener>more than what scientists agree is the safe amount of carbon that can be pumped into the atmosphere</a>. 
While governments have been rather ineffective in stopping this (the current Australian government is even <a href=https://www.facebook.com/theprojecttv/videos/10152808607343441/ target=_blank rel=noopener>embarrassingly rolling back emission-reduction measures</a>), divesting your money from such companies can help take away the social licence of these companies to do as they please. Further, this may be a smart investment strategy because the world is moving towards renewable energy. Indeed, according to one index, <a href=http://www.theguardian.com/environment/2015/apr/10/fossil-fuel-free-funds-out-performed-conventional-ones-analysis-shows target=_blank rel=noopener>investors who divested from fossil fuels have had higher returns than conventional investors over the last five years</a>.</p><p>It&rsquo;s worth noting that even if you disagree with the scientific consensus that releasing <a href=https://en.wikipedia.org/wiki/Greenhouse_gas target=_blank rel=noopener>billions of tonnes of greenhouse gases</a> into the atmosphere increases the likelihood of climate change, you should agree that it&rsquo;d be better to stop breathing all the pollutants that result from burning fossil fuels. Further, the environmental damage that comes with extracting fossil fuels is something worth avoiding. Examples include <a href=https://en.wikipedia.org/wiki/Deepwater_Horizon_oil_spill target=_blank rel=noopener>the Deepwater Horizon oil spill</a>, <a href=https://en.wikipedia.org/wiki/Environmental_impact_of_hydraulic_fracturing target=_blank rel=noopener>numerous cases of poisoned water due to fracking</a>, and <a href=http://fightforthereef.org.au/ target=_blank rel=noopener>the potential damage to the Great Barrier Reef due to coal mine expansion</a>. 
Even climate change deniers would admit that divestment from fossil fuels and a rapid move to clean renewables will prevent such disasters.</p><p>The rest of this post describes steps I&rsquo;ve recently taken towards divesting from fossil fuels. These are mostly relevant to Australians, though other countries may have similar options.</p><h3 id=superannuation>Superannuation<a hidden class=anchor aria-hidden=true href=#superannuation>#</a></h3><p>In Australia, we have <a href=https://en.wikipedia.org/wiki/Superannuation_in_Australia target=_blank rel=noopener>compulsory superannuation</a> (commonly known as <em>super</em>), meaning that most working Australians have some money invested somewhere. As this money is only available at retirement, investors can afford to optimise for long-term returns. Many super funds allow investors to choose what to invest in, and switching funds is relatively straightforward. My super fund is <a href=http://www.unisuper.com.au/ target=_blank rel=noopener>UniSuper</a>. Last week, I switched my plan from <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/balanced target=_blank rel=noopener>Balanced</a>, which includes investments in coal miners Rio Tinto and BHP Billiton, to 75% <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/sustainable-balanced target=_blank rel=noopener>Sustainable Balanced</a>, which doesn&rsquo;t directly invest in fossil fuels, and 25% <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/global-environmental-opportunities target=_blank rel=noopener>Global Environment Opportunities</a>, which is focused on companies with a green agenda such as Tesla. This switch was very simple – I wish I had done it earlier. 
If you&rsquo;re interested in making a similar switch, check out <a href=http://superswitch.org.au/ target=_blank rel=noopener>Superswitch&rsquo;s guide to fossil-free super options</a>.</p><h3 id=energy>Energy<a hidden class=anchor aria-hidden=true href=#energy>#</a></h3><p>While our previous energy retailer (ClickEnergy) isn&rsquo;t one of the big three retailers <a href=https://www.getup.org.au/campaigns/renewable-energy/send-the-dirty-three-a-message/hit-the-dirty-three-where-it-hurts target=_blank rel=noopener>who are actively lobbying the government to reduce the renewable energy target for 2020</a>, my partner and I decided to switch to <a href=http://www.powershop.com.au/ target=_blank rel=noopener>Powershop</a>, as it appears to be the greenest energy retailer in New South Wales. Powershop <a href=http://www.powershop.com.au/renewables/ target=_blank rel=noopener>supports maintaining the renewable energy target in its current form</a> and provides free carbon offsets for all non-renewable energy. In addition, Powershop allows customers to purchase 100% green power from renewables – an option that we choose to take. With the savings from moving to Powershop and the extra payment for green power, our bill is expected to be more or less the same as before. Everyone wins!</p><p>Note: If you live in New South Wales or Victoria and generally support what GetUp is doing, you can sign up via <a href=https://www.getup.org.au/campaigns/renewable-energy/send-the-dirty-three-a-message/hit-the-dirty-three-where-it-hurts target=_blank rel=noopener>the links on this page</a>, and GetUp will be paid a referral fee by Powershop.</p><h3 id=banking>Banking<a hidden class=anchor aria-hidden=true href=#banking>#</a></h3><p>There&rsquo;s been a lot of focus recently on <a href=http://gofossilfree.org.au/fossil-free-banks/ target=_blank rel=noopener>financing provided by the major banks to fossil fuel companies</a>. 
The problem is that – unlike with super and energy – there aren&rsquo;t many viable alternatives to the big banks. Reading the <a href=http://www.marketforces.org.au/banks/compare target=_blank rel=noopener>statements by smaller banks and credit unions</a>, it is clear that they don&rsquo;t provide financing to polluters just because they&rsquo;re too small or not focused on commercial lending. Further, some of the smaller banks invest their money with the bigger banks. If the smaller banks were to become big due to the divestment movement, they may end up financing polluters. Unfortunately, changing your bank doesn&rsquo;t give you more control over how your chosen financial institute uses your money.</p><p>For now, I think it makes sense to push the banks to become fossil free by <a href=http://action.marketforces.org.au/page/s/banks-on-notice target=_blank rel=noopener>putting them on notice</a> or <a href=http://act.350.org/event/CBA_Week_of_Action/ target=_blank rel=noopener>participating in demonstrations</a>. With enough pressure, one of the big banks may make a strong statement against lending to polluters, and then it&rsquo;ll be time to act on the notices. One thing that the big banks care about is <a href=http://www.roymorgan.com/findings/6028-consumer-sat-with-banks-close-to-record-high-201501262213 target=_blank rel=noopener>customer satisfaction</a> and public image. Sending a strong message about the connection between financing polluters and satisfaction may be enough to make a difference. I&rsquo;ll be tracking news in this area and will possibly make a switch in the future, depending on how things evolve.</p><h3 id=transportation>Transportation<a hidden class=anchor aria-hidden=true href=#transportation>#</a></h3><p>My top transportation choices are cycling and public transport, followed by driving when the former two are highly inconvenient (e.g., when going scuba diving). Every bike ride means less pollution and is a vote against fossil fuels. 
Further, bike riding is my main form of exercise, so I don&rsquo;t need to set aside time to go to the gym. Finally, it&rsquo;s almost free, and it&rsquo;s also the fastest way of getting to the city from where I live.</p><p>Since January, I&rsquo;ve been allowing people to borrow my car through Car Next Door. This service, which is currently active in Sydney and Melbourne, allows people to hire their neighbours&rsquo; cars, thereby reducing the number of cars on the road. They also <a href=http://www.carnextdoor.com.au/carbon-offset/ target=_blank rel=noopener>carbon offset all the rides taken through the service</a>. While making my car available has made using it slightly less convenient (because I need to book it for myself), it&rsquo;s also saved me money, so far covering the cost of insurance and roadside assistance. With my car sitting idle for 95% of the time before joining Car Next Door, it&rsquo;s definitely another win-win situation. If you&rsquo;d like to join Car Next Door as either a borrower or an owner, you can <a href="http://carnextdoor.ontraport.net/t?orid=26287&opid=2" target=_blank rel=noopener>use this link to get $15 credit</a>.</p><h3 id=other-areas-and-next-steps>Other areas and next steps<a hidden class=anchor aria-hidden=true href=#other-areas-and-next-steps>#</a></h3><p>Many of the choices we make every day have the power to reduce energy demand. These choices often make our life better, as seen with the bike riding example above. There&rsquo;s a lot of material online about these green choices, which I may cover from my angle in another post. In general, I&rsquo;m planning to be more active in the area of environmentalism. While this may come at the cost of reduced focus on <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/ title="The long road to a lifestyle business">my other activities</a>, I would rather be more a part of the solution than a part of the problem. 
I&rsquo;ll update as I go – please subscribe to get notified when updates occur.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/climate-change/>climate change</a></li><li><a href=https://yanirseroussi.com/tags/divestment/>divestment</a></li><li><a href=https://yanirseroussi.com/tags/environment/>environment</a></li><li><a href=https://yanirseroussi.com/tags/fossil-fuels/>fossil fuels</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on x" href="https://x.com/intent/tweet/?text=My%20divestment%20from%20fossil%20fuels&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&amp;hashtags=climatechange%2cdivestment%2cenvironment%2cfossilfuels"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&amp;title=My%20divestment%20from%20fossil%20fuels&amp;summary=My%20divestment%20from%20fossil%20fuels&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 
423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&title=My%20divestment%20from%20fossil%20fuels"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 
59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on whatsapp" href="https://api.whatsapp.com/send?text=My%20divestment%20from%20fossil%20fuels%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on telegram" href="https://telegram.me/share/url?text=My%20divestment%20from%20fossil%20fuels&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 
010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20divestment%20from%20fossil%20fuels&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="climate change,divestment,environment,fossil fuels"><meta name=description content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My divestment from fossil fuels"><meta property="og:description" content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally 
applicable lessons."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"><meta property="og:image" content="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-04-24T00:19:36+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg"><meta name=twitter:title content="My divestment from fossil fuels"><meta name=twitter:description content="Recent choices I&rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My divestment from fossil fuels","item":"https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My divestment from fossil fuels","name":"My divestment from fossil fuels","description":"Recent choices I\u0026rsquo;ve made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons.","keywords":["climate change","divestment","environment","fossil fuels"],"articleBody":" This post covers recent choices I've made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons. I recently read Naomi Klein’s This Changes Everything, which deeply influenced me. 
The book describes how the world has been dragging its feet when it comes to reducing carbon emissions, and how we are coming very close to a point where climate change is likely to spin out of control. While many of the facts presented in the book can be very depressing, one ray of light is that it is still not too late to act. There are still things we can do to avoid catastrophic climate change.\nOne such thing is divestment from fossil fuels. Fossil fuel companies have committed to extracting (and therefore burning) more than what scientists agree is the safe amount of carbon that can be pumped into the atmosphere. While governments have been rather ineffective in stopping this (the current Australian government is even embarrassingly rolling back emission-reduction measures), divesting your money from such companies can help take away the social licence of these companies to do as they please. Further, this may be a smart investment strategy because the world is moving towards renewable energy. Indeed, according to one index, investors who divested from fossil fuels have had higher returns than conventional investors over the last five years.\nIt’s worth noting that even if you disagree with the scientific consensus that releasing billions of tonnes of greenhouse gases into the atmosphere increases the likelihood of climate change, you should agree that it’d be better to stop breathing all the pollutants that result from burning fossil fuels. Further, the environmental damage that comes with extracting fossil fuels is something worth avoiding. Examples include the Deepwater Horizon oil spill, numerous cases of poisoned water due to fracking, and the potential damage to the Great Barrier Reef due to coal mine expansion. Even climate change deniers would admit that divestment from fossil fuels and a rapid move to clean renewables will prevent such disasters.\nThe rest of this post describes steps I’ve recently taken towards divesting from fossil fuels. 
These are mostly relevant to Australians, though other countries may have similar options.\nSuperannuation In Australia, we have compulsory superannuation (commonly known as super), meaning that most working Australians have some money invested somewhere. As this money is only available at retirement, investors can afford to optimise for long-term returns. Many super funds allow investors to choose what to invest in, and switching funds is relatively straightforward. My super fund is UniSuper. Last week, I switched my plan from Balanced, which includes investments in coal miners Rio Tinto and BHP Billiton, to 75% Sustainable Balanced, which doesn’t directly invest in fossil fuels, and 25% Global Environment Opportunities, which is focused on companies with a green agenda such as Tesla. This switch was very simple – I wish I had done it earlier. If you’re interested in making a similar switch, check out Superswitch’s guide to fossil-free super options.\nEnergy While our previous energy retailer (ClickEnergy) isn’t one of the big three retailers who are actively lobbying the government to reduce the renewable energy target for 2020, my partner and I decided to switch to Powershop, as it appears to be the greenest energy retailer in New South Wales. Powershop supports maintaining the renewable energy target in its current form and provides free carbon offsets for all non-renewable energy. In addition, Powershop allows customers to purchase 100% green power from renewables – an option that we choose to take. With the savings from moving to Powershop and the extra payment for green power, our bill is expected to be more or less the same as before. Everyone wins!\nNote: If you live in New South Wales or Victoria and generally support what GetUp is doing, you can sign up via the links on this page, and GetUp will be paid a referral fee by Powershop.\nBanking There’s been a lot of focus recently on financing provided by the major banks to fossil fuel companies. 
The problem is that – unlike with super and energy – there aren’t many viable alternatives to the big banks. Reading the statements by smaller banks and credit unions, it is clear that they don’t provide financing to polluters just because they’re too small or not focused on commercial lending. Further, some of the smaller banks invest their money with the bigger banks. If the smaller banks were to become big due to the divestment movement, they may end up financing polluters. Unfortunately, changing your bank doesn’t give you more control over how your chosen financial institute uses your money.\nFor now, I think it makes sense to push the banks to become fossil free by putting them on notice or participating in demonstrations. With enough pressure, one of the big banks may make a strong statement against lending to polluters, and then it’ll be time to act on the notices. One thing that the big banks care about is customer satisfaction and public image. Sending a strong message about the connection between financing polluters and satisfaction may be enough to make a difference. I’ll be tracking news in this area and will possibly make a switch in the future, depending on how things evolve.\nTransportation My top transportation choices are cycling and public transport, followed by driving when the former two are highly inconvenient (e.g., when going scuba diving). Every bike ride means less pollution and is a vote against fossil fuels. Further, bike riding is my main form of exercise, so I don’t need to set aside time to go to the gym. Finally, it’s almost free, and it’s also the fastest way of getting to the city from where I live.\nSince January, I’ve been allowing people to borrow my car through Car Next Door. This service, which is currently active in Sydney and Melbourne, allows people to hire their neighbours’ cars, thereby reducing the number of cars on the road. They also carbon offset all the rides taken through the service. 
While making my car available has made using it slightly less convenient (because I need to book it for myself), it’s also saved me money, so far covering the cost of insurance and roadside assistance. With my car sitting idle for 95% of the time before joining Car Next Door, it’s definitely another win-win situation. If you’d like to join Car Next Door as either a borrower or an owner, you can use this link to get $15 credit.\nOther areas and next steps Many of the choices we make every day have the power to reduce energy demand. These choices often make our life better, as seen with the bike riding example above. There’s a lot of material online about these green choices, which I may cover from my angle in another post. In general, I’m planning to be more active in the area of environmentalism. While this may come at the cost of reduced focus on my other activities, I would rather be more a part of the solution than a part of the problem. I’ll update as I go – please subscribe to get notified when updates occur.\n","wordCount":"1209","inLanguage":"en","image":"https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg","datePublished":"2015-04-24T00:19:36Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir 
Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My divestment from fossil fuels</h1><div class=post-meta><span title='2015-04-24 00:19:36 +0000 UTC'>April 24, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-04-24-my-divestment-from-fossil-fuels/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry_hu578451d39f2ee65bac6accbf307997d3_141026_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg 1280w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/industry.jpg alt width=1280 height=653></figure><div class=post-content><p class=intro-note>This post covers recent choices I've made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons.</p><p>I recently read <a href=http://thischangeseverything.org/ target=_blank rel=noopener>Naomi Klein&rsquo;s This Changes Everything</a>, which deeply influenced me. The book describes how the world has been dragging its feet when it comes to reducing carbon emissions, and how we are coming very close to a point where climate change is likely to spin out of control. While many of the facts presented in the book can be very depressing, one ray of light is that it is still not too late to act. There are still things we can do to avoid catastrophic climate change.</p><p>One such thing is <a href=http://gofossilfree.org/ target=_blank rel=noopener>divestment from fossil fuels</a>. Fossil fuel companies have committed to extracting (and therefore burning) <a href=https://theconversation.com/unburnable-carbon-why-we-need-to-leave-fossil-fuels-in-the-ground-40467 target=_blank rel=noopener>more than what scientists agree is the safe amount of carbon that can be pumped into the atmosphere</a>. 
While governments have been rather ineffective in stopping this (the current Australian government is even <a href=https://www.facebook.com/theprojecttv/videos/10152808607343441/ target=_blank rel=noopener>embarrassingly rolling back emission-reduction measures</a>), divesting your money from such companies can help take away the social licence of these companies to do as they please. Further, this may be a smart investment strategy because the world is moving towards renewable energy. Indeed, according to one index, <a href=http://www.theguardian.com/environment/2015/apr/10/fossil-fuel-free-funds-out-performed-conventional-ones-analysis-shows target=_blank rel=noopener>investors who divested from fossil fuels have had higher returns than conventional investors over the last five years</a>.</p><p>It&rsquo;s worth noting that even if you disagree with the scientific consensus that releasing <a href=https://en.wikipedia.org/wiki/Greenhouse_gas target=_blank rel=noopener>billions of tonnes of greenhouse gases</a> into the atmosphere increases the likelihood of climate change, you should agree that it&rsquo;d be better to stop breathing all the pollutants that result from burning fossil fuels. Further, the environmental damage that comes with extracting fossil fuels is something worth avoiding. Examples include <a href=https://en.wikipedia.org/wiki/Deepwater_Horizon_oil_spill target=_blank rel=noopener>the Deepwater Horizon oil spill</a>, <a href=https://en.wikipedia.org/wiki/Environmental_impact_of_hydraulic_fracturing target=_blank rel=noopener>numerous cases of poisoned water due to fracking</a>, and <a href=http://fightforthereef.org.au/ target=_blank rel=noopener>the potential damage to the Great Barrier Reef due to coal mine expansion</a>. 
Even climate change deniers would admit that divestment from fossil fuels and a rapid move to clean renewables will prevent such disasters.</p><p>The rest of this post describes steps I&rsquo;ve recently taken towards divesting from fossil fuels. These are mostly relevant to Australians, though other countries may have similar options.</p><h3 id=superannuation>Superannuation<a hidden class=anchor aria-hidden=true href=#superannuation>#</a></h3><p>In Australia, we have <a href=https://en.wikipedia.org/wiki/Superannuation_in_Australia target=_blank rel=noopener>compulsory superannuation</a> (commonly known as <em>super</em>), meaning that most working Australians have some money invested somewhere. As this money is only available at retirement, investors can afford to optimise for long-term returns. Many super funds allow investors to choose what to invest in, and switching funds is relatively straightforward. My super fund is <a href=http://www.unisuper.com.au/ target=_blank rel=noopener>UniSuper</a>. Last week, I switched my plan from <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/balanced target=_blank rel=noopener>Balanced</a>, which includes investments in coal miners Rio Tinto and BHP Billiton, to 75% <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/sustainable-balanced target=_blank rel=noopener>Sustainable Balanced</a>, which doesn&rsquo;t directly invest in fossil fuels, and 25% <a href=http://www.unisuper.com.au/investments/investment-options-and-performance/super-performance-and-option-holdings/global-environmental-opportunities target=_blank rel=noopener>Global Environment Opportunities</a>, which is focused on companies with a green agenda such as Tesla. This switch was very simple – I wish I had done it earlier. 
If you&rsquo;re interested in making a similar switch, check out <a href=http://superswitch.org.au/ target=_blank rel=noopener>Superswitch&rsquo;s guide to fossil-free super options</a>.</p><h3 id=energy>Energy<a hidden class=anchor aria-hidden=true href=#energy>#</a></h3><p>While our previous energy retailer (ClickEnergy) isn&rsquo;t one of the big three retailers <a href=https://www.getup.org.au/campaigns/renewable-energy/send-the-dirty-three-a-message/hit-the-dirty-three-where-it-hurts target=_blank rel=noopener>who are actively lobbying the government to reduce the renewable energy target for 2020</a>, my partner and I decided to switch to <a href=http://www.powershop.com.au/ target=_blank rel=noopener>Powershop</a>, as it appears to be the greenest energy retailer in New South Wales. Powershop <a href=http://www.powershop.com.au/renewables/ target=_blank rel=noopener>supports maintaining the renewable energy target in its current form</a> and provides free carbon offsets for all non-renewable energy. In addition, Powershop allows customers to purchase 100% green power from renewables – an option that we choose to take. With the savings from moving to Powershop and the extra payment for green power, our bill is expected to be more or less the same as before. Everyone wins!</p><p>Note: If you live in New South Wales or Victoria and generally support what GetUp is doing, you can sign up via <a href=https://www.getup.org.au/campaigns/renewable-energy/send-the-dirty-three-a-message/hit-the-dirty-three-where-it-hurts target=_blank rel=noopener>the links on this page</a>, and GetUp will be paid a referral fee by Powershop.</p><h3 id=banking>Banking<a hidden class=anchor aria-hidden=true href=#banking>#</a></h3><p>There&rsquo;s been a lot of focus recently on <a href=http://gofossilfree.org.au/fossil-free-banks/ target=_blank rel=noopener>financing provided by the major banks to fossil fuel companies</a>. 
The problem is that – unlike with super and energy – there aren&rsquo;t many viable alternatives to the big banks. Reading the <a href=http://www.marketforces.org.au/banks/compare target=_blank rel=noopener>statements by smaller banks and credit unions</a>, it is clear that they don&rsquo;t provide financing to polluters just because they&rsquo;re too small or not focused on commercial lending. Further, some of the smaller banks invest their money with the bigger banks. If the smaller banks were to become big due to the divestment movement, they may end up financing polluters. Unfortunately, changing your bank doesn&rsquo;t give you more control over how your chosen financial institute uses your money.</p><p>For now, I think it makes sense to push the banks to become fossil free by <a href=http://action.marketforces.org.au/page/s/banks-on-notice target=_blank rel=noopener>putting them on notice</a> or <a href=http://act.350.org/event/CBA_Week_of_Action/ target=_blank rel=noopener>participating in demonstrations</a>. With enough pressure, one of the big banks may make a strong statement against lending to polluters, and then it&rsquo;ll be time to act on the notices. One thing that the big banks care about is <a href=http://www.roymorgan.com/findings/6028-consumer-sat-with-banks-close-to-record-high-201501262213 target=_blank rel=noopener>customer satisfaction</a> and public image. Sending a strong message about the connection between financing polluters and satisfaction may be enough to make a difference. I&rsquo;ll be tracking news in this area and will possibly make a switch in the future, depending on how things evolve.</p><h3 id=transportation>Transportation<a hidden class=anchor aria-hidden=true href=#transportation>#</a></h3><p>My top transportation choices are cycling and public transport, followed by driving when the former two are highly inconvenient (e.g., when going scuba diving). Every bike ride means less pollution and is a vote against fossil fuels. 
Further, bike riding is my main form of exercise, so I don&rsquo;t need to set aside time to go to the gym. Finally, it&rsquo;s almost free, and it&rsquo;s also the fastest way of getting to the city from where I live.</p><p>Since January, I&rsquo;ve been allowing people to borrow my car through Car Next Door. This service, which is currently active in Sydney and Melbourne, allows people to hire their neighbours&rsquo; cars, thereby reducing the number of cars on the road. They also <a href=http://www.carnextdoor.com.au/carbon-offset/ target=_blank rel=noopener>carbon offset all the rides taken through the service</a>. While making my car available has made using it slightly less convenient (because I need to book it for myself), it&rsquo;s also saved me money, so far covering the cost of insurance and roadside assistance. With my car sitting idle for 95% of the time before joining Car Next Door, it&rsquo;s definitely another win-win situation. If you&rsquo;d like to join Car Next Door as either a borrower or an owner, you can <a href="http://carnextdoor.ontraport.net/t?orid=26287&opid=2" target=_blank rel=noopener>use this link to get $15 credit</a>.</p><h3 id=other-areas-and-next-steps>Other areas and next steps<a hidden class=anchor aria-hidden=true href=#other-areas-and-next-steps>#</a></h3><p>Many of the choices we make every day have the power to reduce energy demand. These choices often make our life better, as seen with the bike riding example above. There&rsquo;s a lot of material online about these green choices, which I may cover from my angle in another post. In general, I&rsquo;m planning to be more active in the area of environmentalism. While this may come at the cost of reduced focus on <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/ title="The long road to a lifestyle business">my other activities</a>, I would rather be more a part of the solution than a part of the problem. 
I&rsquo;ll update as I go – please subscribe to get notified when updates occur.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/climate-change/>climate change</a></li><li><a href=https://yanirseroussi.com/tags/divestment/>divestment</a></li><li><a href=https://yanirseroussi.com/tags/environment/>environment</a></li><li><a href=https://yanirseroussi.com/tags/fossil-fuels/>fossil fuels</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on x" href="https://x.com/intent/tweet/?text=My%20divestment%20from%20fossil%20fuels&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&amp;hashtags=climatechange%2cdivestment%2cenvironment%2cfossilfuels"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&amp;title=My%20divestment%20from%20fossil%20fuels&amp;summary=My%20divestment%20from%20fossil%20fuels&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 
423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f&title=My%20divestment%20from%20fossil%20fuels"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 
59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on whatsapp" href="https://api.whatsapp.com/send?text=My%20divestment%20from%20fossil%20fuels%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on telegram" href="https://telegram.me/share/url?text=My%20divestment%20from%20fossil%20fuels&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 
010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My divestment from fossil fuels on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20divestment%20from%20fossil%20fuels&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f04%2f24%2fmy-divestment-from-fossil-fuels%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-3139><div class=comment-header><a href=#comment-3139><img class=comment-avatar src="https://www.gravatar.com/avatar/ddeb90456a90eeded4b5ca639d404d1b?s=50"><p class=comment-info><strong>Alex</strong><br><small>2018-12-25 13:44:02</small></p></a></div><div class="comment-body post-content">Thanks for sharing your standpoint on this.</div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html b/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html
index 64219a272..f3ae76b5f 100644
--- a/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html
+++ b/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>First steps in data science: author-aware sentiment analysis | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,machine learning,predictive modelling,sentiment analysis,software engineering"><meta name=description content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="First steps in data science: author-aware sentiment analysis"><meta property="og:description" content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal 
education program."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"><meta property="og:image" content="https://yanirseroussi.com/kitten-first-steps.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-05-02T08:31:10+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/kitten-first-steps.jpg"><meta name=twitter:title content="First steps in data science: author-aware sentiment analysis"><meta name=twitter:description content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"First steps in data science: author-aware sentiment analysis","item":"https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"First steps in data science: author-aware sentiment analysis","name":"First steps in data science: author-aware sentiment analysis","description":"I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program.","keywords":["data science","machine learning","predictive modelling","sentiment analysis","software engineering"],"articleBody":"People often ask me what’s the best way of becoming a data scientist. 
The way I got there was by first becoming a software engineer and then doing a PhD in what was essentially data science (before it became such a popular term). This post describes my first steps in the field with the goal of helping others who are interested in making the transition from pure software engineering to data science.\nWhile my first steps were in a PhD program, I don’t think that going through the formal PhD process is necessary if you wish to become a data scientist. Self-motivated individuals can get very far by making use of the abundance of learning resources available online. In fact, one can make progress much faster than in a PhD, because PhD programs have many overheads.\nThis post is organised as a list of steps. Despite the sequential numbering, many steps can be done in parallel. These steps roughly recount the work I’ve done to publish my first paper, which was co-authored by Ingrid Zukerman and Fabian Bohnert. Most of the technical details are intentionally omitted. Readers who are interested in learning more are invited to read the original paper or chapter 6 in my thesis, which includes more thorough experiments and explanations.\nStep one: Find a problem to work on Even if you know nothing about the machine learning and statistics side of data science, it’s important to find a problem to work on. Ideally it’d be something you find personally interesting, as this helps with motivation. You could use a predefined problem such as a Kaggle competition or one of the UCI datasets. Alternatively, you could collect the data yourself to make things a bit more challenging.\nIn my case, I was interested in natural language processing and user modelling. My supervisor was given a grant to work on sentiment analysis of opinion polls, which was my first direction of research. 
This quickly changed to focus on the connection between authors and the way they express their sentiments, with the application of harnessing this connection to improve the accuracy of sentiment analysis algorithms. For the purpose of this research, I collected a dataset of texts by the most prolific IMDb users. The problem was to infer the ratings these users assigned to their own reviews, with the hypothesis that methods that take author identity into account would outperform methods that ignore authorship information.\nStep two: Close your knowledge gaps Whatever problem you choose, you will have some knowledge gaps that require filling. Wikipedia, textbooks, and online courses will be your best guide for foundational areas like machine learning and statistics. Reading academic papers is often required to get a better understanding of recent work on the specific problem you’re trying to solve.\nDoing a PhD afforded me the luxury of spending about a month just reading papers. Most of the ~200 papers I read were on sentiment analysis, which gave me a good overview of what’s been done in the field. However, the best thing I’ve done was to stop reading and move on to working on the problem. This is also the best advice I can give: there’s no better way to learn than getting your hands dirty working on a problem.\nStep three: Get your hands dirty With a well-defined problem and the knowledge gaps more-or-less closed, it is time to come up with a plan and implement it. Due to my background in software engineering and some exposure to early collaborative filtering approaches to recommender systems, my plan was very much a part of what Leo Breiman called the algorithmic modelling culture. That is, I was more focused on developing algorithms that work than on modelling the process that generated the data. 
This approach is arguably more in line with the mindset that software engineers tend to have than with the approach of mathematicians and statisticians.\nThe plan was quite simple:\nReproduce results that showed that rating inference models trained on enough texts by the target author (i.e., the author who wrote the text whose rating we want to predict) outperform models trained on texts by multiple authors Use an approach inspired by collaborative filtering to combine multiple single-author models to infer ratings for texts by the target author, where those models are weighted by similarity to the target author Experiment with multiple similarity measurements under various constraints on the number of texts available by the training and target authors Iterate on these ideas until the results are publishable The rationale behind this plan was that while different people express their sentiments differently, similar people would express their sentiments similarly (e.g., use of understatements varies by culture). The key motivation was Pang and Lee’s finding that a model trained on a single author is best if we have enough texts by this author.\nThe way I implemented the plan was vastly different from how I’d do it today. This was 2009, and using Java with the Weka package for the core modelling seemed like a huge improvement over the C/C++ I was used to. I relied heavily on the university grid to run experiments and wrote a bunch of code to handle experimental logic, including some Perl scripts for post-processing. It ended up being pretty messy, but it worked and I got publishable results. If I were to do the same work today, I’d use Python for everything. IPython Notebook is a great way of keeping track of experimental work, and Python packages like pandas, scikit-learn, gensim, TextBlob, etc. 
are mature and easy to use for data science applications.\nStep four: Publish your results Having a deadline for publishing results can be stressful, but it has two positive outcomes. First, making your work public allows you to obtain valuable feedback. Second, hard deadlines are great in making you work towards a tangible goal. You can always keep iterating to get infinitesimal improvements, but publication deadlines force you to decide that you’ve done enough.\nIn my case, the deadline for the UMAP 2010 conference and the promise of a free trip to Hawaii served as excellent motivators. But even if you don’t have the time or energy to get an academic paper published, you should set yourself a deadline to publish something on a blog or a forum, or even as a report to a mentor who can assess your work. Receiving continuous feedback is a key factor in improvement, so release early and release often.\nStep five: Improve results or move on Congratulations! You have published the results of your study. What now? You can either keep working on the same problem – try more approaches, add more data, change the constraints, etc. Or you can move on to work on other problems that interest you.\nIn my case, I had to go back to iterate on the results of the first paper because of things I learned later. I ended up rerunning all the experiments to make things fit together into a more-or-less coherent story for the thesis (writing a thesis is one of the main overheads that comes with doing a PhD). If I had a choice, I wouldn’t have done that. I would instead have pursued more sensible enhancements to the work presented in the paper, such as using the author as a feature, employing more robust ensemble methods, and testing different base methods than support vector machines. Nonetheless, I still think that the core idea – that the identity of authors should be taken into account in sentiment analysis – is still relevant and viable today. 
But I’ve taken my own advice and moved on.\n","wordCount":"1274","inLanguage":"en","image":"https://yanirseroussi.com/kitten-first-steps.jpg","datePublished":"2015-05-02T08:31:10Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">First steps in data science: author-aware sentiment analysis</h1><div class=post-meta><span title='2015-05-02 08:31:10 +0000 UTC'>May 2, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-05-02-first-steps-in-data-science-author-aware-sentiment-analysis/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps_hu71ce6d56294695860e76fe8bc29b8d4b_64845_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps_hu71ce6d56294695860e76fe8bc29b8d4b_64845_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg 635w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg alt width=635 height=220></figure><div class=post-content><p>People often ask me what&rsquo;s the best way of becoming a <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/ title="What is data science?" target=_blank rel=noopener>data scientist</a>. The way I got there was by first becoming a software engineer and then doing a PhD in what was essentially data science (before it became such a popular term). 
This post describes my first steps in the field with the goal of helping others who are interested in making the transition from pure software engineering to data science.</p><p>While my first steps were in a <a href=https://yanirseroussi.com/phd-work/ title="PhD Work" target=_blank rel=noopener>PhD program</a>, I don&rsquo;t think that going through the formal PhD process is necessary if you wish to become a data scientist. Self-motivated individuals can get very far by making use of the abundance of learning resources available online. In fact, one can make progress much faster than in a PhD, because PhD programs have many overheads.</p><p>This post is organised as a list of steps. Despite the sequential numbering, many steps can be done in parallel. These steps roughly recount the work I&rsquo;ve done to publish my first paper, which was co-authored by <a href=http://users.monash.edu/~ingrid/ target=_blank rel=noopener>Ingrid Zukerman</a> and <a href=https://sites.google.com/a/bohnert.eu/fabian-bohnert/ target=_blank rel=noopener>Fabian Bohnert</a>. Most of the technical details are intentionally omitted. Readers who are interested in learning more are invited to read the <a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiZukermanBohnert2010b.pdf title="Collaborative Inference of Sentiments from Texts" target=_blank rel=noopener>original paper</a> or chapter 6 in <a href=http://arrow.monash.edu.au/vital/access/services/Download/monash:89860/THESIS01 title="Text Mining and Rating Prediction with Topical User Models" target=_blank rel=noopener>my thesis</a>, which includes more thorough experiments and explanations.</p><h3 id=step-one-find-a-problem-to-work-on>Step one: Find a problem to work on<a hidden class=anchor aria-hidden=true href=#step-one-find-a-problem-to-work-on>#</a></h3><p>Even if you know nothing about the machine learning and statistics side of data science, it&rsquo;s important to find a problem to work on. 
Ideally it&rsquo;d be something you find personally interesting, as this helps with motivation. You could use a predefined problem such as a <a href=http://www.kaggle.com/competitions target=_blank rel=noopener>Kaggle competition</a> or one of the <a href=http://archive.ics.uci.edu/ml/datasets.html target=_blank rel=noopener>UCI datasets</a>. Alternatively, you could collect the data yourself to make things a bit more challenging.</p><p>In my case, I was interested in <a href=http://www.csse.monash.edu.au/research/umnl/ target=_blank rel=noopener>natural language processing and user modelling</a>. My supervisor was given a grant to work on <a href=https://en.wikipedia.org/wiki/Sentiment_analysis target=_blank rel=noopener>sentiment analysis</a> of opinion polls, which was my first direction of research. This quickly changed to focus on the connection between authors and the way they express their sentiments, with the application of harnessing this connection to improve the accuracy of sentiment analysis algorithms. For the purpose of this research, I collected a dataset of texts by the most prolific <a href=http://www.imdb.com/ target=_blank rel=noopener>IMDb</a> users. The problem was to infer the ratings these users assigned to their own reviews, with the hypothesis that methods that take author identity into account would outperform methods that ignore authorship information.</p><h3 id=step-two-close-your-knowledge-gaps>Step two: Close your knowledge gaps<a hidden class=anchor aria-hidden=true href=#step-two-close-your-knowledge-gaps>#</a></h3><p>Whatever problem you choose, you will have some knowledge gaps that require filling. Wikipedia, textbooks, and online courses will be your best guide for foundational areas like machine learning and statistics. 
Reading academic papers is often required to get a better understanding of recent work on the specific problem you&rsquo;re trying to solve.</p><p>Doing a PhD afforded me the luxury of spending about a month just reading papers. Most of the ~200 papers I read were on sentiment analysis, which gave me a good overview of what&rsquo;s been done in the field. However, the best thing I&rsquo;ve done was to stop reading and move on to working on the problem. This is also the best advice I can give: there&rsquo;s no better way to learn than getting your hands dirty working on a problem.</p><h3 id=step-three-get-your-hands-dirty>Step three: Get your hands dirty<a hidden class=anchor aria-hidden=true href=#step-three-get-your-hands-dirty>#</a></h3><p>With a well-defined problem and the knowledge gaps more-or-less closed, it is time to come up with a plan and implement it. Due to my background in software engineering and some exposure to <a href=https://en.wikipedia.org/wiki/Collaborative_filtering#Memory-based target=_blank rel=noopener>early collaborative filtering approaches to recommender systems</a>, my plan was very much a part of what Leo Breiman called the <a href=http://projecteuclid.org/euclid.ss/1009213726 title="Statistical Modeling: The Two Cultures" target=_blank rel=noopener>algorithmic modelling culture</a>. That is, I was more focused on developing algorithms that work than on modelling the process that generated the data. 
This approach is arguably more in line with the mindset that software engineers tend to have than with the approach of mathematicians and statisticians.</p><p>The plan was quite simple:</p><ul><li>Reproduce results that showed that rating inference models trained on enough texts by the <em>target author</em> (i.e., the author who wrote the text whose rating we want to predict) outperform models trained on texts by multiple authors</li><li>Use an approach inspired by collaborative filtering to combine multiple single-author models to infer ratings for texts by the target author, where those models are weighted by similarity to the target author</li><li>Experiment with multiple similarity measurements under various constraints on the number of texts available by the training and target authors</li><li>Iterate on these ideas until the results are publishable</li></ul><p>The rationale behind this plan was that while different people express their sentiments differently, similar people would express their sentiments similarly (e.g., use of understatements varies by culture). The key motivation was <a href=http://arxiv.org/pdf/cs/0506075.pdf target=_blank rel=noopener>Pang and Lee&rsquo;s finding</a> that a model trained on a single author is best if we have enough texts by this author.</p><p>The way I implemented the plan was vastly different from how I&rsquo;d do it today. This was 2009, and using Java with the <a href=http://www.cs.waikato.ac.nz/ml/weka/ target=_blank rel=noopener>Weka package</a> for the core modelling seemed like a huge improvement over the C/C++ I was used to. I relied heavily on the university grid to run experiments and wrote a bunch of code to handle experimental logic, including some Perl scripts for post-processing. It ended up being pretty messy, but it worked and I got publishable results. If I were to do the same work today, I&rsquo;d use Python for everything. 
<a href=http://ipython.org/notebook.html target=_blank rel=noopener>IPython Notebook</a> is a great way of keeping track of experimental work, and Python packages like pandas, scikit-learn, gensim, TextBlob, etc. are mature and easy to use for data science applications.</p><h3 id=step-four-publish-your-results>Step four: Publish your results<a hidden class=anchor aria-hidden=true href=#step-four-publish-your-results>#</a></h3><p>Having a deadline for publishing results can be stressful, but it has two positive outcomes. First, making your work public allows you to obtain valuable feedback. Second, hard deadlines are great in making you work towards a tangible goal. You can always keep iterating to get infinitesimal improvements, but publication deadlines force you to decide that you&rsquo;ve done enough.</p><p>In my case, the deadline for the <a href=http://www.um.org/ target=_blank rel=noopener>UMAP 2010 conference</a> and the promise of a free trip to Hawaii served as excellent motivators. But even if you don&rsquo;t have the time or energy to get an academic paper published, you should set yourself a deadline to publish something on a blog or a forum, or even as a report to a mentor who can assess your work. Receiving continuous feedback is a key factor in improvement, so <a href=https://en.wikipedia.org/wiki/Release_early%2C_release_often target=_blank rel=noopener>release early and release often</a>.</p><h3 id=step-five-improve-results-or-move-on>Step five: Improve results or move on<a hidden class=anchor aria-hidden=true href=#step-five-improve-results-or-move-on>#</a></h3><p>Congratulations! You have published the results of your study. What now? You can either keep working on the same problem – try more approaches, add more data, change the constraints, etc. Or you can move on to work on other problems that interest you.</p><p>In my case, I had to go back to iterate on the results of the first paper because of things I learned later. 
I ended up rerunning all the experiments to make things fit together into a more-or-less coherent story for the thesis (writing a thesis is one of the main overheads that comes with doing a PhD). If I had a choice, I wouldn&rsquo;t have done that. I would instead have pursued more sensible enhancements to the work presented in the paper, such as using the author as a feature, employing more robust ensemble methods, and testing different base methods than support vector machines. Nonetheless, I still think that the core idea – that the identity of authors should be taken into account in sentiment analysis – is still relevant and viable today. But I&rsquo;ve taken my own advice and moved on.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li><li><a href=https://yanirseroussi.com/tags/sentiment-analysis/>sentiment analysis</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on x" href="https://x.com/intent/tweet/?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&amp;hashtags=datascience%2cmachinelearning%2cpredictivemodelling%2csentimentanalysis%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 
436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&amp;title=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;summary=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&title=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path 
d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 
483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on whatsapp" href="https://api.whatsapp.com/send?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 
1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on telegram" href="https://telegram.me/share/url?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on ycombinator" href="https://news.ycombinator.com/submitlink?t=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" 
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="data science,machine learning,predictive modelling,sentiment analysis,software engineering"><meta name=description content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="First steps in data science: author-aware sentiment analysis"><meta property="og:description" content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal 
education program."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"><meta property="og:image" content="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-05-02T08:31:10+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg"><meta name=twitter:title content="First steps in data science: author-aware sentiment analysis"><meta name=twitter:description content="I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"First steps in data science: author-aware sentiment analysis","item":"https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"First steps in data science: author-aware sentiment analysis","name":"First steps in data science: author-aware sentiment analysis","description":"I became a data scientist by doing a PhD, but the same steps can be followed without a formal education program.","keywords":["data science","machine learning","predictive modelling","sentiment analysis","software engineering"],"articleBody":"People often ask me what’s the best way of becoming a data scientist. 
The way I got there was by first becoming a software engineer and then doing a PhD in what was essentially data science (before it became such a popular term). This post describes my first steps in the field with the goal of helping others who are interested in making the transition from pure software engineering to data science.\nWhile my first steps were in a PhD program, I don’t think that going through the formal PhD process is necessary if you wish to become a data scientist. Self-motivated individuals can get very far by making use of the abundance of learning resources available online. In fact, one can make progress much faster than in a PhD, because PhD programs have many overheads.\nThis post is organised as a list of steps. Despite the sequential numbering, many steps can be done in parallel. These steps roughly recount the work I’ve done to publish my first paper, which was co-authored by Ingrid Zukerman and Fabian Bohnert. Most of the technical details are intentionally omitted. Readers who are interested in learning more are invited to read the original paper or chapter 6 in my thesis, which includes more thorough experiments and explanations.\nStep one: Find a problem to work on Even if you know nothing about the machine learning and statistics side of data science, it’s important to find a problem to work on. Ideally it’d be something you find personally interesting, as this helps with motivation. You could use a predefined problem such as a Kaggle competition or one of the UCI datasets. Alternatively, you could collect the data yourself to make things a bit more challenging.\nIn my case, I was interested in natural language processing and user modelling. My supervisor was given a grant to work on sentiment analysis of opinion polls, which was my first direction of research. 
This quickly changed to focus on the connection between authors and the way they express their sentiments, with the application of harnessing this connection to improve the accuracy of sentiment analysis algorithms. For the purpose of this research, I collected a dataset of texts by the most prolific IMDb users. The problem was to infer the ratings these users assigned to their own reviews, with the hypothesis that methods that take author identity into account would outperform methods that ignore authorship information.\nStep two: Close your knowledge gaps Whatever problem you choose, you will have some knowledge gaps that require filling. Wikipedia, textbooks, and online courses will be your best guide for foundational areas like machine learning and statistics. Reading academic papers is often required to get a better understanding of recent work on the specific problem you’re trying to solve.\nDoing a PhD afforded me the luxury of spending about a month just reading papers. Most of the ~200 papers I read were on sentiment analysis, which gave me a good overview of what’s been done in the field. However, the best thing I’ve done was to stop reading and move on to working on the problem. This is also the best advice I can give: there’s no better way to learn than getting your hands dirty working on a problem.\nStep three: Get your hands dirty With a well-defined problem and the knowledge gaps more-or-less closed, it is time to come up with a plan and implement it. Due to my background in software engineering and some exposure to early collaborative filtering approaches to recommender systems, my plan was very much a part of what Leo Breiman called the algorithmic modelling culture. That is, I was more focused on developing algorithms that work than on modelling the process that generated the data. 
This approach is arguably more in line with the mindset that software engineers tend to have than with the approach of mathematicians and statisticians.\nThe plan was quite simple:\nReproduce results that showed that rating inference models trained on enough texts by the target author (i.e., the author who wrote the text whose rating we want to predict) outperform models trained on texts by multiple authors Use an approach inspired by collaborative filtering to combine multiple single-author models to infer ratings for texts by the target author, where those models are weighted by similarity to the target author Experiment with multiple similarity measurements under various constraints on the number of texts available by the training and target authors Iterate on these ideas until the results are publishable The rationale behind this plan was that while different people express their sentiments differently, similar people would express their sentiments similarly (e.g., use of understatements varies by culture). The key motivation was Pang and Lee’s finding that a model trained on a single author is best if we have enough texts by this author.\nThe way I implemented the plan was vastly different from how I’d do it today. This was 2009, and using Java with the Weka package for the core modelling seemed like a huge improvement over the C/C++ I was used to. I relied heavily on the university grid to run experiments and wrote a bunch of code to handle experimental logic, including some Perl scripts for post-processing. It ended up being pretty messy, but it worked and I got publishable results. If I were to do the same work today, I’d use Python for everything. IPython Notebook is a great way of keeping track of experimental work, and Python packages like pandas, scikit-learn, gensim, TextBlob, etc. 
are mature and easy to use for data science applications.\nStep four: Publish your results Having a deadline for publishing results can be stressful, but it has two positive outcomes. First, making your work public allows you to obtain valuable feedback. Second, hard deadlines are great in making you work towards a tangible goal. You can always keep iterating to get infinitesimal improvements, but publication deadlines force you to decide that you’ve done enough.\nIn my case, the deadline for the UMAP 2010 conference and the promise of a free trip to Hawaii served as excellent motivators. But even if you don’t have the time or energy to get an academic paper published, you should set yourself a deadline to publish something on a blog or a forum, or even as a report to a mentor who can assess your work. Receiving continuous feedback is a key factor in improvement, so release early and release often.\nStep five: Improve results or move on Congratulations! You have published the results of your study. What now? You can either keep working on the same problem – try more approaches, add more data, change the constraints, etc. Or you can move on to work on other problems that interest you.\nIn my case, I had to go back to iterate on the results of the first paper because of things I learned later. I ended up rerunning all the experiments to make things fit together into a more-or-less coherent story for the thesis (writing a thesis is one of the main overheads that comes with doing a PhD). If I had a choice, I wouldn’t have done that. I would instead have pursued more sensible enhancements to the work presented in the paper, such as using the author as a feature, employing more robust ensemble methods, and testing different base methods than support vector machines. Nonetheless, I still think that the core idea – that the identity of authors should be taken into account in sentiment analysis – is still relevant and viable today. 
But I’ve taken my own advice and moved on.\n","wordCount":"1274","inLanguage":"en","image":"https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg","datePublished":"2015-05-02T08:31:10Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">First steps in data science: author-aware sentiment analysis</h1><div class=post-meta><span title='2015-05-02 08:31:10 +0000 UTC'>May 2, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-05-02-first-steps-in-data-science-author-aware-sentiment-analysis/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps_hu71ce6d56294695860e76fe8bc29b8d4b_64845_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps_hu71ce6d56294695860e76fe8bc29b8d4b_64845_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg 635w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/kitten-first-steps.jpg alt width=635 height=220></figure><div class=post-content><p>People often ask me what&rsquo;s the best way of becoming a <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/ title="What is data science?" target=_blank rel=noopener>data scientist</a>. 
The way I got there was by first becoming a software engineer and then doing a PhD in what was essentially data science (before it became such a popular term). This post describes my first steps in the field with the goal of helping others who are interested in making the transition from pure software engineering to data science.</p><p>While my first steps were in a <a href=https://yanirseroussi.com/phd-work/ title="PhD Work" target=_blank rel=noopener>PhD program</a>, I don&rsquo;t think that going through the formal PhD process is necessary if you wish to become a data scientist. Self-motivated individuals can get very far by making use of the abundance of learning resources available online. In fact, one can make progress much faster than in a PhD, because PhD programs have many overheads.</p><p>This post is organised as a list of steps. Despite the sequential numbering, many steps can be done in parallel. These steps roughly recount the work I&rsquo;ve done to publish my first paper, which was co-authored by <a href=http://users.monash.edu/~ingrid/ target=_blank rel=noopener>Ingrid Zukerman</a> and <a href=https://sites.google.com/a/bohnert.eu/fabian-bohnert/ target=_blank rel=noopener>Fabian Bohnert</a>. Most of the technical details are intentionally omitted. 
Readers who are interested in learning more are invited to read the <a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiZukermanBohnert2010b.pdf title="Collaborative Inference of Sentiments from Texts" target=_blank rel=noopener>original paper</a> or chapter 6 in <a href=http://arrow.monash.edu.au/vital/access/services/Download/monash:89860/THESIS01 title="Text Mining and Rating Prediction with Topical User Models" target=_blank rel=noopener>my thesis</a>, which includes more thorough experiments and explanations.</p><h3 id=step-one-find-a-problem-to-work-on>Step one: Find a problem to work on<a hidden class=anchor aria-hidden=true href=#step-one-find-a-problem-to-work-on>#</a></h3><p>Even if you know nothing about the machine learning and statistics side of data science, it&rsquo;s important to find a problem to work on. Ideally it&rsquo;d be something you find personally interesting, as this helps with motivation. You could use a predefined problem such as a <a href=http://www.kaggle.com/competitions target=_blank rel=noopener>Kaggle competition</a> or one of the <a href=http://archive.ics.uci.edu/ml/datasets.html target=_blank rel=noopener>UCI datasets</a>. Alternatively, you could collect the data yourself to make things a bit more challenging.</p><p>In my case, I was interested in <a href=http://www.csse.monash.edu.au/research/umnl/ target=_blank rel=noopener>natural language processing and user modelling</a>. My supervisor was given a grant to work on <a href=https://en.wikipedia.org/wiki/Sentiment_analysis target=_blank rel=noopener>sentiment analysis</a> of opinion polls, which was my first direction of research. This quickly changed to focus on the connection between authors and the way they express their sentiments, with the application of harnessing this connection to improve the accuracy of sentiment analysis algorithms. 
For the purpose of this research, I collected a dataset of texts by the most prolific <a href=http://www.imdb.com/ target=_blank rel=noopener>IMDb</a> users. The problem was to infer the ratings these users assigned to their own reviews, with the hypothesis that methods that take author identity into account would outperform methods that ignore authorship information.</p><h3 id=step-two-close-your-knowledge-gaps>Step two: Close your knowledge gaps<a hidden class=anchor aria-hidden=true href=#step-two-close-your-knowledge-gaps>#</a></h3><p>Whatever problem you choose, you will have some knowledge gaps that require filling. Wikipedia, textbooks, and online courses will be your best guide for foundational areas like machine learning and statistics. Reading academic papers is often required to get a better understanding of recent work on the specific problem you&rsquo;re trying to solve.</p><p>Doing a PhD afforded me the luxury of spending about a month just reading papers. Most of the ~200 papers I read were on sentiment analysis, which gave me a good overview of what&rsquo;s been done in the field. However, the best thing I&rsquo;ve done was to stop reading and move on to working on the problem. This is also the best advice I can give: there&rsquo;s no better way to learn than getting your hands dirty working on a problem.</p><h3 id=step-three-get-your-hands-dirty>Step three: Get your hands dirty<a hidden class=anchor aria-hidden=true href=#step-three-get-your-hands-dirty>#</a></h3><p>With a well-defined problem and the knowledge gaps more-or-less closed, it is time to come up with a plan and implement it. 
Due to my background in software engineering and some exposure to <a href=https://en.wikipedia.org/wiki/Collaborative_filtering#Memory-based target=_blank rel=noopener>early collaborative filtering approaches to recommender systems</a>, my plan was very much a part of what Leo Breiman called the <a href=http://projecteuclid.org/euclid.ss/1009213726 title="Statistical Modeling: The Two Cultures" target=_blank rel=noopener>algorithmic modelling culture</a>. That is, I was more focused on developing algorithms that work than on modelling the process that generated the data. This approach is arguably more in line with the mindset that software engineers tend to have than with the approach of mathematicians and statisticians.</p><p>The plan was quite simple:</p><ul><li>Reproduce results that showed that rating inference models trained on enough texts by the <em>target author</em> (i.e., the author who wrote the text whose rating we want to predict) outperform models trained on texts by multiple authors</li><li>Use an approach inspired by collaborative filtering to combine multiple single-author models to infer ratings for texts by the target author, where those models are weighted by similarity to the target author</li><li>Experiment with multiple similarity measurements under various constraints on the number of texts available by the training and target authors</li><li>Iterate on these ideas until the results are publishable</li></ul><p>The rationale behind this plan was that while different people express their sentiments differently, similar people would express their sentiments similarly (e.g., use of understatements varies by culture). The key motivation was <a href=http://arxiv.org/pdf/cs/0506075.pdf target=_blank rel=noopener>Pang and Lee&rsquo;s finding</a> that a model trained on a single author is best if we have enough texts by this author.</p><p>The way I implemented the plan was vastly different from how I&rsquo;d do it today. 
This was 2009, and using Java with the <a href=http://www.cs.waikato.ac.nz/ml/weka/ target=_blank rel=noopener>Weka package</a> for the core modelling seemed like a huge improvement over the C/C++ I was used to. I relied heavily on the university grid to run experiments and wrote a bunch of code to handle experimental logic, including some Perl scripts for post-processing. It ended up being pretty messy, but it worked and I got publishable results. If I were to do the same work today, I&rsquo;d use Python for everything. <a href=http://ipython.org/notebook.html target=_blank rel=noopener>IPython Notebook</a> is a great way of keeping track of experimental work, and Python packages like pandas, scikit-learn, gensim, TextBlob, etc. are mature and easy to use for data science applications.</p><h3 id=step-four-publish-your-results>Step four: Publish your results<a hidden class=anchor aria-hidden=true href=#step-four-publish-your-results>#</a></h3><p>Having a deadline for publishing results can be stressful, but it has two positive outcomes. First, making your work public allows you to obtain valuable feedback. Second, hard deadlines are great in making you work towards a tangible goal. You can always keep iterating to get infinitesimal improvements, but publication deadlines force you to decide that you&rsquo;ve done enough.</p><p>In my case, the deadline for the <a href=http://www.um.org/ target=_blank rel=noopener>UMAP 2010 conference</a> and the promise of a free trip to Hawaii served as excellent motivators. But even if you don&rsquo;t have the time or energy to get an academic paper published, you should set yourself a deadline to publish something on a blog or a forum, or even as a report to a mentor who can assess your work. 
Receiving continuous feedback is a key factor in improvement, so <a href=https://en.wikipedia.org/wiki/Release_early%2C_release_often target=_blank rel=noopener>release early and release often</a>.</p><h3 id=step-five-improve-results-or-move-on>Step five: Improve results or move on<a hidden class=anchor aria-hidden=true href=#step-five-improve-results-or-move-on>#</a></h3><p>Congratulations! You have published the results of your study. What now? You can either keep working on the same problem – try more approaches, add more data, change the constraints, etc. Or you can move on to work on other problems that interest you.</p><p>In my case, I had to go back to iterate on the results of the first paper because of things I learned later. I ended up rerunning all the experiments to make things fit together into a more-or-less coherent story for the thesis (writing a thesis is one of the main overheads that comes with doing a PhD). If I had a choice, I wouldn&rsquo;t have done that. I would instead have pursued more sensible enhancements to the work presented in the paper, such as using the author as a feature, employing more robust ensemble methods, and testing different base methods than support vector machines. Nonetheless, I still think that the core idea – that the identity of authors should be taken into account in sentiment analysis – is still relevant and viable today. 
But I&rsquo;ve taken my own advice and moved on.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li><li><a href=https://yanirseroussi.com/tags/sentiment-analysis/>sentiment analysis</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on x" href="https://x.com/intent/tweet/?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&amp;hashtags=datascience%2cmachinelearning%2cpredictivemodelling%2csentimentanalysis%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&amp;title=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;summary=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f&title=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 
11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 
111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on whatsapp" href="https://api.whatsapp.com/send?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 
2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on telegram" href="https://telegram.me/share/url?text=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share First steps in data science: author-aware sentiment analysis on ycombinator" href="https://news.ycombinator.com/submitlink?t=First%20steps%20in%20data%20science%3a%20author-aware%20sentiment%20analysis&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f05%2f02%2ffirst-steps-in-data-science-author-aware-sentiment-analysis%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 
87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-370><div class=comment-header><a href=#comment-370><img class=comment-avatar src="https://www.gravatar.com/avatar/600eae285b7bf6c93fca8c0bf155589c?s=50"><p class=comment-info><strong>Robert Klein (@PaperbackLegacy)</strong><br><small>2015-05-18 18:34:00</small></p></a></div><div class="comment-body post-content">Thanks for the stimulation. I’m still fascinated by the lure to extract sentiment from text, but it seems like so often the sentiment that the author intended never fully came to expression in the text. Maybe an interdisciplinary approach will be required to teach machines to parse the intentions implicit in text, and, like other media phenomena, a loop will have to form: perhaps the awareness that explicit intentions and sentiment are of benefit to authors in a world that (one day) automates the sorting of all its documents will cause writing styles to adapt. The effect of the best we can on what we’re doing now is one of those things you begin to see a pattern in. Here&rsquo;s an API that correlates patterns of unstructured info: dev.keywordmeme.com Would love your feedback. Let me know if it’s useful to you or if you have any comments. Well done on the carbon post, btw. Glad I found your blog.</div></div><div class=comment-level-1 id=comment-385><div class=comment-header><a href=#comment-385><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2015-05-22 08:49:02</small></p></a></div><div class="comment-body post-content"><p>Thank you for the comment! I agree that analysing sentiment is very tricky due to the fact that people often don&rsquo;t express themselves so well. 
If I remember correctly, inter-annotator agreement on some sentiment analysis tasks is only 70-80%, so it&rsquo;s unlikely that we will ever have perfect performance by machines.</p><p>dev.keywordmeme.com redirects to a github page &ndash; where is the API?</p></div></div><div class=comment-level-2 id=comment-395><div class=comment-header><a href=#comment-395><img class=comment-avatar src="https://www.gravatar.com/avatar/600eae285b7bf6c93fca8c0bf155589c?s=50"><p class=comment-info><strong>Robert Klein (@PaperbackLegacy)</strong><br><small>2015-05-26 19:43:10</small></p></a></div><div class="comment-body post-content">You bet. I&rsquo;m fascinated to see what seems to be a real live push toward an interdisciplinary approach. That 70-80% performance might be pushed over the hump by humans with special training until such a time as the process can be formalized. It looks like auditing ML-driven processes could be a new category of employment through this next technological plateau. The human-machine relationship in a friendly old configuration!
 Sorry about the link. This should work: <a href=http://www.keywordmeme.com/ target=_blank rel=noopener>http://www.keywordmeme.com/</a>. It makes you register, just a heads up. Hit the engineers up on github if you have any questions or if things aren&rsquo;t working. Which is possible. Take care! :)</div></div><div class=comment-level-0 id=comment-635><div class=comment-header><a href=#comment-635><img class=comment-avatar src="https://www.gravatar.com/avatar/8830e28e12d6e04e41e17809bf7eb644?s=50"><p class=comment-info><strong>A</strong><br><small>2015-09-04 07:31:28</small></p></a></div><div class="comment-body post-content"><p>Hi Yanir</p><p>Thank you very much for this post. Helpul for somebody like me seeking to be a data scientist.</p><p>I&rsquo;m a software engineer, currently master data architect.</p><p>I&rsquo;m taking MOOCs in order to fill the gaps, so let&rsquo;s say I&rsquo;m on a good track :)</p><p>However, once problem found and hands got dirty, how to find a mentor ? afterwards, get published ?</p><p>I think this would be hard via academic</p></div></div><div class=comment-level-1 id=comment-636><div class=comment-header><a href=#comment-636><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2015-09-04 08:46:51</small></p></a></div><div class="comment-body post-content"><p>Finding a mentor depends on where you are. Good places to start would be your current workplace (if you work with data scientists), or local meetups (if there are any in your area). Another option would be to contribute to open source projects in the field as a way of getting to know people and getting feedback. 
Finally, there are courses like <a href=https://www.thinkful.com/courses/learn-data-science-online/ target=_blank rel=nofollow>the one by Thinkful</a>, where you can pay to be mentored.</p><p>Regarding getting published, I agree that it&rsquo;d be hard to get published in many academic venues without help from people who know how it&rsquo;s done. However, you can always start your own blog and link to it from places like Reddit and DataTau. Even if you don&rsquo;t get any feedback, publishing often forces you to think more deeply about the subject of your article.</p></div></div><div class=comment-level-2 id=comment-639><div class=comment-header><a href=#comment-639><img class=comment-avatar src="https://www.gravatar.com/avatar/8830e28e12d6e04e41e17809bf7eb644?s=50"><p class=comment-info><strong>Amine</strong><br><small>2015-09-04 11:18:10</small></p></a></div><div class="comment-body post-content"><p>At the workplace, it will be a bit hard.</p><p>I Live in Paris, meetups would be a good option.</p><p>You&rsquo;re right, publishing forces to think more deeply, feedbacks from readers are also good way to learn.</p></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
diff --git a/2015/06/06/hopping-on-the-deep-learning-bandwagon/index.html b/2015/06/06/hopping-on-the-deep-learning-bandwagon/index.html
index 166be7611..dc68338bf 100644
--- a/2015/06/06/hopping-on-the-deep-learning-bandwagon/index.html
+++ b/2015/06/06/hopping-on-the-deep-learning-bandwagon/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Hopping on the deep learning bandwagon | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Bandcamp,data science,deep learning,machine learning,predictive modelling"><meta name=description content="To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Hopping on the deep learning bandwagon"><meta property="og:description" content="To become proficient at solving data science problems, you need to get your hands dirty. 
Here, I used album cover classification to learn about deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"><meta property="og:image" content="https://yanirseroussi.com/bandcamp-album-covers-by-genre-shuffled.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-06-06T05:00:22+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bandcamp-album-covers-by-genre-shuffled.png"><meta name=twitter:title content="Hopping on the deep learning bandwagon"><meta name=twitter:description content="To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Hopping on the deep learning bandwagon","item":"https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Hopping on the deep learning bandwagon","name":"Hopping on the deep learning bandwagon","description":"To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning.","keywords":["Bandcamp","data science","deep learning","machine learning","predictive modelling"],"articleBody":"I’ve been meaning to get into deep learning for the last few years. 
Now, the stars have finally aligned and I have the time and motivation to work on a small project that will hopefully improve my understanding of the field. This is the first in a series of posts that will document my progress on this project.\nAs mentioned in a previous post on getting started as a data scientist, I believe that the best way of becoming proficient at solving data science problems is by getting your hands dirty. Despite being familiar with high-level terminology and having some understanding of how it all works, I don’t have any practical experience applying deep learning. The purpose of this project is to fix this experience gap by working on a real problem.\nThe problem: Inferring genre from album covers Deep learning has been very successful at image classification. Therefore, it makes sense to work on an image classification problem for this project. Rather than using an existing dataset, I decided to make things a bit more interesting by building my own dataset. Over the last year, I’ve been running BCRecommender – a recommendation system for Bandcamp music. I’ve noticed that album covers vary by genre, though it’s hard to quantify exactly how they vary. So the question I’ll be trying to answer with this project is how accurately can genre be inferred from Bandcamp album covers?\nAs the goal of this project is to learn about deep learning rather than make a novel contribution, I didn’t do a comprehensive search to see whether this problem has been addressed before. However, I did find a recent post by Alexandre Passant that describes his use of Clarifai’s API to tag the content of Spotify album covers (identifying elements such as men, night, dark, etc.), and then using these tags to infer the album’s genre. Another related project is Karayev et al.’s Recognizing image style paper, in which the authors classified datasets of images from Flickr and Wikipedia by style and art genre, respectively. 
In all these cases, the results are pretty good, supporting my intuition that the genre inference task is feasible.\nData collection \u0026 splits As I’ve already been crawling Bandcamp data for BCRecommender, creating the dataset was relatively straightforward. Currently, I have data on about 1.8 million tracks and albums. Bandcamp artists assign multiple tags to each release. To create the dataset, I selected 10 of the top tags: ambient, dubstep, folk, hiphop_rap, jazz, metal, pop, punk, rock, and soul. Then, I randomly selected 10,000 album covers that have exactly one of those tags, with 1,000 albums for each tag/genre. Each cover image size is 350×350. The following image shows a sample of the dataset.\nIt is apparent that some genres can be inferred more easily than others, especially when browsing through the full dataset. For example, metal albums tend to be pretty distinct. I doubt that predictive accuracy would be very high, but I think that it can definitely be much better than the random baseline of 10%.\nFor training, validation and testing I decided to use a static stratified 80%/10%/10% split of the dataset. It quickly became apparent that the full dataset is too big for development purposes, making it hard to quickly test code on my local machine. To address this, I created a local development dataset, using an 80%/10%/10% split of 1,000 images from the full training subset.\nThe code for downloading the dataset and creating the splits is available from the project repository on GitHub. This repository will include all the code for the project as it evolves. I will try to keep it well-documented enough to be useful for others, though it assumes some familiarity with Python. If you experience any issues running the code or find any bugs, please let me know.\nGetting started One of the things that has stopped me from playing with deep learning in the past is the feeling that there is a bit of a steep learning curve around the tools and methods. 
A lot of the deep learning libraries out there don’t seem as mature as general machine learning libraries, such as scikit-learn. There are also many more parameters to play with when building deep neural networks than when using linear models or algorithms such as random forests. Further, to enable any kind of meaningful experimentation, using a GPU is essential.\nFortunately, the tools and documentation have matured a lot in recent years. Motivated by Daniel Nouri’s excellent tutorial on detecting facial keypoints with convolutional neural nets, I decided to use the Lasagne package as my starting point. My plan was simple: Convert the MNIST example code to work on my dataset locally, setup an AWS machine with a GPU for full-scale experiments, and then play with various network architectures and techniques to improve accuracy and gain a deeper understanding of deep learning.\nInitial environment setup While Lasagne’s MNIST example code is pretty clear – especially once you get your head around the way Theano works – it doesn’t really lend itself to easy experimentation. I addressed this by refactoring the code in several iterations, until I got to the current state, where there’s a simple command-line interface that allows me to experiment with different datasets and architectures. This will probably change and become more complex as I start doing more sophisticated things.\nTo enable rapid experimentation, I had to set up an AWS machine with a GPU (g2.2xlarge instance). I wrote some simple deployment code using Fabric, which allows me to setup a machine from scratch, install all the requirements, package the project, and copy it to the remote machine.\nGetting the code running on the CPU was trivial, but I hit several issues when running on the GPU. First, the vanilla Ubuntu 14.04 server I used didn’t come with CUDA installed. 
After trying and failing to get it working by following some tutorials, I ended up going down the easier path of using the AMI supplied by Caffe. This AMI also has the advantage of coming with Caffe installed (surprisingly), which I may end up using at some point.\nThe second issue I encountered was that using the GPU to run Lasagne’s enhanced example code on my full dataset was impossible due to memory constraints. The problem was that the example assumes that the entire dataset can fit in the GPU’s memory (as discussed here and here). This took a while to resolve, even though the solution is conceptually simple – just copy the dataset to the GPU in chunks rather than attempt to copy it all in one go. Resolving this issue was a good way of getting a better understanding of what the code does, since I ended up rewriting most of the original example code.\nNext steps So far, I left the network architecture from the original example mostly untouched, as I was busy collecting the dataset, getting the environment set up, and resolving various issues. One thing I did notice was that the example’s architecture diverges on my dataset, so instead I tested my code using a basic multi-layer perceptron architecture with a single hidden layer. This performs about as well as a random classifier on my dataset, but at least it converges. I also tested the modified code on the MNIST dataset and the results are decent, so now it is time to move forward and actually do some modelling, starting with convolutional neural nets.\nThe high level plan is to iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I’m satisfied with the results. The main goal remains to learn as much as possible and get a good intuition of how things work. I’ll write more about my experiences in subsequent posts. 
Stay tuned!\nUpdate: The second post in the series is now available.\n","wordCount":"1311","inLanguage":"en","image":"https://yanirseroussi.com/bandcamp-album-covers-by-genre-shuffled.png","datePublished":"2015-06-06T05:00:22Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" 
x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Hopping on the deep learning bandwagon</h1><div class=post-meta><span title='2015-06-06 05:00:22 +0000 UTC'>June 6, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-06-06-hopping-on-the-deep-learning-bandwagon/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png 748w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png alt width=748 height=128></figure><div class=post-content><p>I&rsquo;ve been meaning to get into <a href=https://en.wikipedia.org/wiki/Deep_learning target=_blank rel=noopener>deep learning</a> for the last few years. 
Now, the stars having finally aligned and I have the time and motivation to work on a small project that will hopefully improve my understanding of the field. This is the first in a series of posts that will document my progress on this project.</p><p>As mentioned in a <a href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/>previous post on getting started as a data scientist</a>, I believe that the best way of becoming proficient at solving data science problems is by getting your hands dirty. Despite being familiar with high-level terminology and having some understanding of how it all works, I don&rsquo;t have any practical experience applying deep learning. The purpose of this project is to fix this experience gap by working on a real problem.</p><h3 id=the-problem-inferring-genre-from-album-covers>The problem: Inferring genre from album covers<a hidden class=anchor aria-hidden=true href=#the-problem-inferring-genre-from-album-covers>#</a></h3><p>Deep learning has been very successful at image classification. Therefore, it makes sense to work on an image classification problem for this project. Rather than using an existing dataset, I decided to make things a bit more interesting by building my own dataset. Over the last year, I&rsquo;ve been running <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – a recommendation system for Bandcamp music</a>. I&rsquo;ve noticed that album covers vary by genre, though it&rsquo;s hard to quantify exactly <em>how</em> they vary. So the question I&rsquo;ll be trying to answer with this project is <em>how accurately can genre be inferred from Bandcamp album covers?</em></p><p>As the goal of this project is to learn about deep learning rather than make a novel contribution, I didn&rsquo;t do a comprehensive search to see whether this problem has been addressed before. 
However, I did find <a href=http://apassant.net/2015/05/14/album-covers-music-deep-learning/ target=_blank rel=noopener>a recent post by Alexandre Passant</a> that describes his use of Clarifai&rsquo;s API to tag the content of Spotify album covers (identifying elements such as men, night, dark, etc.), and then using these tags to infer the album&rsquo;s genre. Another related project is <a href=http://sergeykarayev.com/files/1311.3715v3.pdf target=_blank rel=noopener>Karayev et al.&rsquo;s <em>Recognizing image style</em> paper</a>, in which the authors classified datasets of images from Flickr and Wikipedia by style and art genre, respectively. In all these cases, the results are pretty good, supporting my intuition that the genre inference task is feasible.</p><h3 id=data-collection--splits>Data collection & splits<a hidden class=anchor aria-hidden=true href=#data-collection--splits>#</a></h3><p>As I&rsquo;ve already been crawling Bandcamp data for BCRecommender, creating the dataset was relatively straightforward. Currently, I have data on about 1.8 million tracks and albums. Bandcamp artists assign multiple tags to each release. To create the dataset, I selected 10 of the top tags: <em>ambient, dubstep, folk, hiphop_rap, jazz, metal, pop, punk, rock,</em> and <em>soul</em>. Then, I randomly selected 10,000 album covers that have exactly one of those tags, with 1,000 albums for each tag/genre. Each cover image size is 350×350. The following image shows a sample of the dataset.</p><figure><a href=bandcamp-album-covers-by-genre.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="Bandcamp,data science,deep learning,machine learning,predictive modelling"><meta name=description content="To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Hopping on the deep learning bandwagon"><meta property="og:description" content="To become proficient at solving data science problems, you need to get your hands dirty. 
Here, I used album cover classification to learn about deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"><meta property="og:image" content="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-06-06T05:00:22+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png"><meta name=twitter:title content="Hopping on the deep learning bandwagon"><meta name=twitter:description content="To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Hopping on the deep learning bandwagon","item":"https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Hopping on the deep learning bandwagon","name":"Hopping on the deep learning bandwagon","description":"To become proficient at solving data science problems, you need to get your hands dirty. Here, I used album cover classification to learn about deep learning.","keywords":["Bandcamp","data science","deep learning","machine learning","predictive modelling"],"articleBody":"I’ve been meaning to get into deep learning for the last few years. 
Now, the stars having finally aligned and I have the time and motivation to work on a small project that will hopefully improve my understanding of the field. This is the first in a series of posts that will document my progress on this project.\nAs mentioned in a previous post on getting started as a data scientist, I believe that the best way of becoming proficient at solving data science problems is by getting your hands dirty. Despite being familiar with high-level terminology and having some understanding of how it all works, I don’t have any practical experience applying deep learning. The purpose of this project is to fix this experience gap by working on a real problem.\nThe problem: Inferring genre from album covers Deep learning has been very successful at image classification. Therefore, it makes sense to work on an image classification problem for this project. Rather than using an existing dataset, I decided to make things a bit more interesting by building my own dataset. Over the last year, I’ve been running BCRecommender – a recommendation system for Bandcamp music. I’ve noticed that album covers vary by genre, though it’s hard to quantify exactly how they vary. So the question I’ll be trying to answer with this project is how accurately can genre be inferred from Bandcamp album covers?\nAs the goal of this project is to learn about deep learning rather than make a novel contribution, I didn’t do a comprehensive search to see whether this problem has been addressed before. However, I did find a recent post by Alexandre Passant that describes his use of Clarifai’s API to tag the content of Spotify album covers (identifying elements such as men, night, dark, etc.), and then using these tags to infer the album’s genre. Another related project is Karayev et al.’s Recognizing image style paper, in which the authors classified datasets of images from Flickr and Wikipedia by style and art genre, respectively. 
In all these cases, the results are pretty good, supporting my intuition that the genre inference task is feasible.\nData collection \u0026 splits As I’ve already been crawling Bandcamp data for BCRecommender, creating the dataset was relatively straightforward. Currently, I have data on about 1.8 million tracks and albums. Bandcamp artists assign multiple tags to each release. To create the dataset, I selected 10 of the top tags: ambient, dubstep, folk, hiphop_rap, jazz, metal, pop, punk, rock, and soul. Then, I randomly selected 10,000 album covers that have exactly one of those tags, with 1,000 albums for each tag/genre. Each cover image size is 350×350. The following image shows a sample of the dataset.\nIt is apparent that some genres can be inferred more easily than others, especially when browsing through the full dataset. For example, metal albums tend to be pretty distinct. I doubt that predictive accuracy would be very high, but I think that it can definitely be much better than the random baseline of 10%.\nFor training, validation and testing I decided to use a static stratified 80%/10%/10% split of the dataset. It quickly became apparently that the full dataset is too big for development purposes, making it hard to quickly test code on my local machine. To address this, I created a local development dataset, using an 80%/10%/10% split of 1,000 images from the full training subset.\nThe code for downloading the dataset and creating the splits is available from the project repository on GitHub. This repository will include all the code for the project as it evolves. I will try to keep it well-documented enough to be useful for others, though it assumes some familiarity with Python. If you experience any issues running the code or find any bugs, please let me know.\nGetting started One of the things that has stopped me from playing with deep learning in the past is the feeling that there is a bit of a steep learning curve around the tools and methods. 
A lot of the deep learning libraries out there don’t seem as mature as general machine learning libraries, such as scikit-learn. There are also many more parameters to play with when building deep neural networks than when using linear models or algorithms such as random forests. Further, to enable any kind of meaningful experimentation, using a GPU is essential.\nFortunately, the tools and documentation have matured a lot in recent years. Motivated by Daniel Nouri’s excellent tutorial on detecting facial keypoints with convolutional neural nets, I decided to use the Lasagne package as my starting point. My plan was simple: Convert the MNIST example code to work on my dataset locally, setup an AWS machine with a GPU for full-scale experiments, and then play with various network architectures and techniques to improve accuracy and gain a deeper understanding of deep learning.\nInitial environment setup While Lasagne’s MNIST example code is pretty clear – especially once you get your head around the way Theano works – it doesn’t really lend itself to easy experimentation. I addressed this by refactoring the code in several iterations, until I got to the current state, where there’s a simple command-line interface that allows me to experiment with different datasets and architectures. This will probably change and become more complex as I start doing more sophisticated things.\nTo enable rapid experimentation, I had to set up an AWS machine with a GPU (g2.2xlarge instance). I wrote some simple deployment code using Fabric, which allows me to setup a machine from scratch, install all the requirements, package the project, and copy it to the remote machine.\nGetting the code running on the CPU was trivial, but I hit several issues when running on the GPU. First, the vanilla Ubuntu 14.04 server I used didn’t come with CUDA installed. 
After trying and failing to get it working by following some tutorials, I ended up going down the easier path of using the AMI supplied by Caffe. This AMI also has the advantage of coming with Caffe installed (surprisingly), which I may end up using at some point.\nThe second issue I encountered was that using the GPU to run Lasagne’s enhanced example code on my full dataset was impossible due to memory constraints. The problem was that the example assumes that the entire dataset can fit in the GPU’s memory (as discussed here and here). This took a while to resolve, even though the solution is conceptually simple – just copy the dataset to the GPU in chunks rather than attempt to copy it all in one go. Resolving this issue was a good way of getting a better understanding of what the code does, since I ended up rewriting most of the original example code.\nNext steps So far, I left the network architecture from the original example mostly untouched, as I was busy collecting the dataset, getting the environment set up, and resolving various issues. One thing I did notice was that the example’s architecture diverges on my dataset, so instead I tested my code using a basic multi-layer perceptron architecture with a single hidden layer. This performs about as well as a random classifier on my dataset, but at least it converges. I also tested the modified code on the MNIST dataset and the results are decent, so now it is time to move forward and actually do some modelling, starting with convolutional neural nets.\nThe high level plan is to iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I’m satisfied with the results. The main goal remains to learn as much as possible and get a good intuition of how things work. I’ll write more about my experiences in subsequent posts. 
Stay tuned!\nUpdate: The second post in the series is now available.\n","wordCount":"1311","inLanguage":"en","image":"https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png","datePublished":"2015-06-06T05:00:22Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" 
x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Hopping on the deep learning bandwagon</h1><div class=post-meta><span title='2015-06-06 05:00:22 +0000 UTC'>June 6, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-06-06-hopping-on-the-deep-learning-bandwagon/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled_hu7dd57cb220c55d3023581cbc705ac82b_182096_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png 748w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre-shuffled.png alt width=748 height=128></figure><div class=post-content><p>I&rsquo;ve been meaning to get into <a href=https://en.wikipedia.org/wiki/Deep_learning target=_blank rel=noopener>deep learning</a> for the last few years. 
Now, the stars having finally aligned and I have the time and motivation to work on a small project that will hopefully improve my understanding of the field. This is the first in a series of posts that will document my progress on this project.</p><p>As mentioned in a <a href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/>previous post on getting started as a data scientist</a>, I believe that the best way of becoming proficient at solving data science problems is by getting your hands dirty. Despite being familiar with high-level terminology and having some understanding of how it all works, I don&rsquo;t have any practical experience applying deep learning. The purpose of this project is to fix this experience gap by working on a real problem.</p><h3 id=the-problem-inferring-genre-from-album-covers>The problem: Inferring genre from album covers<a hidden class=anchor aria-hidden=true href=#the-problem-inferring-genre-from-album-covers>#</a></h3><p>Deep learning has been very successful at image classification. Therefore, it makes sense to work on an image classification problem for this project. Rather than using an existing dataset, I decided to make things a bit more interesting by building my own dataset. Over the last year, I&rsquo;ve been running <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender – a recommendation system for Bandcamp music</a>. I&rsquo;ve noticed that album covers vary by genre, though it&rsquo;s hard to quantify exactly <em>how</em> they vary. So the question I&rsquo;ll be trying to answer with this project is <em>how accurately can genre be inferred from Bandcamp album covers?</em></p><p>As the goal of this project is to learn about deep learning rather than make a novel contribution, I didn&rsquo;t do a comprehensive search to see whether this problem has been addressed before. 
However, I did find <a href=http://apassant.net/2015/05/14/album-covers-music-deep-learning/ target=_blank rel=noopener>a recent post by Alexandre Passant</a> that describes his use of Clarifai&rsquo;s API to tag the content of Spotify album covers (identifying elements such as men, night, dark, etc.), and then using these tags to infer the album&rsquo;s genre. Another related project is <a href=http://sergeykarayev.com/files/1311.3715v3.pdf target=_blank rel=noopener>Karayev et al.&rsquo;s <em>Recognizing image style</em> paper</a>, in which the authors classified datasets of images from Flickr and Wikipedia by style and art genre, respectively. In all these cases, the results are pretty good, supporting my intuition that the genre inference task is feasible.</p><h3 id=data-collection--splits>Data collection & splits<a hidden class=anchor aria-hidden=true href=#data-collection--splits>#</a></h3><p>As I&rsquo;ve already been crawling Bandcamp data for BCRecommender, creating the dataset was relatively straightforward. Currently, I have data on about 1.8 million tracks and albums. Bandcamp artists assign multiple tags to each release. To create the dataset, I selected 10 of the top tags: <em>ambient, dubstep, folk, hiphop_rap, jazz, metal, pop, punk, rock,</em> and <em>soul</em>. Then, I randomly selected 10,000 album covers that have exactly one of those tags, with 1,000 albums for each tag/genre. Each cover image size is 350×350. The following image shows a sample of the dataset.</p><figure><a href=bandcamp-album-covers-by-genre.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre_hu98267f967d1b66bf7a519f3fa620b70e_1042875_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre_hu98267f967d1b66bf7a519f3fa620b70e_1042875_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/bandcamp-album-covers-by-genre_hu98267f967d1b66bf7a519f3fa620b70e_1042875_720x0_resize_box_3.png 720w,
diff --git a/2015/07/06/learning-about-deep-learning-through-album-cover-classification/index.html b/2015/07/06/learning-about-deep-learning-through-album-cover-classification/index.html
index 236922f3f..b037a0740 100644
--- a/2015/07/06/learning-about-deep-learning-through-album-cover-classification/index.html
+++ b/2015/07/06/learning-about-deep-learning-through-album-cover-classification/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Learning about deep learning through album cover classification | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,deep learning,machine learning,predictive modelling"><meta name=description content="Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Learning about deep learning through album cover classification"><meta property="og:description" content="Progress on my album cover classification project, highlighting lessons that would 
be useful to others who are getting started with deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"><meta property="og:image" content="https://yanirseroussi.com/bandcamp-album-covers-by-genre.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-07-06T22:21:42+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bandcamp-album-covers-by-genre.png"><meta name=twitter:title content="Learning about deep learning through album cover classification"><meta name=twitter:description content="Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Learning about deep learning through album cover classification","item":"https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Learning about deep learning through album cover classification","name":"Learning about deep learning through album cover classification","description":"Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning.","keywords":["data science","deep learning","machine learning","predictive modelling"],"articleBody":"In the past month, I’ve spent some time on my album cover classification project. 
The goal of this project is for me to learn about deep learning by working on an actual problem. This post covers my progress so far, highlighting lessons that would be useful to others who are getting started with deep learning.\nInitial steps summary The following points were discussed in detail in the previous post on this project.\nThe problem I chose to work on is classifying Bandcamp album covers by genre, using a balanced dataset of 10,000 images from 10 different genres. The experimental code is based on Lasagne, and is available on GitHub. Having set up the environment for running experiments on a GPU, the plan was to get Lasagne’s examples working on my dataset, and then iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I’m satisfied with the results. Preliminary experiments and learning resources I hit several issues when adapting Lasagne’s example code to my dataset. The key issue is that the example code is based on the MNIST digits dataset. That dataset’s images are 28×28 grayscale, and my dataset’s images are 350×350 RGB. This difference led to the training loss quickly diverging when running the example code without any changes. It turns out that simply lowering the learning rate resolves this issue, though the initial results I got were still not much better than random. In general, it appears that everything works on the MNIST digits dataset, so choosing to work on my own dataset made things more challenging (which is a good thing).\nThe main learning resource I used is the excellent notes for the Stanford course Convolutional Neural Networks for Visual Recognition. The notes are very clear, contain up-to-date information from recent publications, and include many practical tips for successful training of convolutional networks (convnets). In addition, I read some other tutorials and a few papers. 
These are summarised in a separate page.\nThe first step after getting the MNIST examples working on my dataset was to extend the code to enable more flexible architectures. My main focus was on vanilla convnets, i.e., networks with several convolutional layers, where each convolutional layer is optionally followed by a max-pooling layer, and the convolutional layers are followed by multiple dense/fully-connected layers and dropout layers. To allow for easy experimentation, the specification of the network can be done from the command line. For example, to train an AlexNet architecture:\n$ python manage.py run_experiment \\ --dataset-path /path/to/dataset \\ --model-architecture ConvNet \\ --model-params num_conv_layers=5:num_dense_layers=2:lc0_num_filters=48:lc0_filter_size=11:lc0_stride=4:lc0_mp=True:lm0_pool_size=3:lm0_stride=2:lc1_num_filters=128:lc1_filter_size=5:lc1_mp=True:lm1_pool_size=3:lm1_stride=2:lc2_num_filters=192:lc2_filter_size=3:lc3_num_filters=192:lc3_filter_size=3:lc4_num_filters=128:lc4_filter_size=3:lc4_mp=True:lm4_pool_size=3:lm4_stride=2:ld0_num_units=2048:ld1_num_units=2048 This can obviously be a bit of a mouthful, so common architectures are also defined in the code with parameters that can be overridden. For instance, to train an AlexNet with 64 filters in the first layer instead of 48:\n$ python manage.py run_experiment \\ --dataset-path /path/to/dataset \\ --model-architecture AlexNet \\ --model-params lc0_num_filters=64 There are many more command line flags (possibly too many), which make it easy to both tinker with various settings, and also run more rigorous experiments. My initial tinkering with convnets didn’t yield impressive results in terms of predictive accuracy on my dataset. 
It turned out that this was partly due to the lack of preprocessing – the less exciting but crucial part of any predictive modelling work.\nThe importance of preprocessing My initial focus was on getting things to work on the dataset without worrying too much about preprocessing. I haven’t done any image classification work in the past, so I had to learn about the right type of preprocessing to use. I kept it pretty simple and applied the following transformations:\nDownsampling: all images were scaled down to 256×256. I played briefly with other sizes, but decided on this size to make it easy to use models pretrained on ImageNet. Cropping \u0026 mirroring: during training time, each image was cropped to random 224×224 slices. Deterministic slices were used in test time. In addition, each crop was mirrored horizontally. In most cases I used ten overall crops. Again, these numbers were chosen for comparability with ImageNet-trained models. Mean subtraction: the training mean of each pixel was subtracted from each instance. Shuffling: probably the most important preprocessing step. Initially I had the instances sorted by their class, as an artifact of the way the dataset was constructed. Due to the relatively small number of instances the network sees in each batch, this meant that in each epoch, the network first fitted on all the instances from class 1, then all the instances from class 2, etc. This led to very poor performance, which was fixed by shuffling the data once at the start of the training procedure (shuffling every epoch could potentially make things even better). Baselines After building the experimental environment and a fair bit of tinkering, I decided it was time for some more serious experiments. The results of my initial games were rather disappointing – slightly better than a random baseline, which yields an accuracy score of 10%. 
Therefore, I ran some baselines to get an idea of what’s possible on this dataset.\nThe first baseline I tried was a random forest with 1,000 trees, which yielded 15.25% accuracy. This baseline was trained directly on the pixel values without any preprocessing other than downsampling. It’s worth noting that the downsampling size didn’t make much of a difference to this baseline (I tried a few values in the range 50×50-350×350). This baseline was also not particularly sensitive to whether RGB or grayscale values were used to represent the images.\nThe next experiments were with baselines that utilised pretrained Caffe models. Training a random forest with 1,000 trees on features extracted from the highest fully-connected layer (fc7) in the CaffeNet and VGGNet-19 models yielded accuracies of 16.72% and 16.40% respectively. This was pretty disappointing, as I expected these features to perform much better. The reason may be that album covers are very different from ImageNet images, and the representations in fc7 are too specific to ImageNet. Indeed, when fine-tuning the CaffeNet model (following the procedure outlined here), I got the best accuracy on the dataset: 22.60%. Using Caffe to train the same network from scratch didn’t even get close to this accuracy. However, I didn’t try to tune Caffe’s learning parameters. Instead, I went back to running experiments with my code.\nIt’s worth noting that the classes identified by the CaffeNet model often have little to do with the actual content of the image. Better baseline results may be obtained by using models that were pretrained on a richer dataset than ImageNet. The following table presents three example covers together with the top-five classes identified by the CaffeNet model for each image. The tags assigned by Clarifai’s API are also presented for comparison. 
From this example, it looks like Clarifai’s model is more successful at identifying the correct elements than the CaffeNet model, indicating that a baseline that uses the Clarifai tags may yield competitive performance.\nAlbum CaffeNet Clarifai October by Wille P\nhiphop_rap digital clock, spotlight, jack-o’-lantern, volcano, traffic light tree, landscape, sunset, desert, sun, sunrise, nature, evening, sky, travel Demo by Blackrat\nmetal spider web, barn spider, chain, bubble, fountain skull, bone, nobody, death, vector, help, horror, medicine, black and white, tattoo The Kool-Aid Album by Mr. Merge\nsoul dishrag, paper towel, honeycomb, envelope, chain mail symbol, nobody, sign, illustration, color, flag, text, stripes, business, character Training from scratch My initial experiments were with various convnet architectures, where I manually varied the filter sizes and number of layers to have a reasonable number of parameters and ensure that the model is trainable on a GPU with 4GB of memory. As mentioned, this approach yielded unimpressive results. Following the relative success of the fine-tuned CaffeNet baseline, I decided to run more rigorous experiments on variants of AlexNet (which is very similar to CaffeNet).\nGiven the large number of hyperparameters that need to be set when training deep convnets, I realised that setting values manually or via grid search is unlikely to yield the best results. To address this, I used hyperopt to search for the best configuration of values. 
The hyperparameters that were included in the search were the learning method (Nesterov momentum versus Adam with their respective parameters), the learning rate, whether crops are mirrored or not, the number of crops to use (1 or 5), dropout probabilities, the number of hidden units in the fully-connected layers, and the number of filters in each convolutional layer.\nEach configuration suggested by hyperopt was trained for 10 epochs, and the promising setups were trained until results stopped improving. The results of the search were rather disappointing, with the best accuracy being 17.19%. However, I learned a lot by finding hyperparameters in this manner – in the past I’ve only used a combination of manual settings with grid search.\nThere are many possible reasons for why the results are so poor. It could be that there’s just too little data to train a good classifier, which is supported by the inability to beat the fine-tuned results. This is in line with the results obtained by Zeiler and Fergus (2013), who found that convnets pretrained on ImageNet performed much better on the Caltech-101 and Caltech-256 datasets than the same networks trained from scratch. However, it could also be that I just didn’t run enough experiments – I definitely feel like I haven’t explored everything as well as I’d like. In addition, I’m still building my intuition for what works and why. I should work more on visualising the way the network learns to uncover more hidden gotchas in addition to those I’ve already found. Finally, it could be that it’s just too hard to distinguish between covers from the genres I chose for the study.\nIdeas for future work There are many avenues for improving on the work I’ve done so far. The code could definitely be made more robust and better tested, optimised and parallelised. It would be worth investing more in hyperparameter and architecture search, including incorporation of ideas from non-vanilla convnets (e.g., GoogLeNet). 
This search should be guided by visualisation and a deeper understanding of the trained networks, which may also come from analysing class-level accuracy (certain genres seem to be easier to distinguish than others). In addition, more sophisticated preprocessing may yield improved results.\nIf the goal were to get the best possible performance on my dataset, I’d invest in establishing the human performance baseline on the dataset by running some tests with Mechanical Turk. My guess is that humans would perform better than the algorithms tested so far due to access to external knowledge. Therefore, incorporating external knowledge in the form of manual features or additional data sources may yield the most substantial performance boosts. For example, text on an album cover may contain important clues about its genre, and models pretrained on style datasets may be more suitable than ImageNet models. In addition, it may be beneficial to use a model to detect multiple elements in images where the universe is not restricted to ImageNet classes. This approach was taken by Alexandre Passant, who used Clarifai’s API to tag and classify doom metal and K-pop album covers. Finally, using several different models in an ensemble is likely to help squeeze a bit more accuracy out of the dataset.\nAnother direction that may be worth exploring is using image data for recommendation work. The reason I chose to work on this problem was my exposure to album covers through my work on Bandcamp Recommender – a music recommendation system. It is well-known that visual elements influence the way users interact with recommender systems. This is especially true in Bandcamp Recommender’s case, as users see the album covers before they choose to play them. This leads me to conjecture that considering features that describe the album covers when generating recommendations would increase user interaction with the system. 
However, it’s hard to tell whether it’d increase the overall relevance of the results. You can’t judge an album by its cover. Or can you…?\nConclusion While I’ve learned a lot from working on this project, there’s still much more to discover. It was especially great to learn some generally-applicable lessons about hyperparameter optimisation and improvements to vanilla gradient descent. Despite the many potential ways of improving performance on my dataset, my next steps in the field would probably include working on problems for which obtaining a good solution is feasible and useful. For example, I have some ideas for applications to marine creature identification.\nFeedback and suggestions are always welcome. Please feel free to contact me privately or via the comments section.\nAcknowledgement: Thanks to Brian Basham and Diogo Moitinho de Almeida for useful tips and discussions.\n","wordCount":"2117","inLanguage":"en","image":"https://yanirseroussi.com/bandcamp-album-covers-by-genre.png","datePublished":"2015-07-06T22:21:42Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button 
id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Learning about deep learning through album cover classification</h1><div class=post-meta><span title='2015-07-06 22:21:42 +0000 UTC'>July 6, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-07-06-learning-about-deep-learning-through-album-cover-classification/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_360x0_resize_box_3.png 360w 
,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png 1259w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png alt width=1259 height=649></figure><div class=post-content><p>In the past month, I&rsquo;ve spent some time on <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>my album cover classification project</a>. The goal of this project is for me to learn about deep learning by working on an actual problem. 
This post covers my progress so far, highlighting lessons that would be useful to others who are getting started with deep learning.</p><h3 id=initial-steps-summary>Initial steps summary<a hidden class=anchor aria-hidden=true href=#initial-steps-summary>#</a></h3><p>The following points were discussed in detail in the <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>previous post on this project</a>.</p><ul><li>The problem I chose to work on is classifying Bandcamp album covers by genre, using a balanced dataset of 10,000 images from 10 different genres.</li><li>The experimental code is based on <a href=http://lasagne.readthedocs.org/en/latest/ target=_blank rel=noopener>Lasagne</a>, and is <a href=https://github.com/yanirs/bandcamp-deep-learning/ target=_blank rel=noopener>available on GitHub</a>.</li><li>Having set up the environment for running experiments on a GPU, the plan was to get Lasagne&rsquo;s examples working on my dataset, and then iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I&rsquo;m satisfied with the results.</li></ul><h3 id=preliminary-experiments-and-learning-resources>Preliminary experiments and learning resources<a hidden class=anchor aria-hidden=true href=#preliminary-experiments-and-learning-resources>#</a></h3><p>I hit several issues when adapting Lasagne&rsquo;s example code to my dataset. The key issue is that the example code is based on the MNIST digits dataset. That dataset&rsquo;s images are 28×28 grayscale, and my dataset&rsquo;s images are 350×350 RGB. This difference led to the training loss quickly diverging when running the example code without any changes. It turns out that simply lowering the learning rate resolves this issue, though the initial results I got were still not much better than random. 
In general, it appears that everything works on the MNIST digits dataset, so choosing to work on my own dataset made things more challenging (which is a good thing).</p><p>The main learning resource I used is the excellent notes for the Stanford course <a href=http://cs231n.github.io/ target=_blank rel=noopener>Convolutional Neural Networks for Visual Recognition</a>. The notes are very clear, contain up-to-date information from recent publications, and include many practical tips for successful training of convolutional networks (convnets). In addition, I read some other tutorials and a few papers. These are summarised in <a href=https://yanirseroussi.com/deep-learning-resources/>a separate page</a>.</p><p>The first step after getting the MNIST examples working on my dataset was to extend the code to enable more flexible architectures. My main focus was on vanilla convnets, i.e., networks with several convolutional layers, where each convolutional layer is optionally followed by a max-pooling layer, and the convolutional layers are followed by multiple dense/fully-connected layers and dropout layers. To allow for easy experimentation, the specification of the network can be done from the command line. For example, to train an <a href=http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf target=_blank rel=noopener>AlexNet</a> architecture:</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-bash data-lang=bash><span style=display:flex><span>$ python manage.py run_experiment <span style=color:#ae81ff>\
+<meta name=keywords content="data science,deep learning,machine learning,predictive modelling"><meta name=description content="Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Learning about deep learning through album cover classification"><meta property="og:description" content="Progress on my album cover classification project, highlighting lessons that would 
be useful to others who are getting started with deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"><meta property="og:image" content="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-07-06T22:21:42+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png"><meta name=twitter:title content="Learning about deep learning through album cover classification"><meta name=twitter:description content="Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Learning about deep learning through album cover classification","item":"https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Learning about deep learning through album cover classification","name":"Learning about deep learning through album cover classification","description":"Progress on my album cover classification project, highlighting lessons that would be useful to others who are getting started with deep learning.","keywords":["data science","deep 
learning","machine learning","predictive modelling"],"articleBody":"In the past month, I’ve spent some time on my album cover classification project. The goal of this project is for me to learn about deep learning by working on an actual problem. This post covers my progress so far, highlighting lessons that would be useful to others who are getting started with deep learning.\nInitial steps summary The following points were discussed in detail in the previous post on this project.\nThe problem I chose to work on is classifying Bandcamp album covers by genre, using a balanced dataset of 10,000 images from 10 different genres. The experimental code is based on Lasagne, and is available on GitHub. Having set up the environment for running experiments on a GPU, the plan was to get Lasagne’s examples working on my dataset, and then iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I’m satisfied with the results. Preliminary experiments and learning resources I hit several issues when adapting Lasagne’s example code to my dataset. The key issue is that the example code is based on the MNIST digits dataset. That dataset’s images are 28×28 grayscale, and my dataset’s images are 350×350 RGB. This difference led to the training loss quickly diverging when running the example code without any changes. It turns out that simply lowering the learning rate resolves this issue, though the initial results I got were still not much better than random. In general, it appears that everything works on the MNIST digits dataset, so choosing to work on my own dataset made things more challenging (which is a good thing).\nThe main learning resource I used is the excellent notes for the Stanford course Convolutional Neural Networks for Visual Recognition. 
The notes are very clear, contain up-to-date information from recent publications, and include many practical tips for successful training of convolutional networks (convnets). In addition, I read some other tutorials and a few papers. These are summarised in a separate page.\nThe first step after getting the MNIST examples working on my dataset was to extend the code to enable more flexible architectures. My main focus was on vanilla convnets, i.e., networks with several convolutional layers, where each convolutional layer is optionally followed by a max-pooling layer, and the convolutional layers are followed by multiple dense/fully-connected layers and dropout layers. To allow for easy experimentation, the specification of the network can be done from the command line. For example, to train an AlexNet architecture:\n$ python manage.py run_experiment \\ --dataset-path /path/to/dataset \\ --model-architecture ConvNet \\ --model-params num_conv_layers=5:num_dense_layers=2:lc0_num_filters=48:lc0_filter_size=11:lc0_stride=4:lc0_mp=True:lm0_pool_size=3:lm0_stride=2:lc1_num_filters=128:lc1_filter_size=5:lc1_mp=True:lm1_pool_size=3:lm1_stride=2:lc2_num_filters=192:lc2_filter_size=3:lc3_num_filters=192:lc3_filter_size=3:lc4_num_filters=128:lc4_filter_size=3:lc4_mp=True:lm4_pool_size=3:lm4_stride=2:ld0_num_units=2048:ld1_num_units=2048 This can obviously be a bit of a mouthful, so common architectures are also defined in the code with parameters that can be overridden. For instance, to train an AlexNet with 64 filters in the first layer instead of 48:\n$ python manage.py run_experiment \\ --dataset-path /path/to/dataset \\ --model-architecture AlexNet \\ --model-params lc0_num_filters=64 There are many more command line flags (possibly too many), which make it easy to both tinker with various settings, and also run more rigorous experiments. My initial tinkering with convnets didn’t yield impressive results in terms of predictive accuracy on my dataset. 
It turned out that this was partly due to the lack of preprocessing – the less exciting but crucial part of any predictive modelling work.\nThe importance of preprocessing My initial focus was on getting things to work on the dataset without worrying too much about preprocessing. I haven’t done any image classification work in the past, so I had to learn about the right type of preprocessing to use. I kept it pretty simple and applied the following transformations:\nDownsampling: all images were scaled down to 256×256. I played briefly with other sizes, but decided on this size to make it easy to use models pretrained on ImageNet. Cropping \u0026 mirroring: during training time, each image was cropped to random 224×224 slices. Deterministic slices were used in test time. In addition, each crop was mirrored horizontally. In most cases I used ten overall crops. Again, these numbers were chosen for comparability with ImageNet-trained models. Mean subtraction: the training mean of each pixel was subtracted from each instance. Shuffling: probably the most important preprocessing step. Initially I had the instances sorted by their class, as an artifact of the way the dataset was constructed. Due to the relatively small number of instances the network sees in each batch, this meant that in each epoch, the network first fitted on all the instances from class 1, then all the instances from class 2, etc. This led to very poor performance, which was fixed by shuffling the data once at the start of the training procedure (shuffling every epoch could potentially make things even better). Baselines After building the experimental environment and a fair bit of tinkering, I decided it was time for some more serious experiments. The results of my initial games were rather disappointing – slightly better than a random baseline, which yields an accuracy score of 10%. 
Therefore, I ran some baselines to get an idea of what’s possible on this dataset.\nThe first baseline I tried was a random forest with 1,000 trees, which yielded 15.25% accuracy. This baseline was trained directly on the pixel values without any preprocessing other than downsampling. It’s worth noting that the downsampling size didn’t make much of a difference to this baseline (I tried a few values in the range 50×50-350×350). This baseline was also not particularly sensitive to whether RGB or grayscale values were used to represent the images.\nThe next experiments were with baselines that utilised pretrained Caffe models. Training a random forest with 1,000 trees on features extracted from the highest fully-connected layer (fc7) in the CaffeNet and VGGNet-19 models yielded accuracies of 16.72% and 16.40% respectively. This was pretty disappointing, as I expected these features to perform much better. The reason may be that album covers are very different from ImageNet images, and the representations in fc7 are too specific to ImageNet. Indeed, when fine-tuning the CaffeNet model (following the procedure outlined here), I got the best accuracy on the dataset: 22.60%. Using Caffe to train the same network from scratch didn’t even get close to this accuracy. However, I didn’t try to tune Caffe’s learning parameters. Instead, I went back to running experiments with my code.\nIt’s worth noting that the classes identified by the CaffeNet model often have little to do with the actual content of the image. Better baseline results may be obtained by using models that were pretrained on a richer dataset than ImageNet. The following table presents three example covers together with the top-five classes identified by the CaffeNet model for each image. The tags assigned by Clarifai’s API are also presented for comparison. 
From this example, it looks like Clarifai’s model is more successful at identifying the correct elements than the CaffeNet model, indicating that a baseline that uses the Clarifai tags may yield competitive performance.\nAlbum CaffeNet Clarifai October by Wille P\nhiphop_rap digital clock, spotlight, jack-o’-lantern, volcano, traffic light tree, landscape, sunset, desert, sun, sunrise, nature, evening, sky, travel Demo by Blackrat\nmetal spider web, barn spider, chain, bubble, fountain skull, bone, nobody, death, vector, help, horror, medicine, black and white, tattoo The Kool-Aid Album by Mr. Merge\nsoul dishrag, paper towel, honeycomb, envelope, chain mail symbol, nobody, sign, illustration, color, flag, text, stripes, business, character Training from scratch My initial experiments were with various convnet architectures, where I manually varied the filter sizes and number of layers to have a reasonable number of parameters and ensure that the model is trainable on a GPU with 4GB of memory. As mentioned, this approach yielded unimpressive results. Following the relative success of the fine-tuned CaffeNet baseline, I decided to run more rigorous experiments on variants of AlexNet (which is very similar to CaffeNet).\nGiven the large number of hyperparameters that need to be set when training deep convnets, I realised that setting values manually or via grid search is unlikely to yield the best results. To address this, I used hyperopt to search for the best configuration of values. 
The hyperparameters that were included in the search were the learning method (Nesterov momentum versus Adam with their respective parameters), the learning rate, whether crops are mirrored or not, the number of crops to use (1 or 5), dropout probabilities, the number of hidden units in the fully-connected layers, and the number of filters in each convolutional layer.\nEach configuration suggested by hyperopt was trained for 10 epochs, and the promising setups were trained until results stopped improving. The results of the search were rather disappointing, with the best accuracy being 17.19%. However, I learned a lot by finding hyperparameters in this manner – in the past I’ve only used a combination of manual settings with grid search.\nThere are many possible reasons for why the results are so poor. It could be that there’s just too little data to train a good classifier, which is supported by the inability to beat the fine-tuned results. This is in line with the results obtained by Zeiler and Fergus (2013), who found that convnets pretrained on ImageNet performed much better on the Caltech-101 and Caltech-256 datasets than the same networks trained from scratch. However, it could also be that I just didn’t run enough experiments – I definitely feel like I haven’t explored everything as well as I’d like. In addition, I’m still building my intuition for what works and why. I should work more on visualising the way the network learns to uncover more hidden gotchas in addition to those I’ve already found. Finally, it could be that it’s just too hard to distinguish between covers from the genres I chose for the study.\nIdeas for future work There are many avenues for improving on the work I’ve done so far. The code could definitely be made more robust and better tested, optimised and parallelised. It would be worth investing more in hyperparameter and architecture search, including incorporation of ideas from non-vanilla convnets (e.g., GoogLeNet). 
This search should be guided by visualisation and a deeper understanding of the trained networks, which may also come from analysing class-level accuracy (certain genres seem to be easier to distinguish than others). In addition, more sophisticated preprocessing may yield improved results.\nIf the goal were to get the best possible performance on my dataset, I’d invest in establishing the human performance baseline on the dataset by running some tests with Mechanical Turk. My guess is that humans would perform better than the algorithms tested so far due to access to external knowledge. Therefore, incorporating external knowledge in the form of manual features or additional data sources may yield the most substantial performance boosts. For example, text on an album cover may contain important clues about its genre, and models pretrained on style datasets may be more suitable than ImageNet models. In addition, it may be beneficial to use a model to detect multiple elements in images where the universe is not restricted to ImageNet classes. This approach was taken by Alexandre Passant, who used Clarifai’s API to tag and classify doom metal and K-pop album covers. Finally, using several different models in an ensemble is likely to help squeeze a bit more accuracy out of the dataset.\nAnother direction that may be worth exploring is using image data for recommendation work. The reason I chose to work on this problem was my exposure to album covers through my work on Bandcamp Recommender – a music recommendation system. It is well-known that visual elements influence the way users interact with recommender systems. This is especially true in Bandcamp Recommender’s case, as users see the album covers before they choose to play them. This leads me to conjecture that considering features that describe the album covers when generating recommendations would increase user interaction with the system. 
However, it’s hard to tell whether it’d increase the overall relevance of the results. You can’t judge an album by its cover. Or can you…?\nConclusion While I’ve learned a lot from working on this project, there’s still much more to discover. It was especially great to learn some generally-applicable lessons about hyperparameter optimisation and improvements to vanilla gradient descent. Despite the many potential ways of improving performance on my dataset, my next steps in the field would probably include working on problems for which obtaining a good solution is feasible and useful. For example, I have some ideas for applications to marine creature identification.\nFeedback and suggestions are always welcome. Please feel free to contact me privately or via the comments section.\nAcknowledgement: Thanks to Brian Basham and Diogo Moitinho de Almeida for useful tips and discussions.\n","wordCount":"2117","inLanguage":"en","image":"https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png","datePublished":"2015-07-06T22:21:42Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir 
Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Learning about deep learning through album cover classification</h1><div class=post-meta><span title='2015-07-06 22:21:42 +0000 UTC'>July 6, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-07-06-learning-about-deep-learning-through-album-cover-classification/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager 
srcset="https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre_hube220240ac6ea6d528d49262fd2fcb98_1398155_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png 1259w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/bandcamp-album-covers-by-genre.png alt width=1259 height=649></figure><div class=post-content><p>In the past month, I&rsquo;ve spent some time on <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>my album cover classification project</a>. The goal of this project is for me to learn about deep learning by working on an actual problem. 
This post covers my progress so far, highlighting lessons that would be useful to others who are getting started with deep learning.</p><h3 id=initial-steps-summary>Initial steps summary<a hidden class=anchor aria-hidden=true href=#initial-steps-summary>#</a></h3><p>The following points were discussed in detail in the <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>previous post on this project</a>.</p><ul><li>The problem I chose to work on is classifying Bandcamp album covers by genre, using a balanced dataset of 10,000 images from 10 different genres.</li><li>The experimental code is based on <a href=http://lasagne.readthedocs.org/en/latest/ target=_blank rel=noopener>Lasagne</a>, and is <a href=https://github.com/yanirs/bandcamp-deep-learning/ target=_blank rel=noopener>available on GitHub</a>.</li><li>Having set up the environment for running experiments on a GPU, the plan was to get Lasagne&rsquo;s examples working on my dataset, and then iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I&rsquo;m satisfied with the results.</li></ul><h3 id=preliminary-experiments-and-learning-resources>Preliminary experiments and learning resources<a hidden class=anchor aria-hidden=true href=#preliminary-experiments-and-learning-resources>#</a></h3><p>I hit several issues when adapting Lasagne&rsquo;s example code to my dataset. The key issue is that the example code is based on the MNIST digits dataset. That dataset&rsquo;s images are 28×28 grayscale, and my dataset&rsquo;s images are 350×350 RGB. This difference led to the training loss quickly diverging when running the example code without any changes. It turns out that simply lowering the learning rate resolves this issue, though the initial results I got were still not much better than random. 
In general, it appears that everything works on the MNIST digits dataset, so choosing to work on my own dataset made things more challenging (which is a good thing).</p><p>The main learning resource I used is the excellent notes for the Stanford course <a href=http://cs231n.github.io/ target=_blank rel=noopener>Convolutional Neural Networks for Visual Recognition</a>. The notes are very clear, contain up-to-date information from recent publications, and include many practical tips for successful training of convolutional networks (convnets). In addition, I read some other tutorials and a few papers. These are summarised in <a href=https://yanirseroussi.com/deep-learning-resources/>a separate page</a>.</p><p>The first step after getting the MNIST examples working on my dataset was to extend the code to enable more flexible architectures. My main focus was on vanilla convnets, i.e., networks with several convolutional layers, where each convolutional layer is optionally followed by a max-pooling layer, and the convolutional layers are followed by multiple dense/fully-connected layers and dropout layers. To allow for easy experimentation, the specification of the network can be done from the command line. For example, to train an <a href=http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf target=_blank rel=noopener>AlexNet</a> architecture:</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-bash data-lang=bash><span style=display:flex><span>$ python manage.py run_experiment <span style=color:#ae81ff>\
 </span></span></span><span style=display:flex><span><span style=color:#ae81ff></span>    --dataset-path /path/to/dataset <span style=color:#ae81ff>\
 </span></span></span><span style=display:flex><span><span style=color:#ae81ff></span>    --model-architecture ConvNet <span style=color:#ae81ff>\
 </span></span></span><span style=display:flex><span><span style=color:#ae81ff></span>    --model-params num_conv_layers<span style=color:#f92672>=</span>5:num_dense_layers<span style=color:#f92672>=</span>2:lc0_num_filters<span style=color:#f92672>=</span>48:lc0_filter_size<span style=color:#f92672>=</span>11:lc0_stride<span style=color:#f92672>=</span>4:lc0_mp<span style=color:#f92672>=</span>True:lm0_pool_size<span style=color:#f92672>=</span>3:lm0_stride<span style=color:#f92672>=</span>2:lc1_num_filters<span style=color:#f92672>=</span>128:lc1_filter_size<span style=color:#f92672>=</span>5:lc1_mp<span style=color:#f92672>=</span>True:lm1_pool_size<span style=color:#f92672>=</span>3:lm1_stride<span style=color:#f92672>=</span>2:lc2_num_filters<span style=color:#f92672>=</span>192:lc2_filter_size<span style=color:#f92672>=</span>3:lc3_num_filters<span style=color:#f92672>=</span>192:lc3_filter_size<span style=color:#f92672>=</span>3:lc4_num_filters<span style=color:#f92672>=</span>128:lc4_filter_size<span style=color:#f92672>=</span>3:lc4_mp<span style=color:#f92672>=</span>True:lm4_pool_size<span style=color:#f92672>=</span>3:lm4_stride<span style=color:#f92672>=</span>2:ld0_num_units<span style=color:#f92672>=</span>2048:ld1_num_units<span style=color:#f92672>=</span><span style=color:#ae81ff>2048</span>
diff --git a/2015/07/31/goodbye-parse-com/index.html b/2015/07/31/goodbye-parse-com/index.html
index 4157d93a1..d499f9fe5 100644
--- a/2015/07/31/goodbye-parse-com/index.html
+++ b/2015/07/31/goodbye-parse-com/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Goodbye, Parse.com | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="BCRecommender,DevOps,parse.com,software engineering"><meta name=description content="Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Goodbye, Parse.com"><meta property="og:description" content="Migrating my web apps away from Parse.com due to reliability issues. 
Self-hosting is a better solution."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"><meta property="og:image" content="https://yanirseroussi.com/farewell.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-07-31T03:29:50+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/farewell.jpg"><meta name=twitter:title content="Goodbye, Parse.com"><meta name=twitter:description content="Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Goodbye, Parse.com","item":"https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Goodbye, Parse.com","name":"Goodbye, Parse.com","description":"Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution.","keywords":["BCRecommender","DevOps","parse.com","software engineering"],"articleBody":"Over the past year, I’ve been using Parse‘s free backend-as-a-service and web hosting to serve BCRecommender (music recommendation service) and Price Dingo (now-closed shopping comparison engine). The main lesson: You get what you pay for. Despite some improvements, Parse remains very unreliable, and any time saved by using their APIs and SDKs tends to be offset by having to work around the restrictions of their sandboxed environment. 
This post details some of the issues I faced and the transition away from the service.\nWhat’s so bad about Parse? In one word: reliability. The service is simply unreliable, with many latency spikes and random errors. I reported this issue six months ago, and it’s still being investigated. Reliability has been a known issue for years (see Stack Overflow and Hacker News discussions). Parse’s acquisition by Facebook over two years ago gave some hope that these issues would be resolved quickly, but this is just not the case.\nIt is worth noting that the way I used Parse was probably somewhat uncommon. For both Price Dingo and BCRecommender, data was scraped and processed outside Parse, and then imported in bulk into Parse. As bulk imports are not supported by the API, automating the process required reliance on the web interface, which made things somewhat fragile. Further, a few months ago Parse inexplicably dropped support for uploading zipped files, making imports much slower. Finally, when importing large collections, I found that it takes ages for the data to get indexed. The final straw was with the last BCRecommender update, where even after days of waiting the data was still not fully indexed.\nPrice Dingo’s transition Price Dingo was a shopping comparison engine with a web interface. The idea was to focus on user needs in specialised product categories, as opposed to the traditional model that requires merchants to pay to be listed. I decided to shut down the service a few months ago to focus on other things, but before the shutdown, I almost completed the transition away from Parse. The first step was replacing the persistence layer with Algolia – search engine as a service. Algolia is super-fast, its advanced search capabilities are way better than Parse’s search options, and as a paid service their customer support was excellent. 
If I hadn’t shut Price Dingo down, the second step would have been replacing Parse hosting with a more reliable service, as I have recently done for BCRecommender.\nBCRecommender’s transition The Parse-hosted part of BCRecommender was a fairly simple express.js backend that rendered Jade templates. The fastest transition would probably have been to set up a standalone express.js backend and replace the Parse API calls with calls to the database. But as I much prefer coding in Python (the recommendation-generating backend is in Python), I decided to completely rewrite the web backend using Flask.\nFor hosting, I decided to go with DigitalOcean (signing up with this link gives you US$10 credit), because it has a good reputation, and it compares favourably with other infrastructure-as-a-service providers. For US$10/month you get a server with 1GB of memory, 30GB of SSD storage, and 2TB of data transfers, which should be more than enough for BCRecommender’s modest traffic (200 daily users + ~2 bot requests per second).\nSetting up the BCRecommender webapp stack is a bit more involved than getting started with Parse, but fortunately I was already familiar with all parts of the stack. It ended up being almost identical to the stack used in Charlie Huang’s blog post Deploy a MongoDB powered Flask app in 5 minutes: an Ubuntu server running MongoDB as the persistence layer, Nginx as the webserver, Gunicorn as the WSGI proxy, Supervisor for daemon management, and Fabric for managing deployments.\nBefore deploying to DigitalOcean, I used Vagrant to set up a local development environment, which is almost identical to the production environment. Deployment scripts are one thing that you don’t have to worry about when using Parse, as they provide their own build tools. 
However, it’s not too hard to implement your own scripts, so within a few hours I had the environment and the deployment scripts up and ready for translating the webapp code from express.js to Flask.\nThe translation process was pretty straightforward and actually enjoyable. The Python code ended up being much cleaner and shorter than the JavaScript code (line count reduced to 284 from 378). This was partly thanks to the newly-found freedom of being able to install any package I wanted, and partly due to the reduction in callbacks, which made the code less nested and easier to understand.\nI was hoping to use PyJade to obviate the need for translating the page templates to Jinja. However, I ran into a bunch of issues and subtle bugs that made me decide to use PyJade for one-off translation to Jinja, followed by a manual process of ensuring that each template was converted correctly. Some of the issues were:\nUsing PyJade’s Flask extension compiles the templates to Jinja on the fly, so debugging issues is hard because the line numbers in the generated Jinja templates don’t match the line numbers in the original Jade files. Jade allows the use of arbitrary JavaScript code, which PyJade doesn’t translate to Python (makes sense – it’d be too hard and messy). This caused many of my templates to simply not work because, e.g., I used the ternary operator or called a built-in JavaScript function. Worse than that, some cases failed silently, e.g., calling arr.length where arr is an array works fine in pure Jade, but is undefined in Python because arrays don’t have a length attribute. Hyphenated block names are fine in Jade, but don’t compile in Jinja. The conversion to Jinja pretty much offset the cleanliness gained in the Python code, with a growth in template line count from 403 to 464 lines, and much clutter with unnecessary closing tags. 
Jade, I will miss you, but I guess I can’t have it all.\nThe good news is that latency immediately dropped as I deployed the new environment. The graph below almost says it all. What’s missing is the much more massive spikes (5-60 seconds) and timeouts that happen pretty frequently with Parse hosting.\nNote that this graph is for a simple GET request of the homepage without fetching any of the embedded static assets or running client-side rendering. Handling the request simply populates a Jade template without touching the database. It really shouldn’t take too long unless the server is under very heavy load. And even then, Parse is supposed to handle such loads gracefully – not needing to worry about this kind of stuff is the key reason for using a backend-as-a-service!\nFinal thoughts I really like the idea behind Parse, as setting up and running a web backend is not a trivial task. They do provide some good tooling, and I was happy to work around the minor issues and restrictions that come with working in a sandboxed environment. However, the lack of reliability is a huge disadvantage, even at the attractive price point of $0. Further, there’s no indication that paying for the service would increase reliability, as the free tier includes up to 30 requests / second and it can barely handle a single request. Maybe I’ll get back to Parse one day, but for now I’m much happier with the increased power and responsibility of managing my own servers.\nUpdate (30 January, 2016): Facebook has announced it will be shutting Parse down, which is a shame. It could have been a great service if they had just focused more on reliability. You just couldn’t run serious apps on Parse, which probably meant that not many apps were upgraded to the paid tiers. It’s very disappointing that Facebook didn’t help Parse realise its potential, but this isn’t the first time a big company takes over a small product and shuts it down. 
It’s just the way of the world.\n","wordCount":"1323","inLanguage":"en","image":"https://yanirseroussi.com/farewell.jpg","datePublished":"2015-07-31T03:29:50Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a 
href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Goodbye, Parse.com</h1><div class=post-meta><span title='2015-07-31 03:29:50 +0000 UTC'>July 31, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-07-31-goodbye-parse-com/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/farewell.jpg alt></figure><div class=post-content><p>Over the past year, I&rsquo;ve been using <a href=https://parse.com target=_blank rel=noopener>Parse</a>‘s free backend-as-a-service and web hosting to serve <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender (music recommendation service)</a> and Price Dingo (now-closed shopping comparison engine). The main lesson: You get what you pay for. Despite some improvements, Parse remains very unreliable, and any time saved by using their APIs and SDKs tends to be offset by having to work around the restrictions of their sandboxed environment. This post details some of the issues I faced and the transition away from the service.</p><h3 id=whats-so-bad-about-parse>What&rsquo;s so bad about Parse?<a hidden class=anchor aria-hidden=true href=#whats-so-bad-about-parse>#</a></h3><p>In one word: <strong>reliability</strong>. The service is simply unreliable, with many latency spikes and random errors. I <a href=https://developers.facebook.com/bugs/1550140598598847/ target=_blank rel=noopener>reported this issue six months ago</a>, and it&rsquo;s still being investigated. 
Reliability has been a known issue for years (see <a href=http://stackoverflow.com/questions/11283729/how-scalable-is-parse/24253932#24253932 target=_blank rel=noopener>Stack Overflow</a> and <a href="https://news.ycombinator.com/item?id=8347310" target=_blank rel=noopener>Hacker News</a> discussions). Parse&rsquo;s acquisition by Facebook over two years ago gave some hope that these issues would be resolved quickly, but this is just not the case.</p><p>It is worth noting that the way I used Parse was probably somewhat uncommon. For both Price Dingo and BCRecommender, data was scraped and processed outside Parse, and then imported in bulk into Parse. As bulk imports are not supported by the API, <a href=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/>automating the process required reliance on the web interface</a>, which made things somewhat fragile. Further, a few months ago Parse inexplicably dropped support for uploading zipped files, making imports much slower. Finally, when importing large collections, I found that it takes ages for the data to get indexed. The final straw was with the last BCRecommender update, where even after days of waiting the data was still not fully indexed.</p><h3 id=price-dingos-transition>Price Dingo&rsquo;s transition<a hidden class=anchor aria-hidden=true href=#price-dingos-transition>#</a></h3><p>Price Dingo was a shopping comparison engine with a web interface. The idea was to focus on user needs in specialised product categories, as opposed to the traditional model that requires merchants to pay to be listed. I decided to shut down the service a few months ago to focus on other things, but before the shutdown, I almost completed the transition away from Parse. The first step was replacing the persistence layer with <a href=https://www.algolia.com/ target=_blank rel=noopener>Algolia – search engine as a service</a>. 
Algolia is super-fast, its advanced search capabilities are way better than Parse&rsquo;s search options, and as a paid service their customer support was excellent. If I hadn&rsquo;t shut Price Dingo down, the second step would have been replacing Parse hosting with a more reliable service, as I have recently done for BCRecommender.</p><h3 id=bcrecommenders-transition>BCRecommender&rsquo;s transition<a hidden class=anchor aria-hidden=true href=#bcrecommenders-transition>#</a></h3><p>The Parse-hosted part of BCRecommender was a fairly simple <a href=http://expressjs.com/ target=_blank rel=noopener>express.js</a> backend that rendered <a href=http://jade-lang.com/ target=_blank rel=noopener>Jade</a> templates. The fastest transition would probably have been to set up a standalone express.js backend and replace the Parse API calls with calls to the database. But as I much prefer coding in Python (<a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>the recommendation-generating backend is in Python</a>), I decided to completely rewrite the web backend using <a href=http://flask.pocoo.org/ target=_blank rel=noopener>Flask</a>.</p><p>For hosting, I decided to go with <a href="https://www.digitalocean.com/?refcode=cd96cae9d5e1" target=_blank rel=noopener>DigitalOcean</a> (signing up with this link gives you US$10 credit), because it has a good reputation, and it <a href=https://www.scriptrock.com/articles/cloud-service-provider-roundup-the-best-of-the-best target=_blank rel=noopener>compares favourably with other infrastructure-as-a-service providers</a>. 
For US$10/month you get a server with 1GB of memory, 30GB of SSD storage, and 2TB of data transfers, which should be more than enough for BCRecommender&rsquo;s modest traffic (200 daily users + ~2 bot requests per second).</p><p>Setting up the BCRecommender webapp stack is a bit more involved than getting started with Parse, but fortunately I was already familiar with all parts of the stack. It ended up being almost identical to the stack used in Charlie Huang&rsquo;s blog post <a href=http://www.sasanalysis.com/2015/02/deploy-mongodb-powered-flask-app-in-5.html target=_blank rel=noopener>Deploy a MongoDB powered Flask app in 5 minutes</a>: an Ubuntu server running MongoDB as the persistence layer, Nginx as the webserver, Gunicorn as the WSGI proxy, Supervisor for daemon management, and Fabric for managing deployments.</p><p>Before deploying to DigitalOcean, I used <a href=https://www.vagrantup.com/ target=_blank rel=noopener>Vagrant</a> to set up a local development environment, which is almost identical to the production environment. Deployment scripts are one thing that you don&rsquo;t have to worry about when using Parse, as they provide their own build tools. However, it&rsquo;s not too hard to implement your own scripts, so within a few hours I had the environment and the deployment scripts up and ready for translating the webapp code from express.js to Flask.</p><p>The translation process was pretty straightforward and actually enjoyable. The Python code ended up being much cleaner and shorter than the JavaScript code (line count reduced to 284 from 378). 
This was partly thanks to the newly-found freedom of being able to install any package I wanted, and partly due to the reduction in callbacks, which made the code less nested and easier to understand.</p><p>I was hoping to use <a href=https://github.com/SyrusAkbary/pyjade target=_blank rel=noopener>PyJade</a> to obviate the need for translating the page templates to <a href=http://jinja.pocoo.org/ target=_blank rel=noopener>Jinja</a>. However, I ran into a bunch of issues and subtle bugs that made me decide to use PyJade for one-off translation to Jinja, followed by a manual process of ensuring that each template was converted correctly. Some of the issues were:</p><ul><li>Using PyJade&rsquo;s Flask extension compiles the templates to Jinja on the fly, so debugging issues is hard because the line numbers in the generated Jinja templates don&rsquo;t match the line numbers in the original Jade files.</li><li>Jade allows the use of arbitrary JavaScript code, which PyJade doesn&rsquo;t translate to Python (makes sense – it&rsquo;d be too hard and messy). This caused many of my templates to simply not work because, e.g., I used the ternary operator or called a built-in JavaScript function. Worse than that, some cases failed silently, e.g., calling <code>arr.length</code> where <code>arr</code> is an array works fine in pure Jade, but is undefined in Python because arrays don&rsquo;t have a length attribute.</li><li>Hyphenated block names are fine in Jade, but don&rsquo;t compile in Jinja.</li></ul><p>The conversion to Jinja pretty much offset the cleanliness gained in the Python code, with a growth in template line count from 403 to 464 lines, and much clutter with unnecessary closing tags. Jade, I will miss you, but I guess I can&rsquo;t have it all.</p><p>The good news is that latency immediately dropped as I deployed the new environment. The graph below almost says it all. 
What&rsquo;s missing is the much more massive spikes (5-60 seconds) and timeouts that happen pretty frequently with Parse hosting.</p><figure><a href=bcrecommender-latency-digital-ocean.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="BCRecommender,DevOps,parse.com,software engineering"><meta name=description content="Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Goodbye, Parse.com"><meta property="og:description" content="Migrating my web apps away from Parse.com due to reliability issues. 
Self-hosting is a better solution."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"><meta property="og:image" content="https://yanirseroussi.com/2015/07/31/goodbye-parse-com/farewell.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-07-31T03:29:50+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/07/31/goodbye-parse-com/farewell.jpg"><meta name=twitter:title content="Goodbye, Parse.com"><meta name=twitter:description content="Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Goodbye, Parse.com","item":"https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Goodbye, Parse.com","name":"Goodbye, Parse.com","description":"Migrating my web apps away from Parse.com due to reliability issues. Self-hosting is a better solution.","keywords":["BCRecommender","DevOps","parse.com","software engineering"],"articleBody":"Over the past year, I’ve been using Parse‘s free backend-as-a-service and web hosting to serve BCRecommender (music recommendation service) and Price Dingo (now-closed shopping comparison engine). The main lesson: You get what you pay for. Despite some improvements, Parse remains very unreliable, and any time saved by using their APIs and SDKs tends to be offset by having to work around the restrictions of their sandboxed environment. 
This post details some of the issues I faced and the transition away from the service.\nWhat’s so bad about Parse? In one word: reliability. The service is simply unreliable, with many latency spikes and random errors. I reported this issue six months ago, and it’s still being investigated. Reliability has been a known issue for years (see Stack Overflow and Hacker News discussions). Parse’s acquisition by Facebook over two years ago gave some hope that these issues would be resolved quickly, but this is just not the case.\nIt is worth noting that the way I used Parse was probably somewhat uncommon. For both Price Dingo and BCRecommender, data was scraped and processed outside Parse, and then imported in bulk into Parse. As bulk imports are not supported by the API, automating the process required reliance on the web interface, which made things somewhat fragile. Further, a few months ago Parse inexplicably dropped support for uploading zipped files, making imports much slower. Finally, when importing large collections, I found that it takes ages for the data to get indexed. The final straw was with the last BCRecommender update, where even after days of waiting the data was still not fully indexed.\nPrice Dingo’s transition Price Dingo was a shopping comparison engine with a web interface. The idea was to focus on user needs in specialised product categories, as opposed to the traditional model that requires merchants to pay to be listed. I decided to shut down the service a few months ago to focus on other things, but before the shutdown, I almost completed the transition away from Parse. The first step was replacing the persistence layer with Algolia – search engine as a service. Algolia is super-fast, its advanced search capabilities are way better than Parse’s search options, and as a paid service their customer support was excellent. 
If I hadn’t shut Price Dingo down, the second step would have been replacing Parse hosting with a more reliable service, as I have recently done for BCRecommender.\nBCRecommender’s transition The Parse-hosted part of BCRecommender was a fairly simple express.js backend that rendered Jade templates. The fastest transition would probably have been to set up a standalone express.js backend and replace the Parse API calls with calls to the database. But as I much prefer coding in Python (the recommendation-generating backend is in Python), I decided to completely rewrite the web backend using Flask.\nFor hosting, I decided to go with DigitalOcean (signing up with this link gives you US$10 credit), because it has a good reputation, and it compares favourably with other infrastructure-as-a-service providers. For US$10/month you get a server with 1GB of memory, 30GB of SSD storage, and 2TB of data transfers, which should be more than enough for BCRecommender’s modest traffic (200 daily users + ~2 bot requests per second).\nSetting up the BCRecommender webapp stack is a bit more involved than getting started with Parse, but fortunately I was already familiar with all parts of the stack. It ended up being almost identical to the stack used in Charlie Huang’s blog post Deploy a MongoDB powered Flask app in 5 minutes: an Ubuntu server running MongoDB as the persistence layer, Nginx as the webserver, Gunicorn as the WSGI proxy, Supervisor for daemon management, and Fabric for managing deployments.\nBefore deploying to DigitalOcean, I used Vagrant to set up a local development environment, which is almost identical to the production environment. Deployment scripts are one thing that you don’t have to worry about when using Parse, as they provide their own build tools. 
However, it’s not too hard to implement your own scripts, so within a few hours I had the environment and the deployment scripts up and ready for translating the webapp code from express.js to Flask.\nThe translation process was pretty straightforward and actually enjoyable. The Python code ended up being much cleaner and shorter than the JavaScript code (line count reduced to 284 from 378). This was partly thanks to the newly-found freedom of being able to install any package I wanted, and partly due to the reduction in callbacks, which made the code less nested and easier to understand.\nI was hoping to use PyJade to obviate the need for translating the page templates to Jinja. However, I ran into a bunch of issues and subtle bugs that made me decide to use PyJade for one-off translation to Jinja, followed by a manual process of ensuring that each template was converted correctly. Some of the issues were:\nUsing PyJade’s Flask extension compiles the templates to Jinja on the fly, so debugging issues is hard because the line numbers in the generated Jinja templates don’t match the line numbers in the original Jade files. Jade allows the use of arbitrary JavaScript code, which PyJade doesn’t translate to Python (makes sense – it’d be too hard and messy). This caused many of my templates to simply not work because, e.g., I used the ternary operator or called a built-in JavaScript function. Worse than that, some cases failed silently, e.g., calling arr.length where arr is an array works fine in pure Jade, but is undefined in Python because arrays don’t have a length attribute. Hyphenated block names are fine in Jade, but don’t compile in Jinja. The conversion to Jinja pretty much offset the cleanliness gained in the Python code, with a growth in template line count from 403 to 464 lines, and much clutter with unnecessary closing tags. 
Jade, I will miss you, but I guess I can’t have it all.\nThe good news is that latency immediately dropped as I deployed the new environment. The graph below almost says it all. What’s missing is the much more massive spikes (5-60 seconds) and timeouts that happen pretty frequently with Parse hosting.\nNote that this graph is for a simple GET request of the homepage without fetching any of the embedded static assets or running client-side rendering. Handling the request simply populates a Jade template without touching the database. It really shouldn’t take too long unless the server is under very heavy load. And even then, Parse is supposed to handle such loads gracefully – not needing to worry about this kind of stuff is the key reason for using a backend-as-a-service!\nFinal thoughts I really like the idea behind Parse, as setting up and running a web backend is not a trivial task. They do provide some good tooling, and I was happy to work around the minor issues and restrictions that come with working in a sandboxed environment. However, the lack of reliability is a huge disadvantage, even at the attractive price point of $0. Further, there’s no indication that paying for the service would increase reliability, as the free tier includes up to 30 requests / second and it can barely handle a single request. Maybe I’ll get back to Parse one day, but for now I’m much happier with the increased power and responsibility of managing my own servers.\nUpdate (30 January, 2016): Facebook has announced it will be shutting Parse down, which is a shame. It could have been a great service if they had just focused more on reliability. You just couldn’t run serious apps on Parse, which probably meant that not many apps were upgraded to the paid tiers. It’s very disappointing that Facebook didn’t help Parse realise its potential, but this isn’t the first time a big company takes over a small product and shuts it down. 
It’s just the way of the world.\n","wordCount":"1323","inLanguage":"en","image":"https://yanirseroussi.com/2015/07/31/goodbye-parse-com/farewell.jpg","datePublished":"2015-07-31T03:29:50Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/07/31/goodbye-parse-com/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a 
href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Goodbye, Parse.com</h1><div class=post-meta><span title='2015-07-31 03:29:50 +0000 UTC'>July 31, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-07-31-goodbye-parse-com/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/farewell.jpg alt></figure><div class=post-content><p>Over the past year, I&rsquo;ve been using <a href=https://parse.com target=_blank rel=noopener>Parse</a>‘s free backend-as-a-service and web hosting to serve <a href=http://www.bcrecommender.com target=_blank rel=noopener>BCRecommender (music recommendation service)</a> and Price Dingo (now-closed shopping comparison engine). The main lesson: You get what you pay for. Despite some improvements, Parse remains very unreliable, and any time saved by using their APIs and SDKs tends to be offset by having to work around the restrictions of their sandboxed environment. This post details some of the issues I faced and the transition away from the service.</p><h3 id=whats-so-bad-about-parse>What&rsquo;s so bad about Parse?<a hidden class=anchor aria-hidden=true href=#whats-so-bad-about-parse>#</a></h3><p>In one word: <strong>reliability</strong>. The service is simply unreliable, with many latency spikes and random errors. I <a href=https://developers.facebook.com/bugs/1550140598598847/ target=_blank rel=noopener>reported this issue six months ago</a>, and it&rsquo;s still being investigated. 
Reliability has been a known issue for years (see <a href=http://stackoverflow.com/questions/11283729/how-scalable-is-parse/24253932#24253932 target=_blank rel=noopener>Stack Overflow</a> and <a href="https://news.ycombinator.com/item?id=8347310" target=_blank rel=noopener>Hacker News</a> discussions). Parse&rsquo;s acquisition by Facebook over two years ago gave some hope that these issues would be resolved quickly, but this is just not the case.</p><p>It is worth noting that the way I used Parse was probably somewhat uncommon. For both Price Dingo and BCRecommender, data was scraped and processed outside Parse, and then imported in bulk into Parse. As bulk imports are not supported by the API, <a href=https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/>automating the process required reliance on the web interface</a>, which made things somewhat fragile. Further, a few months ago Parse inexplicably dropped support for uploading zipped files, making imports much slower. Finally, when importing large collections, I found that it takes ages for the data to get indexed. The final straw was with the last BCRecommender update, where even after days of waiting the data was still not fully indexed.</p><h3 id=price-dingos-transition>Price Dingo&rsquo;s transition<a hidden class=anchor aria-hidden=true href=#price-dingos-transition>#</a></h3><p>Price Dingo was a shopping comparison engine with a web interface. The idea was to focus on user needs in specialised product categories, as opposed to the traditional model that requires merchants to pay to be listed. I decided to shut down the service a few months ago to focus on other things, but before the shutdown, I almost completed the transition away from Parse. The first step was replacing the persistence layer with <a href=https://www.algolia.com/ target=_blank rel=noopener>Algolia – search engine as a service</a>. 
Algolia is super-fast, its advanced search capabilities are way better than Parse&rsquo;s search options, and as a paid service their customer support was excellent. If I hadn&rsquo;t shut Price Dingo down, the second step would have been replacing Parse hosting with a more reliable service, as I have recently done for BCRecommender.</p><h3 id=bcrecommenders-transition>BCRecommender&rsquo;s transition<a hidden class=anchor aria-hidden=true href=#bcrecommenders-transition>#</a></h3><p>The Parse-hosted part of BCRecommender was a fairly simple <a href=http://expressjs.com/ target=_blank rel=noopener>express.js</a> backend that rendered <a href=http://jade-lang.com/ target=_blank rel=noopener>Jade</a> templates. The fastest transition would probably have been to set up a standalone express.js backend and replace the Parse API calls with calls to the database. But as I much prefer coding in Python (<a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>the recommendation-generating backend is in Python</a>), I decided to completely rewrite the web backend using <a href=http://flask.pocoo.org/ target=_blank rel=noopener>Flask</a>.</p><p>For hosting, I decided to go with <a href="https://www.digitalocean.com/?refcode=cd96cae9d5e1" target=_blank rel=noopener>DigitalOcean</a> (signing up with this link gives you US$10 credit), because it has a good reputation, and it <a href=https://www.scriptrock.com/articles/cloud-service-provider-roundup-the-best-of-the-best target=_blank rel=noopener>compares favourably with other infrastructure-as-a-service providers</a>. 
For US$10/month you get a server with 1GB of memory, 30GB of SSD storage, and 2TB of data transfers, which should be more than enough for BCRecommender&rsquo;s modest traffic (200 daily users + ~2 bot requests per second).</p><p>Setting up the BCRecommender webapp stack is a bit more involved than getting started with Parse, but fortunately I was already familiar with all parts of the stack. It ended up being almost identical to the stack used in Charlie Huang&rsquo;s blog post <a href=http://www.sasanalysis.com/2015/02/deploy-mongodb-powered-flask-app-in-5.html target=_blank rel=noopener>Deploy a MongoDB powered Flask app in 5 minutes</a>: an Ubuntu server running MongoDB as the persistence layer, Nginx as the webserver, Gunicorn as the WSGI proxy, Supervisor for daemon management, and Fabric for managing deployments.</p><p>Before deploying to DigitalOcean, I used <a href=https://www.vagrantup.com/ target=_blank rel=noopener>Vagrant</a> to set up a local development environment, which is almost identical to the production environment. Deployment scripts are one thing that you don&rsquo;t have to worry about when using Parse, as they provide their own build tools. However, it&rsquo;s not too hard to implement your own scripts, so within a few hours I had the environment and the deployment scripts up and ready for translating the webapp code from express.js to Flask.</p><p>The translation process was pretty straightforward and actually enjoyable. The Python code ended up being much cleaner and shorter than the JavaScript code (line count reduced to 284 from 378). 
This was partly thanks to the newly-found freedom of being able to install any package I wanted, and partly due to the reduction in callbacks, which made the code less nested and easier to understand.</p><p>I was hoping to use <a href=https://github.com/SyrusAkbary/pyjade target=_blank rel=noopener>PyJade</a> to obviate the need for translating the page templates to <a href=http://jinja.pocoo.org/ target=_blank rel=noopener>Jinja</a>. However, I ran into a bunch of issues and subtle bugs that made me decide to use PyJade for one-off translation to Jinja, followed by a manual process of ensuring that each template was converted correctly. Some of the issues were:</p><ul><li>Using PyJade&rsquo;s Flask extension compiles the templates to Jinja on the fly, so debugging issues is hard because the line numbers in the generated Jinja templates don&rsquo;t match the line numbers in the original Jade files.</li><li>Jade allows the use of arbitrary JavaScript code, which PyJade doesn&rsquo;t translate to Python (makes sense – it&rsquo;d be too hard and messy). This caused many of my templates to simply not work because, e.g., I used the ternary operator or called a built-in JavaScript function. Worse than that, some cases failed silently, e.g., calling <code>arr.length</code> where <code>arr</code> is an array works fine in pure Jade, but is undefined in Python because arrays don&rsquo;t have a length attribute.</li><li>Hyphenated block names are fine in Jade, but don&rsquo;t compile in Jinja.</li></ul><p>The conversion to Jinja pretty much offset the cleanliness gained in the Python code, with a growth in template line count from 403 to 464 lines, and much clutter with unnecessary closing tags. Jade, I will miss you, but I guess I can&rsquo;t have it all.</p><p>The good news is that latency immediately dropped as I deployed the new environment. The graph below almost says it all. 
What&rsquo;s missing is the much more massive spikes (5-60 seconds) and timeouts that happen pretty frequently with Parse hosting.</p><figure><a href=bcrecommender-latency-digital-ocean.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2015/07/31/goodbye-parse-com/bcrecommender-latency-digital-ocean_huab82bdddef099c15f4714a7d4c558d3e_32379_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/07/31/goodbye-parse-com/bcrecommender-latency-digital-ocean_huab82bdddef099c15f4714a7d4c558d3e_32379_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/07/31/goodbye-parse-com/bcrecommender-latency-digital-ocean_huab82bdddef099c15f4714a7d4c558d3e_32379_720x0_resize_box_3.png 720w,
diff --git a/2015/08/24/you-dont-need-a-data-scientist-yet/index.html b/2015/08/24/you-dont-need-a-data-scientist-yet/index.html
index 21b206a09..b8819d556 100644
--- a/2015/08/24/you-dont-need-a-data-scientist-yet/index.html
+++ b/2015/08/24/you-dont-need-a-data-scientist-yet/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>You don’t need a data scientist (yet) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="business,data business,data science"><meta name=description content="Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="You don’t need a data scientist (yet)"><meta property="og:description" content="Hiring data scientists prematurely is wasteful and frustrating. 
Here are some questions to ask before you hire your first data scientist."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"><meta property="og:image" content="https://yanirseroussi.com/hammer.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-08-24T08:25:30+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/hammer.jpg"><meta name=twitter:title content="You don’t need a data scientist (yet)"><meta name=twitter:description content="Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"You don’t need a data scientist (yet)","item":"https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"You don’t need a data scientist (yet)","name":"You don’t need a data scientist (yet)","description":"Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist.","keywords":["business","data business","data science"],"articleBody":"The hype around big data has caused many organisations to hire data scientists without giving much thought to what these data scientists are going to do and whether they’re actually needed. This is a source of frustration for all parties involved. 
This post discusses some questions you should ask yourself before deciding to hire your first data scientist.\nQ1: Do you know what data scientists do? Somewhat surprisingly, there are quite a few companies that hire data scientists without having a clear idea of what data scientists actually do. People seem to have a fear of missing out on the big data hype, and think of hiring data scientists as the solution. A common misconception is that a data scientist’s role includes telling you what to do with your data. While this may sometimes happen in practice, the ideal scenario is where the business has problems that can be solved using data science (more on this under Q3 below). If you don’t know what your data scientist is going to do, you probably don’t need one.\nSo what do data scientists do? When you think about it, adding the word “data” to “science” is a bit redundant, as all science is based on data. Following from this, anyone who does any kind of data analysis is a data scientist. While it may be true, this broad definition is not very helpful. As discussed in a previous post, it’s more useful to define data scientists as individuals who combine expertise in statistics and machine learning with strong software engineering skills.\nQ2: Do you have enough data available? It’s not uncommon to see products that suffer from over-engineering and premature investment in advanced analytics capabilities. In the early stages, it’s important to focus on creating a minimum viable product and getting it to market quickly. Data science starts to shine once the product is generating enough data, as most of the power of advanced analytics is in optimising and automating existing processes.\nNot having a data scientist in the early stages doesn’t mean the data is being ignored – it just means that it doesn’t require the attention of a full-time data scientist. 
If your product is at an early stage and you are still concerned, you’re better off hiring a data science consultant for a few days to help lay out the long-term vision for data-driven capabilities. This would be cheaper and less time-consuming than hiring a full-timer. The exception to this rule is when the product itself is built around advanced analytics (e.g., AlchemyAPI or Enlitic). Building such products without data scientists is far from ideal, or just impossible.\nEven if your product is mature and generating a lot of data, it doesn’t mean it’s ready for data science. Advanced analytics capabilities are at the top of data’s hierarchy of needs: If your product is buggy, or if your data is scattered everywhere and your platform lacks centralised reporting, you need to first invest in fixing your data plumbing. This is the job of data engineers. Getting data scientists involved when the data is hardly available due to infrastructure issues is likely to lead to frustration. In addition, setting up centralised reporting and dashboarding is likely to give you ideas for problems that data scientists can solve.\nQ3: Do you have a specific problem to solve? If the problem you’re trying to solve is “everyone is doing smart things with data, we should be doing stuff with data too”, you don’t have a specific problem that can be solved by bringing a data scientist on board. Defining the problem often ends up occupying a lot of the data scientist’s time, so you are likely to obtain better results if have more than just a vague idea around “doing something with data, because Hadoop”. Ideally you want to optimise an existing process that is currently being solved with heuristics, make an existing model better, implement a new data-driven feature, or something along these lines. Common examples include reducing churn, increasing conversions, and replacing manual processes with automated data-driven systems. 
Again, getting advice from experienced data scientists before committing to hiring one may be your best first step.\nQ4: Can you get away with heuristics, intuition, and/or manual processes? Some data scientists would passionately claim that you must deploy only models that are theoretically justified and well-tested. However, in many cases you can get away with using simple heuristics, intuition, and/or manual processes. These can be orders of magnitude cheaper than building sophisticated predictive models and the infrastructure to support them. For many businesses, there are more pressing needs than doing everything in a theoretically sound way. Despite what many technical people like to think, customers don’t tend to care how things are implemented, as long as their needs are fulfilled.\nFor example, I spent some time with a client whose product includes a semi-manual part where structured data is extracted from documents. Their process included sending some of the documents to a trained team in the Philippines for manual analysis. The client was interested in replacing that manual work with a machine learning algorithm. As is often the case with machine learning, it was unknown whether the resultant model would be accurate enough to completely replace the manual workers. This generally depends on data quality and the feasibility of solving the problem. Assessing the feasibility would have taken some time and money, so the client decided to park the idea and focus on other areas of their business.\nEvery business has resource constraints. Situations where the best investment you can make is hiring a full-time data scientist are rarer than what the hype may make you think. It’s often the case that functions that would be the responsibility of a data scientist are adequately performed by existing employees, such as software engineers, business/data analysts, and marketers.\nQ5: Are you committed to being data-driven? 
I have seen more than one case where data scientists are hired only to be blocked or ignored. This is more prevalent in the corporate world, where managers are often incentivised to prioritise doing things that look good over things that make financial sense. But even if recruitment is done with the best intentions, progress may be blocked by employees who feel threatened because they would be replaced by automated data-driven algorithms. Successful data science projects require support from senior leadership, as discussed by Greta Roberts, Radim Řehůřek, Alec Smith, and many others. Without such support and a strong commitment to making data-driven decisions, everyone is just wasting their time.\nClosing thoughts While data science is currently over-hyped, many organisations still have much to gain from hiring data scientists. I hope that this post has helped you decide whether you need a data scientist right now. If you’re unsure, please don’t hesitate to contact me. And to any data scientists reading this: Be very wary of potential employers who do not have good answers to the above questions. 
At this point in time you can afford to be picky, at least until the hype is over.\n","wordCount":"1178","inLanguage":"en","image":"https://yanirseroussi.com/hammer.jpg","datePublished":"2015-08-24T08:25:30Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">You don’t need a data scientist (yet)</h1><div class=post-meta><span title='2015-08-24 08:25:30 +0000 UTC'>August 24, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-08-24-you-dont-need-a-data-scientist-yet/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer_hu2c8b5baf56bd11c08a3f40db9407264b_42562_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer_hu2c8b5baf56bd11c08a3f40db9407264b_42562_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg 560w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg alt width=560 height=300></figure><div class=post-content><p>The hype around big data has caused many organisations to hire data scientists without giving much thought to what these data scientists are going to do and whether they&rsquo;re actually needed. This is a source of frustration for all parties involved. 
This post discusses some questions you should ask yourself before deciding to hire your first data scientist.</p><h3 id=q1-do-you-know-what-data-scientists-do>Q1: Do you know what data scientists do?<a hidden class=anchor aria-hidden=true href=#q1-do-you-know-what-data-scientists-do>#</a></h3><p>Somewhat surprisingly, there are quite a few companies that hire data scientists without having a clear idea of what data scientists actually do. People seem to have a fear of missing out on the big data hype, and think of hiring data scientists as the solution. A common misconception is that a data scientist&rsquo;s role includes telling you what to do with your data. While this may sometimes happen in practice, the ideal scenario is where the business has problems that can be solved using data science (more on this under Q3 below). If you don&rsquo;t know what your data scientist is going to do, you probably don&rsquo;t need one.</p><p>So what do data scientists do? When you think about it, adding the word &ldquo;data&rdquo; to &ldquo;science&rdquo; is a bit redundant, as all science is based on data. Following from this, <a href=http://robjhyndman.com/hyndsight/am-i-a-data-scientist/ target=_blank rel=noopener>anyone who does any kind of data analysis is a data scientist</a>. While it may be true, this broad definition is not very helpful. <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>As discussed in a previous post</a>, it&rsquo;s more useful to define data scientists as individuals who combine expertise in statistics and machine learning with strong software engineering skills.</p><h3 id=q2-do-you-have-enough-data-available>Q2: Do you have enough data available?<a hidden class=anchor aria-hidden=true href=#q2-do-you-have-enough-data-available>#</a></h3><p>It&rsquo;s not uncommon to see products that suffer from over-engineering and premature investment in advanced analytics capabilities. 
In the early stages, it&rsquo;s important to focus on creating a minimum viable product and getting it to market quickly. Data science starts to shine once the product is generating enough data, as most of the power of advanced analytics is in optimising and automating existing processes.</p><p>Not having a data scientist in the early stages doesn&rsquo;t mean the data is being ignored – it just means that it doesn&rsquo;t require the attention of a full-time data scientist. If your product is at an early stage and you are still concerned, you&rsquo;re better off hiring a data science consultant for a few days to help lay out the long-term vision for data-driven capabilities. This would be cheaper and less time-consuming than hiring a full-timer. The exception to this rule is when the product itself is built around advanced analytics (e.g., <a href=http://www.alchemyapi.com/ target=_blank rel=noopener>AlchemyAPI</a> or <a href=http://www.enlitic.com/ target=_blank rel=noopener>Enlitic</a>). Building such products without data scientists is far from ideal, or just impossible.</p><p>Even if your product is mature and generating a lot of data, it doesn&rsquo;t mean it&rsquo;s ready for data science. Advanced analytics capabilities are at the top of <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>data&rsquo;s hierarchy of needs</a>: If your product is buggy, or if your data is scattered everywhere and your platform lacks centralised reporting, you need to first invest in fixing your data plumbing. This is the job of <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>data engineers</a>. Getting data scientists involved when the data is hardly available due to infrastructure issues is likely to lead to frustration. 
In addition, setting up centralised reporting and dashboarding is likely to give you ideas for problems that data scientists can solve.</p><h3 id=q3-do-you-have-a-specific-problem-to-solve>Q3: Do you have a specific problem to solve?<a hidden class=anchor aria-hidden=true href=#q3-do-you-have-a-specific-problem-to-solve>#</a></h3><p>If the problem you&rsquo;re trying to solve is &ldquo;everyone is doing smart things with data, we should be doing stuff with data too&rdquo;, you don&rsquo;t have a specific problem that can be solved by bringing a data scientist on board. Defining the problem often ends up occupying a lot of the data scientist&rsquo;s time, so you are likely to obtain better results if have more than just a vague idea around &ldquo;doing something with data, because Hadoop&rdquo;. Ideally you want to optimise an existing process that is currently being solved with heuristics, make an existing model better, implement a new data-driven feature, or something along these lines. Common examples include reducing churn, increasing conversions, and replacing manual processes with automated data-driven systems. Again, getting advice from experienced data scientists before committing to hiring one may be your best first step.</p><h3 id=q4-can-you-get-away-with-heuristics-intuition-andor-manual-processes>Q4: Can you get away with heuristics, intuition, and/or manual processes?<a hidden class=anchor aria-hidden=true href=#q4-can-you-get-away-with-heuristics-intuition-andor-manual-processes>#</a></h3><p>Some data scientists would passionately claim that you must deploy only models that are theoretically justified and well-tested. However, in many cases you can get away with using simple heuristics, intuition, and/or manual processes. These can be orders of magnitude cheaper than building sophisticated predictive models and the infrastructure to support them. For many businesses, there are more pressing needs than doing everything in a theoretically sound way. 
Despite what many technical people like to think, customers don&rsquo;t tend to care how things are implemented, as long as their needs are fulfilled.</p><p>For example, I spent some time with a client whose product includes a semi-manual part where structured data is extracted from documents. Their process included sending some of the documents to a trained team in the Philippines for manual analysis. The client was interested in replacing that manual work with a machine learning algorithm. As is often the case with machine learning, it was unknown whether the resultant model would be accurate enough to completely replace the manual workers. This generally depends on data quality and the feasibility of solving the problem. Assessing the feasibility would have taken some time and money, so the client decided to park the idea and focus on other areas of their business.</p><p>Every business has resource constraints. Situations where the best investment you can make is hiring a full-time data scientist are rarer than what the hype may make you think. It&rsquo;s often the case that functions that would be the responsibility of a data scientist are adequately performed by existing employees, such as software engineers, business/data analysts, and marketers.</p><h3 id=q5-are-you-committed-to-being-data-driven>Q5: Are you committed to being data-driven?<a hidden class=anchor aria-hidden=true href=#q5-are-you-committed-to-being-data-driven>#</a></h3><p>I have seen more than one case where data scientists are hired only to be blocked or ignored. This is more prevalent in the corporate world, where managers are often incentivised to prioritise doing things that look good over things that make financial sense. But even if recruitment is done with the best intentions, progress may be blocked by employees who feel threatened because they would be replaced by automated data-driven algorithms. 
Successful data science projects require support from senior leadership, as discussed by <a href=http://venturebeat.com/2015/07/22/stop-hiring-data-scientists-until-youre-ready-for-data-science/ target=_blank rel=noopener>Greta Roberts</a>, <a href=https://berlinbuzzwords.de/sites/berlinbuzzwords.de/files/media/documents/radim_rehurek-so_you_want_to_be_a_data_science_consultant.pdf target=_blank rel=noopener>Radim Řehůřek</a>, <a href=https://www.linkedin.com/pulse/big-data-science-analytics-australia-alec-smith target=_blank rel=noopener>Alec Smith</a>, and many others. Without such support and a strong commitment to making data-driven decisions, everyone is just wasting their time.</p><h3 id=closing-thoughts>Closing thoughts<a hidden class=anchor aria-hidden=true href=#closing-thoughts>#</a></h3><p>While data science is currently over-hyped, many organisations still have much to gain from hiring data scientists. I hope that this post has helped you decide whether you need a data scientist right now. If you&rsquo;re unsure, please don&rsquo;t hesitate to <a href=https://yanirseroussi.com/about/ target=_blank rel=noopener>contact me</a>. And to any data scientists reading this: Be very wary of potential employers who do not have good answers to the above questions. 
At this point in time you can afford to be picky, at least until the hype is over.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-business/>data business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on x" href="https://x.com/intent/tweet/?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&amp;hashtags=business%2cdatabusiness%2cdatascience"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&amp;title=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;summary=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 
423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&title=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 
17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on whatsapp" href="https://api.whatsapp.com/send?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 
179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on telegram" href="https://telegram.me/share/url?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 
20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on ycombinator" href="https://news.ycombinator.com/submitlink?t=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="business,data business,data science"><meta name=description content="Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="You don’t need a data scientist (yet)"><meta property="og:description" content="Hiring data scientists prematurely is wasteful and frustrating. 
Here are some questions to ask before you hire your first data scientist."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"><meta property="og:image" content="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-08-24T08:25:30+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg"><meta name=twitter:title content="You don’t need a data scientist (yet)"><meta name=twitter:description content="Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"You don’t need a data scientist (yet)","item":"https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"You don’t need a data scientist (yet)","name":"You don’t need a data scientist (yet)","description":"Hiring data scientists prematurely is wasteful and frustrating. Here are some questions to ask before you hire your first data scientist.","keywords":["business","data business","data science"],"articleBody":"The hype around big data has caused many organisations to hire data scientists without giving much thought to what these data scientists are going to do and whether they’re actually needed. This is a source of frustration for all parties involved. 
This post discusses some questions you should ask yourself before deciding to hire your first data scientist.\nQ1: Do you know what data scientists do? Somewhat surprisingly, there are quite a few companies that hire data scientists without having a clear idea of what data scientists actually do. People seem to have a fear of missing out on the big data hype, and think of hiring data scientists as the solution. A common misconception is that a data scientist’s role includes telling you what to do with your data. While this may sometimes happen in practice, the ideal scenario is where the business has problems that can be solved using data science (more on this under Q3 below). If you don’t know what your data scientist is going to do, you probably don’t need one.\nSo what do data scientists do? When you think about it, adding the word “data” to “science” is a bit redundant, as all science is based on data. Following from this, anyone who does any kind of data analysis is a data scientist. While it may be true, this broad definition is not very helpful. As discussed in a previous post, it’s more useful to define data scientists as individuals who combine expertise in statistics and machine learning with strong software engineering skills.\nQ2: Do you have enough data available? It’s not uncommon to see products that suffer from over-engineering and premature investment in advanced analytics capabilities. In the early stages, it’s important to focus on creating a minimum viable product and getting it to market quickly. Data science starts to shine once the product is generating enough data, as most of the power of advanced analytics is in optimising and automating existing processes.\nNot having a data scientist in the early stages doesn’t mean the data is being ignored – it just means that it doesn’t require the attention of a full-time data scientist. 
If your product is at an early stage and you are still concerned, you’re better off hiring a data science consultant for a few days to help lay out the long-term vision for data-driven capabilities. This would be cheaper and less time-consuming than hiring a full-timer. The exception to this rule is when the product itself is built around advanced analytics (e.g., AlchemyAPI or Enlitic). Building such products without data scientists is far from ideal, or just impossible.\nEven if your product is mature and generating a lot of data, it doesn’t mean it’s ready for data science. Advanced analytics capabilities are at the top of data’s hierarchy of needs: If your product is buggy, or if your data is scattered everywhere and your platform lacks centralised reporting, you need to first invest in fixing your data plumbing. This is the job of data engineers. Getting data scientists involved when the data is hardly available due to infrastructure issues is likely to lead to frustration. In addition, setting up centralised reporting and dashboarding is likely to give you ideas for problems that data scientists can solve.\nQ3: Do you have a specific problem to solve? If the problem you’re trying to solve is “everyone is doing smart things with data, we should be doing stuff with data too”, you don’t have a specific problem that can be solved by bringing a data scientist on board. Defining the problem often ends up occupying a lot of the data scientist’s time, so you are likely to obtain better results if you have more than just a vague idea around “doing something with data, because Hadoop”. Ideally you want to optimise an existing process that is currently being solved with heuristics, make an existing model better, implement a new data-driven feature, or something along these lines. Common examples include reducing churn, increasing conversions, and replacing manual processes with automated data-driven systems. 
Again, getting advice from experienced data scientists before committing to hiring one may be your best first step.\nQ4: Can you get away with heuristics, intuition, and/or manual processes? Some data scientists would passionately claim that you must deploy only models that are theoretically justified and well-tested. However, in many cases you can get away with using simple heuristics, intuition, and/or manual processes. These can be orders of magnitude cheaper than building sophisticated predictive models and the infrastructure to support them. For many businesses, there are more pressing needs than doing everything in a theoretically sound way. Despite what many technical people like to think, customers don’t tend to care how things are implemented, as long as their needs are fulfilled.\nFor example, I spent some time with a client whose product includes a semi-manual part where structured data is extracted from documents. Their process included sending some of the documents to a trained team in the Philippines for manual analysis. The client was interested in replacing that manual work with a machine learning algorithm. As is often the case with machine learning, it was unknown whether the resultant model would be accurate enough to completely replace the manual workers. This generally depends on data quality and the feasibility of solving the problem. Assessing the feasibility would have taken some time and money, so the client decided to park the idea and focus on other areas of their business.\nEvery business has resource constraints. Situations where the best investment you can make is hiring a full-time data scientist are rarer than what the hype may make you think. It’s often the case that functions that would be the responsibility of a data scientist are adequately performed by existing employees, such as software engineers, business/data analysts, and marketers.\nQ5: Are you committed to being data-driven? 
I have seen more than one case where data scientists are hired only to be blocked or ignored. This is more prevalent in the corporate world, where managers are often incentivised to prioritise doing things that look good over things that make financial sense. But even if recruitment is done with the best intentions, progress may be blocked by employees who feel threatened because they would be replaced by automated data-driven algorithms. Successful data science projects require support from senior leadership, as discussed by Greta Roberts, Radim Řehůřek, Alec Smith, and many others. Without such support and a strong commitment to making data-driven decisions, everyone is just wasting their time.\nClosing thoughts While data science is currently over-hyped, many organisations still have much to gain from hiring data scientists. I hope that this post has helped you decide whether you need a data scientist right now. If you’re unsure, please don’t hesitate to contact me. And to any data scientists reading this: Be very wary of potential employers who do not have good answers to the above questions. 
At this point in time you can afford to be picky, at least until the hype is over.\n","wordCount":"1178","inLanguage":"en","image":"https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg","datePublished":"2015-08-24T08:25:30Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line 
x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">You don’t need a data scientist (yet)</h1><div class=post-meta><span title='2015-08-24 08:25:30 +0000 UTC'>August 24, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-08-24-you-dont-need-a-data-scientist-yet/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer_hu2c8b5baf56bd11c08a3f40db9407264b_42562_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer_hu2c8b5baf56bd11c08a3f40db9407264b_42562_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg 560w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/hammer.jpg alt width=560 height=300></figure><div class=post-content><p>The hype around big data has caused many organisations to hire data scientists without giving much thought to what these data scientists are going to do and whether they&rsquo;re actually needed. This is a source of frustration for all parties involved. 
This post discusses some questions you should ask yourself before deciding to hire your first data scientist.</p><h3 id=q1-do-you-know-what-data-scientists-do>Q1: Do you know what data scientists do?<a hidden class=anchor aria-hidden=true href=#q1-do-you-know-what-data-scientists-do>#</a></h3><p>Somewhat surprisingly, there are quite a few companies that hire data scientists without having a clear idea of what data scientists actually do. People seem to have a fear of missing out on the big data hype, and think of hiring data scientists as the solution. A common misconception is that a data scientist&rsquo;s role includes telling you what to do with your data. While this may sometimes happen in practice, the ideal scenario is where the business has problems that can be solved using data science (more on this under Q3 below). If you don&rsquo;t know what your data scientist is going to do, you probably don&rsquo;t need one.</p><p>So what do data scientists do? When you think about it, adding the word &ldquo;data&rdquo; to &ldquo;science&rdquo; is a bit redundant, as all science is based on data. Following from this, <a href=http://robjhyndman.com/hyndsight/am-i-a-data-scientist/ target=_blank rel=noopener>anyone who does any kind of data analysis is a data scientist</a>. While it may be true, this broad definition is not very helpful. <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>As discussed in a previous post</a>, it&rsquo;s more useful to define data scientists as individuals who combine expertise in statistics and machine learning with strong software engineering skills.</p><h3 id=q2-do-you-have-enough-data-available>Q2: Do you have enough data available?<a hidden class=anchor aria-hidden=true href=#q2-do-you-have-enough-data-available>#</a></h3><p>It&rsquo;s not uncommon to see products that suffer from over-engineering and premature investment in advanced analytics capabilities. 
In the early stages, it&rsquo;s important to focus on creating a minimum viable product and getting it to market quickly. Data science starts to shine once the product is generating enough data, as most of the power of advanced analytics is in optimising and automating existing processes.</p><p>Not having a data scientist in the early stages doesn&rsquo;t mean the data is being ignored – it just means that it doesn&rsquo;t require the attention of a full-time data scientist. If your product is at an early stage and you are still concerned, you&rsquo;re better off hiring a data science consultant for a few days to help lay out the long-term vision for data-driven capabilities. This would be cheaper and less time-consuming than hiring a full-timer. The exception to this rule is when the product itself is built around advanced analytics (e.g., <a href=http://www.alchemyapi.com/ target=_blank rel=noopener>AlchemyAPI</a> or <a href=http://www.enlitic.com/ target=_blank rel=noopener>Enlitic</a>). Building such products without data scientists is far from ideal, or just impossible.</p><p>Even if your product is mature and generating a lot of data, it doesn&rsquo;t mean it&rsquo;s ready for data science. Advanced analytics capabilities are at the top of <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>data&rsquo;s hierarchy of needs</a>: If your product is buggy, or if your data is scattered everywhere and your platform lacks centralised reporting, you need to first invest in fixing your data plumbing. This is the job of <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>data engineers</a>. Getting data scientists involved when the data is hardly available due to infrastructure issues is likely to lead to frustration. 
In addition, setting up centralised reporting and dashboarding is likely to give you ideas for problems that data scientists can solve.</p><h3 id=q3-do-you-have-a-specific-problem-to-solve>Q3: Do you have a specific problem to solve?<a hidden class=anchor aria-hidden=true href=#q3-do-you-have-a-specific-problem-to-solve>#</a></h3><p>If the problem you&rsquo;re trying to solve is &ldquo;everyone is doing smart things with data, we should be doing stuff with data too&rdquo;, you don&rsquo;t have a specific problem that can be solved by bringing a data scientist on board. Defining the problem often ends up occupying a lot of the data scientist&rsquo;s time, so you are likely to obtain better results if you have more than just a vague idea around &ldquo;doing something with data, because Hadoop&rdquo;. Ideally you want to optimise an existing process that is currently being solved with heuristics, make an existing model better, implement a new data-driven feature, or something along these lines. Common examples include reducing churn, increasing conversions, and replacing manual processes with automated data-driven systems. Again, getting advice from experienced data scientists before committing to hiring one may be your best first step.</p><h3 id=q4-can-you-get-away-with-heuristics-intuition-andor-manual-processes>Q4: Can you get away with heuristics, intuition, and/or manual processes?<a hidden class=anchor aria-hidden=true href=#q4-can-you-get-away-with-heuristics-intuition-andor-manual-processes>#</a></h3><p>Some data scientists would passionately claim that you must deploy only models that are theoretically justified and well-tested. However, in many cases you can get away with using simple heuristics, intuition, and/or manual processes. These can be orders of magnitude cheaper than building sophisticated predictive models and the infrastructure to support them. For many businesses, there are more pressing needs than doing everything in a theoretically sound way. 
Despite what many technical people like to think, customers don&rsquo;t tend to care how things are implemented, as long as their needs are fulfilled.</p><p>For example, I spent some time with a client whose product includes a semi-manual part where structured data is extracted from documents. Their process included sending some of the documents to a trained team in the Philippines for manual analysis. The client was interested in replacing that manual work with a machine learning algorithm. As is often the case with machine learning, it was unknown whether the resultant model would be accurate enough to completely replace the manual workers. This generally depends on data quality and the feasibility of solving the problem. Assessing the feasibility would have taken some time and money, so the client decided to park the idea and focus on other areas of their business.</p><p>Every business has resource constraints. Situations where the best investment you can make is hiring a full-time data scientist are rarer than what the hype may make you think. It&rsquo;s often the case that functions that would be the responsibility of a data scientist are adequately performed by existing employees, such as software engineers, business/data analysts, and marketers.</p><h3 id=q5-are-you-committed-to-being-data-driven>Q5: Are you committed to being data-driven?<a hidden class=anchor aria-hidden=true href=#q5-are-you-committed-to-being-data-driven>#</a></h3><p>I have seen more than one case where data scientists are hired only to be blocked or ignored. This is more prevalent in the corporate world, where managers are often incentivised to prioritise doing things that look good over things that make financial sense. But even if recruitment is done with the best intentions, progress may be blocked by employees who feel threatened because they would be replaced by automated data-driven algorithms. 
Successful data science projects require support from senior leadership, as discussed by <a href=http://venturebeat.com/2015/07/22/stop-hiring-data-scientists-until-youre-ready-for-data-science/ target=_blank rel=noopener>Greta Roberts</a>, <a href=https://berlinbuzzwords.de/sites/berlinbuzzwords.de/files/media/documents/radim_rehurek-so_you_want_to_be_a_data_science_consultant.pdf target=_blank rel=noopener>Radim Řehůřek</a>, <a href=https://www.linkedin.com/pulse/big-data-science-analytics-australia-alec-smith target=_blank rel=noopener>Alec Smith</a>, and many others. Without such support and a strong commitment to making data-driven decisions, everyone is just wasting their time.</p><h3 id=closing-thoughts>Closing thoughts<a hidden class=anchor aria-hidden=true href=#closing-thoughts>#</a></h3><p>While data science is currently over-hyped, many organisations still have much to gain from hiring data scientists. I hope that this post has helped you decide whether you need a data scientist right now. If you&rsquo;re unsure, please don&rsquo;t hesitate to <a href=https://yanirseroussi.com/about/ target=_blank rel=noopener>contact me</a>. And to any data scientists reading this: Be very wary of potential employers who do not have good answers to the above questions. 
At this point in time you can afford to be picky, at least until the hype is over.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-business/>data business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on x" href="https://x.com/intent/tweet/?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&amp;hashtags=business%2cdatabusiness%2cdatascience"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&amp;title=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;summary=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 
423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f&title=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 
17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on whatsapp" href="https://api.whatsapp.com/send?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 
179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on telegram" href="https://telegram.me/share/url?text=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 
20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share You don’t need a data scientist (yet) on ycombinator" href="https://news.ycombinator.com/submitlink?t=You%20don%e2%80%99t%20need%20a%20data%20scientist%20%28yet%29&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f08%2f24%2fyou-dont-need-a-data-scientist-yet%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-621><div class=comment-header><a href=#comment-621><img class=comment-avatar src="https://www.gravatar.com/avatar/3e83196ec5d22b66453107ead83adc58?s=50"><p class=comment-info><strong>Eric Colson</strong><br><small>2015-08-28 16:10:40</small></p></a></div><div class="comment-body post-content"><p>I enjoyed the post - though I offer some contrary points to consider:</p><p>I have learned that if it is clear that you will need a data scientists (by someone who knows what they do), then you should get them as soon as possible. Don’t wait. Data Scientists work best when they have full context for the problem they are here to solve. Getting them in early allows them to help frame the problem. This framing is critical. If the framing is off, it takes a very long time (sometimes never) to get it back on track. A late-to-the-game data scientists can be too influenced by the the existing framing they are given. They tend to think within that box, when in reality, the box was never the right way to approach the problem. Even if they do see outside of it, it can be very difficult to convince the original framers that there is a better way to do things (people can get quite attached to their vision).</p><p>It also can be wise to NOT WAIT till there is data to analyze. Too often, data is an afterthought. Its important for the data scientist to get in early on the initiative so he or she can help define the needed instrumentation and data acquisition strategy. They can even guide the needs of the data warehouse and other repositories where the newly captured data will reside.</p><p>Further, it is often the case that it is the data scientist that identifies the specific problem to solve. At my company, I estimate that over half of the ideas for new data products, features, and services come from the data science team &ndash; not the business. 
This is intuitive as the data scientists are the folks that are most intimate with the data and are least constrained by what is possible to do with data. Give them business context and they will come up with problems/solutions that no one has thought of.</p><p>Finally, I find heuristics to be dangerous. At best they are suboptimal, and more often than not, they are just plain wrong (those with extensive A/B testing experience can attest to the fact that our intuition fails us again and again). Undoing a bad heuristics can be very painful - in the technical work, the coordinate work, and in the resetting of expectations. Its hard to get people to not walk on a paved path &mldr; even if that path is the long way or a dead-end.</p><p>I totally agree with &ldquo;Q5: Are you committed to being data-driven?&rdquo;. This comes down to business model and culture. Is your business model one where data science can be the source of strategic differentiation? Is your culture able to support empiricism? The answer to both of these has to be &lsquo;yes&rsquo; in order to commit to being data-driven.</p></div></div><div class=comment-level-1 id=comment-623><div class=comment-header><a href=#comment-623><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2015-08-29 02:43:41</small></p></a></div><div class="comment-body post-content"><p>Thank you for your thoughtful comments, Eric!</p><p>I generally agree that it can be beneficial to involve data scientists early on and to avoid thoughtless heuristics, but that it all depends on having a supportive data-driven environment and on resource constraints. As mentioned under Q2, getting advice from a data scientist in the early stages of the product is worthwhile, so it may be smart to pay for a few days of consulting, but not necessarily a good idea to hire a full-timer. 
A lot of it depends on the general product vision.</p><p>Another note regarding heuristics and intuition: While some may be dangerous, you can view many modelling decisions as heuristics. For example, when building a predictive model, you have to make some intuition-driven choices around features (no model uses all the knowledge in the world), learning algorithms and their hyperparameters. You just can&rsquo;t test everything, so there&rsquo;s a need for compromises if you aim to ever deliver anything.</p></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2015/10/02/the-wonderful-world-of-recommender-systems/index.html b/2015/10/02/the-wonderful-world-of-recommender-systems/index.html
index b0046e146..51e5b5e6b 100644
--- a/2015/10/02/the-wonderful-world-of-recommender-systems/index.html
+++ b/2015/10/02/the-wonderful-world-of-recommender-systems/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The wonderful world of recommender systems | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,machine learning,predictive modelling,recommender systems,software engineering"><meta name=description content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The wonderful world of recommender systems"><meta property="og:description" content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"><meta property="og:image" content="https://yanirseroussi.com/recommender-universe.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-10-02T05:25:57+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/recommender-universe.jpg"><meta name=twitter:title content="The wonderful world of recommender systems"><meta name=twitter:description content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The wonderful world of recommender systems","item":"https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The wonderful world of recommender systems","name":"The wonderful world of recommender systems","description":"Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems.","keywords":["data science","machine learning","predictive modelling","recommender systems","software engineering"],"articleBody":"I recently gave a talk about recommender systems at the Data Science Sydney meetup (the slides are available here). This post roughly follows the outline of the talk, expanding on some of the key points in non-slide form (i.e., complete sentences and paragraphs!). 
The first few sections give a broad overview of the field and the common recommendation paradigms, while the final part is dedicated to debunking five common myths about recommender systems.\nMotivation: Why should we care about recommender systems? The key reason why many people seem to care about recommender systems is money. For companies such as Amazon, Netflix, and Spotify, recommender systems drive significant engagement and revenue. But this is the more cynical view of things. The reason these companies (and others) see increased revenue is because they deliver actual value to their customers – recommender systems provide a scalable way of personalising content for users in scenarios with many items.\nAnother reason why data scientists specifically should care about recommender systems is that it is a true data science problem. That is, at least according to my favourite definition of data science as the intersection between software engineering, machine learning, and statistics. As we will see, building successful recommender systems requires all of these skills (and more).\nDefining recommender systems When trying to the define anything, a reasonable first step is to ask Wikipedia. Unfortunately, as of the day of this post’s publication, Wikipedia defines recommender systems too narrowly, as “a subclass of information filtering system that seek to predict the ‘rating’ or ‘preference’ that a user would give to an item” (I should probably fix it, but this wrong definition helped my talk flow better – let me know if you fix it and I’ll update this paragraph).\nThe problem with Wikipedia’s definition is that there’s so much more to recommender systems than rating prediction. First, recommender is a misnomer – calling it a discovery assistant is better, as the so-called recommendations are far from binding. 
Second, system means that elements like presentation are important, which is part of what makes recommendation such an interesting data science problem.\nMy definition is simply:\nRecommender systems are systems that help users discover items they may like. Recommendation paradigms Depending on who you ask, there are between two and twenty different recommendation paradigms. The usual classification is by the type of data that is used to generate recommendations. The distinction between approaches is more academic than practical, as it is often a good idea to use hybrids/ensembles to address each method’s limitations. Nonetheless, it is worthwhile discussing the different paradigms. The way I see it, if you ignore trivial approaches that often work surprisingly well (e.g., popular items, and “watch it again”), there are four main paradigms: collaborative filtering, content-based, social/demographic, and contextual recommendation.\nCollaborative filtering is perhaps the most famous approach to recommendation, to the point that it is sometimes seen as synonymous with the field. The main idea is that you’re given a matrix of preferences by users for items, and these are used to predict missing preferences and recommend items with high predictions. One of the key advantages of this approach is that there has been a huge amount of research into collaborative filtering, making it pretty well-understood, with existing libraries that make implementation fairly straightforward. Another important advantage is that collaborative filtering is independent of item properties. All you need to get started is user and item IDs, and some notion of preference by users for items (ratings, views, etc.).\nThe major limitation of collaborative filtering is its reliance on preferences. In a cold-start scenario, where there are no preferences at all, it can’t generate any recommendations. 
However, cold starts can also occur when there are millions of available preferences, because pure collaborative recommendation doesn’t work for items or users with no ratings, and often performs pretty poorly when there are only a few ratings. Further, the underlying collaborative model may yield disappointing results when the preference matrix is sparse. In fact, this has been my experience in nearly every situation where I deployed collaborative filtering. It always requires tweaking, and never simply works out of the box.\nContent-based algorithms are given user preferences for items, and recommend similar items based on a domain-specific notion of item content. The main advantage of content-based recommendation over collaborative filtering is that it doesn’t require as much user feedback to get going. Even one known user preference can yield many good recommendations (which can lead to the collection of preferences to enable collaborative recommendation). In many scenarios, content-based recommendation is the most natural approach. For example, when recommending news articles or blog posts, it’s natural to compare the textual content of the items. This approach also extends naturally to cases where item metadata is available (e.g., movie stars, book authors, and music genres).\nOne problem with deploying content-based recommendations arises when item similarity is not so easily defined. However, even when it is natural to measure similarity, content-based recommendations may end up being too homogeneous to be useful. Such recommendations may also be too static over time, thereby failing to adjust to changes in individual user tastes and other shifts in the underlying data.\nSocial and demographic recommenders suggest items that are liked by friends, friends of friends, and demographically-similar people. Such recommenders don’t need any preferences by the user to whom recommendations are made, making them very powerful. 
In my experience, even trivially-implemented approaches can be depressingly accurate. For example, just summing the number of Facebook likes by a person’s close friends can often be enough to paint a pretty accurate picture of what that person likes.\nGiven this power of social and demographic recommenders, it isn’t surprising that social networks don’t easily give their data away. This means that for many practitioners, employing social/demographic recommendation algorithms is simply impossible. However, even when such data is available, it is not always easy to use without creeping users out. Further, privacy concerns need to be carefully addressed to ensure that users are comfortable with using the system.\nContextual recommendation algorithms recommend items that match the user’s current context. This allows them to be more flexible and adaptive to current user needs than methods that ignore context (essentially giving the same weight to all of the user’s history). Hence, contextual algorithms are more likely to elicit a response than approaches that are based only on historical data.\nThe key limitations of contextual recommenders are similar to those of social and demographic recommenders – contextual data may not always be available, and there’s a risk of creeping out the user. For example, ad retargeting can be seen as a form of contextual recommendation that follows users around the web and across devices, without having the explicit consent of the users to being tracked in this manner.\nFive common myths about recommender systems There are some common myths and misconceptions surrounding recommender systems. I’ve picked five to address in this post. 
If you disagree, agree, or have more to add, I would love to hear from you either privately or in the comment section.\nThe accuracy myth\nOffline optimisation of an accuracy measure is sufficient for creating a successful recommender\nReality\nUsers don't really care about accuracy This is perhaps the most prevalent myth of all, as evidenced by Wikipedia’s definition of recommender systems. It’s somewhat surprising that it still persists, as it’s been almost ten years since McNee et al.’s influential paper on the damage the focus on accuracy measures has done to the field.\nIt is therefore worth asking where this myth came from. My theory is that it is a feedback loop between academia and industry. In academia it is pretty easy to publish papers with infinitesimal improvements to arbitrary accuracy measures on offline datasets (I’m also guilty of doing just that), while it’s relatively hard to run experiments on live systems. However, one of the moves that significantly increased focus on offline predictive accuracy came from industry, in the form of the $1M Netflix prize, where the goal was to improve the accuracy of Netflix’s rating prediction algorithm by 10%.\nNotably, most of the algorithms that came out of the three-year competition were never integrated into Netflix. 
As discussed on the Netflix blog:\nYou might be wondering what happened with the final Grand Prize ensemble that won the $1M two years later… We evaluated some of the new methods offline but the additional accuracy gains that we measured did not seem to justify the engineering effort needed to bring them into a production environment.\nOur business objective is to maximize member satisfaction and month-to-month subscription retention… Now it is clear that the Netflix Prize objective, accurate prediction of a movie’s rating, is just one of the many components of an effective recommendation system that optimizes our members’ enjoyment.\nThe following chart says it all (taken from the second part of the blog post quoted above):\nAn important question that arises is: If users don’t really care about predictive accuracy, what do they care about? The answer is that predictive accuracy has some importance (as evidenced by the above chart), but it is not the only thing. In my opinion, the key consideration is UI/UX. You can have the most accurate recommendations in the world, but no one would know about it (or care) if they are not served in a timely manner through a friendly interface.\nOf course, even with a great user interface and accurate predictions, there are other issues that require attention when designing recommender systems. Examples include diversity (showing various types of items), serendipity/novelty (showing non-obvious recommendations that users don’t already know about), and coverage (being able to generate recommendations for all users and items). Many other considerations are covered in an excellent survey by Guy Shani and Asela Gunawardana.\nIt’s also worth noting that there is an inherent problem with common accuracy measures. Specifically, when using a measure like root mean square error, a rating prediction algorithm can be made to perform better by reducing errors on low ratings. 
This is rather pointless, because items with low ratings will not be shown to users in any case.\nFinally, a key issue that arises with offline evaluation is that there are biases in offline datasets that do not necessarily carry over to online scenarios. For instance, in many cases there is an implicit assumption that data is missing at random, when it really isn’t, e.g., the fact that users took the effort to watch and rate a movie already tells us a lot about a bias they have towards this movie (the team that won the Netflix prize used this bias to their advantage). Hiding this rating and trying to predict it is not the same as predicting a rating for a movie that is picked at random from the entire set of movies.\nThe black box myth\nYou can build successful recommender systems without worrying about what's being recommended and how recommendations are being served\nReality\nUI/UX is king, item type is critical A good recommender system has to consider how users interact with the recommendations. For example, the number of displayed recommendations should inform the optimisation procedure (e.g., are you aiming for precision@1 or precision@10?). How these recommendations are laid out (e.g., horizontally/vertically) tends to influence user interaction. In addition, being able to explain the reasons for the recommendations can yield easy wins. Finally, in many cases there are constraints on the amount of time that can be spent generating recommendations.\nIn addition to UI/UX, the design of good recommender systems has to account for what’s being recommended. For example, music tracks and short videos can be played many times, so it’s probably a good idea to recommend items that the user has already seen. On the other hand, items like washing machines and cars don’t get consumed as often. 
If a user has just bought a washing machine, they’re unlikely to want another one anytime soon (but they may want a dryer or a clothes line).\nHynt is a recommender-system-as-a-service for e-commerce whose development I led up until the middle of last year. The general idea is that merchants simply add a few lines of JavaScript to their shop pages and Hynt does the hard work of recommending relevant items from the store, while considering the user and page context. Going live with Hynt reaffirmed many well-known UI/UX lessons. Most notably:\nAbove the fold is better than below. Engagement with Hynt widgets that were visible without scrolling was higher than those that were lower on the page. More recommendations are better than a few. Hynt widgets are responsive, adapting to the size of the container they’re placed in. Engagement was more likely when more recommendations were displayed, because users were more likely to find something they liked without scrolling through the widget. Fast is better than slow. If recommendations load faster, more people see them, which increases engagement. In Hynt’s case speed was especially important because the widgets load asynchronously after the host page finishes loading. Another important UI/UX element is explanations. Displaying a plausible explanation next to a recommendation can do wonders, without making any changes to the underlying recommendation algorithms. The impact of explanations has been studied extensively by Nava Tintarev and Judith Masthoff. 
They have identified seven different aims of explanations, which are summarised in the following table (reproduced from their survey of explanations in recommender systems).\nAim Definition Transparency Explain how the system works Scrutability Allow users to tell the system it is wrong Trust Increase user confidence in the system Effectiveness Help users make good decisions Persuasiveness Convince users to try or buy Efficiency Help users make decisions faster Satisfaction Increase ease of usability or enjoyment Explanations are ubiquitous in real-world recommender systems. For example, Amazon uses explanations like “frequently bought together”, and “customers who bought this item also bought”, while Netflix presents different lists of recommendations where each list is driven by a different reason. However, as the following Netflix example shows, it is worth making sure that the explanations you provide don’t make you look stupid.\nThe solved problem myth\nThe space of recommender systems has been exhaustively explored\nReality\nDevelopment of new methods is often required When I finished my PhD, about three years ago, I joined a small startup called Giveable as the first employee (essentially part of the founding team that was formed after Adam Neumann, the original founder, graduated from AngelCube and raised some seed funding). Giveable’s original product was a webapp where users could connect with their Facebook account and find gifts for their friends.\nAt the time, there wasn’t much published research on gift recommendation, and there was more or less nothing about the specific problem of recommending gifts for Facebook friends using liked pages. Here are some of the ways this problem differs from classic recommendation scenarios.\nNeed to consider giver and receiver. Unlike traditional scenarios, the recommended items aren’t consumed by the user to whom they’re shown. 
In practice, this meant that we had to ensure the items are giftable, and take into account the relationship between the giver and the receiver. For example, the type of gift your mum may give you is different from gifts your partner may give you. Likes are historical, sparse, and often nonsensical. This is best illustrated by an example: What does liking a page such as Tony Abbott – Worst PM in Australian History tell us about gifts the user may like? Tony Abbott is no longer prime minister (thankfully), so it’s historical, and while this page is quite popular, there are many other pages out there that are difficult to interpret and are liked by only a handful of people (this video is a good summary of why Tony is disliked, for those who are unfamiliar with Australian politics). Likes are not for recommended items. As the above example shows, just because you like disliking Tony, it doesn’t exactly lead to useful gifts. Even with things that are more related to interests, such as authors and bands, the liked pages aren’t recommendable as gifts. Likes are not always available offline. This was an important engineering consideration: We didn’t have much time to generate recommendations from the point where a new user gave us permission to view their likes and the likes of their friends. Ideally, recommendation generation would take less than a second from the time we got all the data from Facebook. This puts a strong constraint on the types of algorithms we could use. The key to effectively addressing the Giveable recommendation problem was doing as much processing offline as possible. Specifically:\nSimilar pages were inferred using Latent Dirichlet Allocation (which can be seen as a collaborative filtering technique). This made it possible to use information from pages that are not directly linked to giftable products, e.g., for the above Tony Abbott example, people who dislike him are likely to be left-leaning, which implies many other interests. 
Facebook pages were matched to giftable products with heuristics + Mechanical Turk + machine learning. This took a few iterations of what was essentially partly-manual semi-supervised learning, where we obtained high-confidence matches through heuristics and manual tagging, and then used this to train a classifier that was used to classify uncertain matches. The results of classification on a hold-out set were then verified through manual tagging of subsamples. We enriched the page and product data with structured information from the Freebase knowledge graph (which has since been deprecated). This allowed us to easily match giftable products to liked pages, e.g., books to authors. The online part included taking a receiver’s liked pages, inferring likes for similar pages, and matching all these pages to a ranked and diversified list of giftable product recommendations. These recommendations came with explanations, which were quite important in this case because the giver of a gift has to know why they’re giving it.\nThe silver bullet myth\nOptimising a single measure or using a single algorithm is sufficient for generating a good recommendation list\nReality\nHybrids work best Netflix provides another example for how focusing on a single algorithm or measure of success is far from sufficient. In a recent blog post, they talk about how they use multiple algorithms to optimise the order of different recommendation lists and each list’s internal ranking, while considering device-specific UI constraints, relevance, engagement, diversity, business requirements, and more.\nAn example from my experience comes from Giveable (which ended up evolving into Hynt), where a single list was generated by mixing the outputs of the following recommendation approaches: contextual, direct likes, inferred likes, content-based, social, collaborative filtering of products, previously viewed items, and popular interests/products. 
The weight of each algorithm in the mix was static – it was either set manually or through A/B testing, and then left as a hardcoded constant.\nThis kind of static mix can get you very far, but there’s a better way that I haven’t gotten around to implementing before leaving to work on other things. This way is described in a series of posts on bandits for recommenders by Sergey Feldman of RichRelevance. The general idea is to train recommendation models offline using a small number of strategies/paradigms. Online, recommendations are served from strategies that maximise clickthrough and revenue, given a context of features that describe the user, merchant, and web page where the RichRelevance widget is embedded. Rather than setting static weights for the strategies, the bandit model continuously adjusts the weights, while balancing between exploring new strategy weights and exploiting strategies that have been known to work well in a specific context. This allows the overall recommendation engine to adjust to changes in reality and in the underlying data.\nThe omnipresence myth\nEvery personalised system is a recommender system\nReality\nThis one is kinda true, but not necessarily useful... The first conference I attended as a PhD student was the 18th International Conference on User Modeling, Adaptation and Personalization (UMAP), back in 2010. The field of recommender systems was getting increased attention, and Peter Brusilovsky, who has been working in the UMAP field for decades, argued that recommender systems are the new expert systems. This was partly because the hype was causing people to broaden the definition of the field to allow them to say that they’re working on recommender systems.\nI don’t think it’s incorrect that personalisation and recommender systems are different things. However, one problem that this may cause is making people think that common recommendation techniques would apply in scenarios where they’re unlikely to work. 
For example, web search can be seen as a recommender system for pages that gives a high weight to the user’s intent, as captured by the query. Hence, when personalising web search, it seems sensible to use collaborative filtering techniques. This was indeed my experience with the Yandex search personalisation competition: employing a matrix factorisation approach that was inspired by collaborative filtering turned out to be a waste of time compared to domain-specific methods.\nIn conclusion, recommenders are about as murky as data science. Just like data science, the boundaries of recommender systems are hard to define and they are sometimes over-hyped. This hype may lead to people investing in a recommender system they don’t really need, just like the common issue of premature investment in data science. However, the hype is based on real value, which can definitely be delivered by recommender systems when they are used correctly.\n","wordCount":"3577","inLanguage":"en","image":"https://yanirseroussi.com/recommender-universe.jpg","datePublished":"2015-10-02T05:25:57Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div 
class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The wonderful world of recommender systems</h1><div class=post-meta><span title='2015-10-02 05:25:57 +0000 UTC'>October 2, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-10-02-the-wonderful-world-of-recommender-systems/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg 4961w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg alt width=4961 height=2468></figure><div class=post-content><p>I recently gave a talk about recommender systems at the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney meetup</a> (the slides are available <a href=http://yanirs.github.io/talks/the-wonderful-world-of-recommender-systems target=_blank rel=noopener>here</a>). This post roughly follows the outline of the talk, expanding on some of the key points in non-slide form (i.e., complete sentences and paragraphs!). 
The first few sections give a broad overview of the field and the common recommendation paradigms, while the final part is dedicated to debunking five common myths about recommender systems.</p><h3 id=motivation-why-should-we-care-about-recommender-systems>Motivation: Why should we care about recommender systems?<a hidden class=anchor aria-hidden=true href=#motivation-why-should-we-care-about-recommender-systems>#</a></h3><p>The key reason why many people seem to care about recommender systems is <em>money</em>. For companies such as Amazon, Netflix, and Spotify, recommender systems drive significant engagement and revenue. But this is the more cynical view of things. The reason these companies (and others) see increased revenue is because they deliver actual <em>value</em> to their customers – recommender systems provide a scalable way of personalising content for users in scenarios with many items.</p><p>Another reason why data scientists specifically should care about recommender systems is that it is a true data science problem. That is, at least according to <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>my favourite definition of data science</a> as the intersection between software engineering, machine learning, and statistics. As we will see, building successful recommender systems requires all of these skills (and more).</p><h3 id=defining-recommender-systems>Defining recommender systems<a hidden class=anchor aria-hidden=true href=#defining-recommender-systems>#</a></h3><p>When trying to the define anything, a reasonable first step is to ask Wikipedia. 
Unfortunately, as of the day of this post&rsquo;s publication, <a href=http://en.wikipedia.org/wiki/Recommender_system target=_blank rel=noopener>Wikipedia defines recommender systems too narrowly</a>, as &ldquo;a subclass of information filtering system that seek to predict the ‘rating&rsquo; or ‘preference&rsquo; that a user would give to an item&rdquo; (I should probably fix it, but this wrong definition helped my talk flow better – let me know if you fix it and I&rsquo;ll update this paragraph).</p><p>The problem with Wikipedia&rsquo;s definition is that there&rsquo;s so much more to recommender systems than rating prediction. First, <em>recommender</em> is a misnomer – calling it a discovery assistant is better, as the so-called recommendations are far from binding. Second, <em>system</em> means that elements like presentation are important, which is part of what makes recommendation such an interesting data science problem.</p><p>My definition is simply:</p><p class=highlight-box><i>Recommender systems are systems that help users discover items they may like.</i></p><h3 id=recommendation-paradigms>Recommendation paradigms<a hidden class=anchor aria-hidden=true href=#recommendation-paradigms>#</a></h3><p>Depending on who you ask, there are between two and twenty different recommendation paradigms. The usual classification is by the type of data that is used to generate recommendations. The distinction between approaches is more academic than practical, as it is often a good idea to use hybrids/ensembles to address each method&rsquo;s limitations. Nonetheless, it is worthwhile discussing the different paradigms. 
The way I see it, if you ignore trivial approaches that often work surprisingly well (e.g., popular items, and &ldquo;watch it again&rdquo;), there are four main paradigms: collaborative filtering, content-based, social/demographic, and contextual recommendation.</p><p><strong>Collaborative filtering</strong> is perhaps the most famous approach to recommendation, to the point that it is sometimes seen as synonymous with the field. The main idea is that you&rsquo;re given a matrix of preferences by users for items, and these are used to predict missing preferences and recommend items with high predictions. One of the key advantages of this approach is that there has been a huge amount of research into collaborative filtering, making it pretty well-understood, with existing libraries that make implementation fairly straightforward. Another important advantage is that collaborative filtering is independent of item properties. All you need to get started is user and item IDs, and some notion of preference by users for items (ratings, views, etc.).</p><p>The major limitation of collaborative filtering is its reliance on preferences. In a cold-start scenario, where there are no preferences at all, it can&rsquo;t generate any recommendations. However, cold starts can also occur when there are millions of available preferences, because pure collaborative recommendation doesn&rsquo;t work for items or users with no ratings, and <a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>often performs pretty poorly when there are only a few ratings</a>. Further, the underlying collaborative model may yield disappointing results when the preference matrix is sparse. In fact, this has been my experience in <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>nearly every situation where I deployed collaborative filtering</a>. 
It always requires tweaking, and never simply works out of the box.</p><p><strong>Content-based</strong> algorithms are given user preferences for items, and recommend similar items based on a domain-specific notion of item content. The main advantage of content-based recommendation over collaborative filtering is that it doesn&rsquo;t require as much user feedback to get going. Even one known user preference can yield many good recommendations (which can lead to the collection of preferences to enable collaborative recommendation). In many scenarios, content-based recommendation is the most natural approach. For example, when recommending news articles or blog posts, it&rsquo;s natural to compare the textual content of the items. This approach also extends naturally to cases where item metadata is available (e.g., movie stars, book authors, and music genres).</p><p>One problem with deploying content-based recommendations arises when item similarity is not so easily defined. However, even when it is natural to measure similarity, content-based recommendations may end up being too homogeneous to be useful. Such recommendations may also be too static over time, thereby failing to adjust to changes in individual user tastes and other shifts in the underlying data.</p><p><strong>Social and demographic</strong> recommenders suggest items that are liked by friends, friends of friends, and demographically-similar people. Such recommenders don&rsquo;t need any preferences by the user to whom recommendations are made, making them very powerful. In my experience, even trivially-implemented approaches can be depressingly accurate. For example, just summing the number of Facebook likes by a person&rsquo;s close friends can often be enough to paint a pretty accurate picture of what that person likes.</p><p>Given this power of social and demographic recommenders, it isn&rsquo;t surprising that social networks don&rsquo;t easily give their data away. 
This means that for many practitioners, employing social/demographic recommendation algorithms is simply impossible. However, even when such data is available, it is not always easy to use without creeping users out. Further, privacy concerns need to be carefully addressed to ensure that users are comfortable with using the system.</p><p><strong>Contextual</strong> recommendation algorithms recommend items that match the user&rsquo;s current context. This allows them to be more flexible and adaptive to current user needs than methods that ignore context (essentially giving the same weight to all of the user&rsquo;s history). Hence, contextual algorithms are more likely to elicit a response than approaches that are based only on historical data.</p><p>The key limitations of contextual recommenders are similar to those of social and demographic recommenders – contextual data may not always be available, and there&rsquo;s a risk of creeping out the user. For example, <a href=https://en.wikipedia.org/wiki/Behavioral_retargeting target=_blank rel=noopener>ad retargeting</a> can be seen as a form of contextual recommendation that follows users around the web and across devices, without having the explicit consent of the users to being tracked in this manner.</p><h3 id=five-common-myths-about-recommender-systems>Five common myths about recommender systems<a hidden class=anchor aria-hidden=true href=#five-common-myths-about-recommender-systems>#</a></h3><p>There are some common myths and misconceptions surrounding recommender systems. I&rsquo;ve picked five to address in this post. 
If you disagree, agree, or have more to add, I would love to hear from you either <a href=https://yanirseroussi.com/about/>privately</a> or in the comment section.</p><p class=highlight-box><b>The accuracy myth</b><br>Offline optimisation of an accuracy measure is sufficient for creating a successful recommender<br><b>Reality</b><br>Users don't really care about accuracy</p><p>This is perhaps the most prevalent myth of all, as evidenced by Wikipedia&rsquo;s definition of recommender systems. It&rsquo;s somewhat surprising that it still persists, as it&rsquo;s been almost ten years since <a href="http://dl.acm.org/citation.cfm?id=1125659" target=_blank rel=noopener>McNee et al.&rsquo;s influential paper on the damage the focus on accuracy measures has done to the field</a>.</p><p>It is therefore worth asking where this myth came from. My theory is that it is a feedback loop between academia and industry. In academia it is pretty easy to publish papers with infinitesimal improvements to arbitrary accuracy measures on offline datasets (<a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>I&rsquo;m also guilty of doing just that</a>), while it&rsquo;s relatively hard to run experiments on live systems. However, one of the moves that significantly increased focus on offline predictive accuracy came from industry, in the form of the <a href=https://en.wikipedia.org/wiki/Netflix_Prize target=_blank rel=noopener>$1M Netflix prize</a>, where the goal was to improve the accuracy of Netflix&rsquo;s rating prediction algorithm by 10%.</p><p>Notably, most of the algorithms that came out of the three-year competition were never integrated into Netflix. 
As <a href=http://techblog.netflix.com/2012/04/netflix-recommendations-beyond-5-stars.html target=_blank rel=noopener>discussed on the Netflix blog</a>:</p><blockquote><p>You might be wondering what happened with the final Grand Prize ensemble that won the $1M two years later&mldr; We evaluated some of the new methods offline but the additional accuracy gains that we measured did not seem to justify the engineering effort needed to bring them into a production environment.</p><p>Our business objective is to maximize member satisfaction and month-to-month subscription retention&mldr; Now it is clear that the Netflix Prize objective, accurate prediction of a movie&rsquo;s rating, is just one of the many components of an effective recommendation system that optimizes our members&rsquo; enjoyment.</p></blockquote><p>The following chart says it all (taken from <a href=http://techblog.netflix.com/2012/06/netflix-recommendations-beyond-5-stars.html target=_blank rel=noopener>the second part of the blog post quoted above</a>):</p><figure><a href=netflix-rating-prediction-contribution.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,machine learning,predictive modelling,recommender systems,software engineering"><meta name=description content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The wonderful world of recommender systems"><meta property="og:description" content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"><meta property="og:image" content="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-10-02T05:25:57+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg"><meta name=twitter:title content="The wonderful world of recommender systems"><meta name=twitter:description content="Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The wonderful world of recommender systems","item":"https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The wonderful world of recommender systems","name":"The wonderful world of recommender systems","description":"Giving an overview of the field and common paradigms, and debunking five common myths about recommender systems.","keywords":["data science","machine learning","predictive modelling","recommender systems","software engineering"],"articleBody":"I recently gave a talk about recommender systems at the Data Science Sydney meetup (the slides are available here). 
This post roughly follows the outline of the talk, expanding on some of the key points in non-slide form (i.e., complete sentences and paragraphs!). The first few sections give a broad overview of the field and the common recommendation paradigms, while the final part is dedicated to debunking five common myths about recommender systems.\nMotivation: Why should we care about recommender systems? The key reason why many people seem to care about recommender systems is money. For companies such as Amazon, Netflix, and Spotify, recommender systems drive significant engagement and revenue. But this is the more cynical view of things. The reason these companies (and others) see increased revenue is because they deliver actual value to their customers – recommender systems provide a scalable way of personalising content for users in scenarios with many items.\nAnother reason why data scientists specifically should care about recommender systems is that it is a true data science problem. That is, at least according to my favourite definition of data science as the intersection between software engineering, machine learning, and statistics. As we will see, building successful recommender systems requires all of these skills (and more).\nDefining recommender systems When trying to define anything, a reasonable first step is to ask Wikipedia. Unfortunately, as of the day of this post’s publication, Wikipedia defines recommender systems too narrowly, as “a subclass of information filtering system that seek to predict the ‘rating’ or ‘preference’ that a user would give to an item” (I should probably fix it, but this wrong definition helped my talk flow better – let me know if you fix it and I’ll update this paragraph).\nThe problem with Wikipedia’s definition is that there’s so much more to recommender systems than rating prediction. First, recommender is a misnomer – calling it a discovery assistant is better, as the so-called recommendations are far from binding. 
Second, system means that elements like presentation are important, which is part of what makes recommendation such an interesting data science problem.\nMy definition is simply:\nRecommender systems are systems that help users discover items they may like. Recommendation paradigms Depending on who you ask, there are between two and twenty different recommendation paradigms. The usual classification is by the type of data that is used to generate recommendations. The distinction between approaches is more academic than practical, as it is often a good idea to use hybrids/ensembles to address each method’s limitations. Nonetheless, it is worthwhile discussing the different paradigms. The way I see it, if you ignore trivial approaches that often work surprisingly well (e.g., popular items, and “watch it again”), there are four main paradigms: collaborative filtering, content-based, social/demographic, and contextual recommendation.\nCollaborative filtering is perhaps the most famous approach to recommendation, to the point that it is sometimes seen as synonymous with the field. The main idea is that you’re given a matrix of preferences by users for items, and these are used to predict missing preferences and recommend items with high predictions. One of the key advantages of this approach is that there has been a huge amount of research into collaborative filtering, making it pretty well-understood, with existing libraries that make implementation fairly straightforward. Another important advantage is that collaborative filtering is independent of item properties. All you need to get started is user and item IDs, and some notion of preference by users for items (ratings, views, etc.).\nThe major limitation of collaborative filtering is its reliance on preferences. In a cold-start scenario, where there are no preferences at all, it can’t generate any recommendations. 
However, cold starts can also occur when there are millions of available preferences, because pure collaborative recommendation doesn’t work for items or users with no ratings, and often performs pretty poorly when there are only a few ratings. Further, the underlying collaborative model may yield disappointing results when the preference matrix is sparse. In fact, this has been my experience in nearly every situation where I deployed collaborative filtering. It always requires tweaking, and never simply works out of the box.\nContent-based algorithms are given user preferences for items, and recommend similar items based on a domain-specific notion of item content. The main advantage of content-based recommendation over collaborative filtering is that it doesn’t require as much user feedback to get going. Even one known user preference can yield many good recommendations (which can lead to the collection of preferences to enable collaborative recommendation). In many scenarios, content-based recommendation is the most natural approach. For example, when recommending news articles or blog posts, it’s natural to compare the textual content of the items. This approach also extends naturally to cases where item metadata is available (e.g., movie stars, book authors, and music genres).\nOne problem with deploying content-based recommendations arises when item similarity is not so easily defined. However, even when it is natural to measure similarity, content-based recommendations may end up being too homogeneous to be useful. Such recommendations may also be too static over time, thereby failing to adjust to changes in individual user tastes and other shifts in the underlying data.\nSocial and demographic recommenders suggest items that are liked by friends, friends of friends, and demographically-similar people. Such recommenders don’t need any preferences by the user to whom recommendations are made, making them very powerful. 
In my experience, even trivially-implemented approaches can be depressingly accurate. For example, just summing the number of Facebook likes by a person’s close friends can often be enough to paint a pretty accurate picture of what that person likes.\nGiven this power of social and demographic recommenders, it isn’t surprising that social networks don’t easily give their data away. This means that for many practitioners, employing social/demographic recommendation algorithms is simply impossible. However, even when such data is available, it is not always easy to use without creeping users out. Further, privacy concerns need to be carefully addressed to ensure that users are comfortable with using the system.\nContextual recommendation algorithms recommend items that match the user’s current context. This allows them to be more flexible and adaptive to current user needs than methods that ignore context (essentially giving the same weight to all of the user’s history). Hence, contextual algorithms are more likely to elicit a response than approaches that are based only on historical data.\nThe key limitations of contextual recommenders are similar to those of social and demographic recommenders – contextual data may not always be available, and there’s a risk of creeping out the user. For example, ad retargeting can be seen as a form of contextual recommendation that follows users around the web and across devices, without having the explicit consent of the users to being tracked in this manner.\nFive common myths about recommender systems There are some common myths and misconceptions surrounding recommender systems. I’ve picked five to address in this post. 
If you disagree, agree, or have more to add, I would love to hear from you either privately or in the comment section.\nThe accuracy myth\nOffline optimisation of an accuracy measure is sufficient for creating a successful recommender\nReality\nUsers don't really care about accuracy This is perhaps the most prevalent myth of all, as evidenced by Wikipedia’s definition of recommender systems. It’s somewhat surprising that it still persists, as it’s been almost ten years since McNee et al.’s influential paper on the damage the focus on accuracy measures has done to the field.\nIt is therefore worth asking where this myth came from. My theory is that it is a feedback loop between academia and industry. In academia it is pretty easy to publish papers with infinitesimal improvements to arbitrary accuracy measures on offline datasets (I’m also guilty of doing just that), while it’s relatively hard to run experiments on live systems. However, one of the moves that significantly increased focus on offline predictive accuracy came from industry, in the form of the $1M Netflix prize, where the goal was to improve the accuracy of Netflix’s rating prediction algorithm by 10%.\nNotably, most of the algorithms that came out of the three-year competition were never integrated into Netflix. 
As discussed on the Netflix blog:\nYou might be wondering what happened with the final Grand Prize ensemble that won the $1M two years later… We evaluated some of the new methods offline but the additional accuracy gains that we measured did not seem to justify the engineering effort needed to bring them into a production environment.\nOur business objective is to maximize member satisfaction and month-to-month subscription retention… Now it is clear that the Netflix Prize objective, accurate prediction of a movie’s rating, is just one of the many components of an effective recommendation system that optimizes our members’ enjoyment.\nThe following chart says it all (taken from the second part of the blog post quoted above):\nAn important question that arises is: If users don’t really care about predictive accuracy, what do they care about? The answer is that predictive accuracy has some importance (as evidenced by the above chart), but it is not the only thing. In my opinion, the key consideration is UI/UX. You can have the most accurate recommendations in the world, but no one would know about it (or care) if they are not served in a timely manner through a friendly interface.\nOf course, even with a great user interface and accurate predictions, there are other issues that require attention when designing recommender systems. Examples include diversity (showing various types of items), serendipity/novelty (showing non-obvious recommendations that users don’t already know about), and coverage (being able to generate recommendations for all users and items). Many other considerations are covered in an excellent survey by Guy Shani and Asela Gunawardana.\nIt’s also worth noting that there is an inherent problem with common accuracy measures. Specifically, when using a measure like root mean square error, a rating prediction algorithm can be made to perform better by reducing errors on low ratings. 
This is rather pointless, because items with low ratings will not be shown to users in any case.\nFinally, a key issue that arises with offline evaluation is that there are biases in offline datasets that do not necessarily carry over to online scenarios. For instance, in many cases there is an implicit assumption that data is missing at random, when it really isn’t, e.g., the fact that users took the effort to watch and rate a movie already tells us a lot about a bias they have towards this movie (the team that won the Netflix prize used this bias to their advantage). Hiding this rating and trying to predict it is not the same as predicting a rating for a movie that is picked at random from the entire set of movies.\nThe black box myth\nYou can build successful recommender systems without worrying about what's being recommended and how recommendations are being served\nReality\nUI/UX is king, item type is critical A good recommender system has to consider how users interact with the recommendations. For example, the number of displayed recommendations should inform the optimisation procedure (e.g., are you aiming for precision@1 or precision@10?). How these recommendations are laid out (e.g., horizontally/vertically) tends to influence user interaction. In addition, being able to explain the reasons for the recommendations can yield easy wins. Finally, in many cases there are constraints on the amount of time that can be spent generating recommendations.\nIn addition to UI/UX, the design of good recommender systems has to account for what’s being recommended. For example, music tracks and short videos can be played many times, so it’s probably a good idea to recommend items that the user has already seen. On the other hand, items like washing machines and cars don’t get consumed as often. 
If a user has just bought a washing machine, they’re unlikely to want another one anytime soon (but they may want a dryer or a clothes line).\nHynt is a recommender-system-as-a-service for e-commerce whose development I led up until the middle of last year. The general idea is that merchants simply add a few lines of JavaScript to their shop pages and Hynt does the hard work of recommending relevant items from the store, while considering the user and page context. Going live with Hynt reaffirmed many well-known UI/UX lessons. Most notably:\nAbove the fold is better than below. Engagement with Hynt widgets that were visible without scrolling was higher than those that were lower on the page. More recommendations are better than a few. Hynt widgets are responsive, adapting to the size of the container they’re placed in. Engagement was more likely when more recommendations were displayed, because users were more likely to find something they liked without scrolling through the widget. Fast is better than slow. If recommendations load faster, more people see them, which increases engagement. In Hynt’s case speed was especially important because the widgets load asynchronously after the host page finishes loading. Another important UI/UX element is explanations. Displaying a plausible explanation next to a recommendation can do wonders, without making any changes to the underlying recommendation algorithms. The impact of explanations has been studied extensively by Nava Tintarev and Judith Masthoff. 
They have identified seven different aims of explanations, which are summarised in the following table (reproduced from their survey of explanations in recommender systems).\nAim Definition Transparency Explain how the system works Scrutability Allow users to tell the system it is wrong Trust Increase user confidence in the system Effectiveness Help users make good decisions Persuasiveness Convince users to try or buy Efficiency Help users make decisions faster Satisfaction Increase ease of usability or enjoyment Explanations are ubiquitous in real-world recommender systems. For example, Amazon uses explanations like “frequently bought together”, and “customers who bought this item also bought”, while Netflix presents different lists of recommendations where each list is driven by a different reason. However, as the following Netflix example shows, it is worth making sure that the explanations you provide don’t make you look stupid.\nThe solved problem myth\nThe space of recommender systems has been exhaustively explored\nReality\nDevelopment of new methods is often required When I finished my PhD, about three years ago, I joined a small startup called Giveable as the first employee (essentially part of the founding team that was formed after Adam Neumann, the original founder, graduated from AngelCube and raised some seed funding). Giveable’s original product was a webapp where users could connect with their Facebook account and find gifts for their friends.\nAt the time, there wasn’t much published research on gift recommendation, and there was more or less nothing about the specific problem of recommending gifts for Facebook friends using liked pages. Here are some of the ways this problem differs from classic recommendation scenarios.\nNeed to consider giver and receiver. Unlike traditional scenarios, the recommended items aren’t consumed by the user to whom they’re shown. 
In practice, this meant that we had to ensure the items are giftable, and take into account the relationship between the giver and the receiver. For example, the type of gift your mum may give you is different from gifts your partner may give you. Likes are historical, sparse, and often nonsensical. This is best illustrated by an example: What does liking a page such as Tony Abbott – Worst PM in Australian History tell us about gifts the user may like? Tony Abbott is no longer prime minister (thankfully), so it’s historical, and while this page is quite popular, there are many other pages out there that are difficult to interpret and are liked by only a handful of people (this video is a good summary of why Tony is disliked, for those who are unfamiliar with Australian politics). Likes are not for recommended items. As the above example shows, just because you like disliking Tony, it doesn’t exactly lead to useful gifts. Even with things that are more related to interests, such as authors and bands, the liked pages aren’t recommendable as gifts. Likes are not always available offline. This was an important engineering consideration: We didn’t have much time to generate recommendations from the point where a new user gave us permission to view their likes and the likes of their friends. Ideally, recommendation generation would take less than a second from the time we got all the data from Facebook. This puts a strong constraint on the types of algorithms we could use. The key to effectively addressing the Giveable recommendation problem was doing as much processing offline as possible. Specifically:\nSimilar pages were inferred using Latent Dirichlet Allocation (which can be seen as a collaborative filtering technique). This made it possible to use information from pages that are not directly linked to giftable products, e.g., for the above Tony Abbott example, people who dislike him are likely to be left-leaning, which implies many other interests. 
Facebook pages were matched to giftable products with heuristics + Mechanical Turk + machine learning. This took a few iterations of what was essentially partly-manual semi-supervised learning, where we obtained high-confidence matches through heuristics and manual tagging, and then used this to train a classifier that was used to classify uncertain matches. The results of classification on a hold-out set were then verified through manual tagging of subsamples. We enriched the page and product data with structured information from the Freebase knowledge graph (which has since been deprecated). This allowed us to easily match giftable products to liked pages, e.g., books to authors. The online part included taking a receiver’s liked pages, inferring likes for similar pages, and matching all these pages to a ranked and diversified list of giftable product recommendations. These recommendations came with explanations, which were quite important in this case because the giver of a gift has to know why they’re giving it.\nThe silver bullet myth\nOptimising a single measure or using a single algorithm is sufficient for generating a good recommendation list\nReality\nHybrids work best Netflix provides another example for how focusing on a single algorithm or measure of success is far from sufficient. In a recent blog post, they talk about how they use multiple algorithms to optimise the order of different recommendation lists and each list’s internal ranking, while considering device-specific UI constraints, relevance, engagement, diversity, business requirements, and more.\nAn example from my experience comes from Giveable (which ended up evolving into Hynt), where a single list was generated by mixing the outputs of the following recommendation approaches: contextual, direct likes, inferred likes, content-based, social, collaborative filtering of products, previously viewed items, and popular interests/products. 
The weight of each algorithm in the mix was static – it was either set manually or through A/B testing, and then left as a hardcoded constant.\nThis kind of static mix can get you very far, but there’s a better way that I haven’t gotten around to implementing before leaving to work on other things. This way is described in a series of posts on bandits for recommenders by Sergey Feldman of RichRelevance. The general idea is to train recommendation models offline using a small number of strategies/paradigms. Online, recommendations are served from strategies that maximise clickthrough and revenue, given a context of features that describe the user, merchant, and web page where the RichRelevance widget is embedded. Rather than setting static weights for the strategies, the bandit model continuously adjusts the weights, while balancing between exploring new strategy weights and exploiting strategies that have been known to work well in a specific context. This allows the overall recommendation engine to adjust to changes in reality and in the underlying data.\nThe omnipresence myth\nEvery personalised system is a recommender system\nReality\nThis one is kinda true, but not necessarily useful... The first conference I attended as a PhD student was the 18th International Conference on User Modeling, Adaptation and Personalization (UMAP), back in 2010. The field of recommender systems was getting increased attention, and Peter Brusilovsky, who has been working in the UMAP field for decades, argued that recommender systems are the new expert systems. This was partly because the hype was causing people to broaden the definition of the field to allow them to say that they’re working on recommender systems.\nI don’t think it’s incorrect to say that every personalised system is a recommender system. However, one problem that this may cause is making people think that common recommendation techniques would apply in scenarios where they’re unlikely to work. 
For example, web search can be seen as a recommender system for pages that gives a high weight to the user’s intent, as captured by the query. Hence, when personalising web search, it seems sensible to use collaborative filtering techniques. This was indeed my experience with the Yandex search personalisation competition: employing a matrix factorisation approach that was inspired by collaborative filtering turned out to be a waste of time compared to domain-specific methods.\nIn conclusion, recommenders are about as murky as data science. Just like data science, the boundaries of recommender systems are hard to define and they are sometimes over-hyped. This hype may lead to people investing in a recommender system they don’t really need, just like the common issue of premature investment in data science. However, the hype is based on real value, which can definitely be delivered by recommender systems when they are used correctly.\n","wordCount":"3577","inLanguage":"en","image":"https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg","datePublished":"2015-10-02T05:25:57Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir 
Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The wonderful world of recommender systems</h1><div class=post-meta><span title='2015-10-02 05:25:57 +0000 UTC'>October 2, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-10-02-the-wonderful-world-of-recommender-systems/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe_hu04af572edec61288f5f08ad15b2b373c_1725198_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg 4961w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/recommender-universe.jpg alt width=4961 height=2468></figure><div class=post-content><p>I recently gave a talk about recommender systems at the <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney meetup</a> (the slides are available <a href=http://yanirs.github.io/talks/the-wonderful-world-of-recommender-systems target=_blank rel=noopener>here</a>). This post roughly follows the outline of the talk, expanding on some of the key points in non-slide form (i.e., complete sentences and paragraphs!). 
The first few sections give a broad overview of the field and the common recommendation paradigms, while the final part is dedicated to debunking five common myths about recommender systems.</p><h3 id=motivation-why-should-we-care-about-recommender-systems>Motivation: Why should we care about recommender systems?<a hidden class=anchor aria-hidden=true href=#motivation-why-should-we-care-about-recommender-systems>#</a></h3><p>The key reason why many people seem to care about recommender systems is <em>money</em>. For companies such as Amazon, Netflix, and Spotify, recommender systems drive significant engagement and revenue. But this is the more cynical view of things. The reason these companies (and others) see increased revenue is because they deliver actual <em>value</em> to their customers – recommender systems provide a scalable way of personalising content for users in scenarios with many items.</p><p>Another reason why data scientists specifically should care about recommender systems is that it is a true data science problem. That is, at least according to <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>my favourite definition of data science</a> as the intersection between software engineering, machine learning, and statistics. As we will see, building successful recommender systems requires all of these skills (and more).</p><h3 id=defining-recommender-systems>Defining recommender systems<a hidden class=anchor aria-hidden=true href=#defining-recommender-systems>#</a></h3><p>When trying to define anything, a reasonable first step is to ask Wikipedia. 
Unfortunately, as of the day of this post&rsquo;s publication, <a href=http://en.wikipedia.org/wiki/Recommender_system target=_blank rel=noopener>Wikipedia defines recommender systems too narrowly</a>, as &ldquo;a subclass of information filtering system that seek to predict the ‘rating&rsquo; or ‘preference&rsquo; that a user would give to an item&rdquo; (I should probably fix it, but this wrong definition helped my talk flow better – let me know if you fix it and I&rsquo;ll update this paragraph).</p><p>The problem with Wikipedia&rsquo;s definition is that there&rsquo;s so much more to recommender systems than rating prediction. First, <em>recommender</em> is a misnomer – calling it a discovery assistant is better, as the so-called recommendations are far from binding. Second, <em>system</em> means that elements like presentation are important, which is part of what makes recommendation such an interesting data science problem.</p><p>My definition is simply:</p><p class=highlight-box><i>Recommender systems are systems that help users discover items they may like.</i></p><h3 id=recommendation-paradigms>Recommendation paradigms<a hidden class=anchor aria-hidden=true href=#recommendation-paradigms>#</a></h3><p>Depending on who you ask, there are between two and twenty different recommendation paradigms. The usual classification is by the type of data that is used to generate recommendations. The distinction between approaches is more academic than practical, as it is often a good idea to use hybrids/ensembles to address each method&rsquo;s limitations. Nonetheless, it is worthwhile discussing the different paradigms. 
The way I see it, if you ignore trivial approaches that often work surprisingly well (e.g., popular items, and &ldquo;watch it again&rdquo;), there are four main paradigms: collaborative filtering, content-based, social/demographic, and contextual recommendation.</p><p><strong>Collaborative filtering</strong> is perhaps the most famous approach to recommendation, to the point that it is sometimes seen as synonymous with the field. The main idea is that you&rsquo;re given a matrix of preferences by users for items, and these are used to predict missing preferences and recommend items with high predictions. One of the key advantages of this approach is that there has been a huge amount of research into collaborative filtering, making it pretty well-understood, with existing libraries that make implementation fairly straightforward. Another important advantage is that collaborative filtering is independent of item properties. All you need to get started is user and item IDs, and some notion of preference by users for items (ratings, views, etc.).</p><p>The major limitation of collaborative filtering is its reliance on preferences. In a cold-start scenario, where there are no preferences at all, it can&rsquo;t generate any recommendations. However, cold starts can also occur when there are millions of available preferences, because pure collaborative recommendation doesn&rsquo;t work for items or users with no ratings, and <a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>often performs pretty poorly when there are only a few ratings</a>. Further, the underlying collaborative model may yield disappointing results when the preference matrix is sparse. In fact, this has been my experience in <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>nearly every situation where I deployed collaborative filtering</a>. 
It always requires tweaking, and never simply works out of the box.</p><p><strong>Content-based</strong> algorithms are given user preferences for items, and recommend similar items based on a domain-specific notion of item content. The main advantage of content-based recommendation over collaborative filtering is that it doesn&rsquo;t require as much user feedback to get going. Even one known user preference can yield many good recommendations (which can lead to the collection of preferences to enable collaborative recommendation). In many scenarios, content-based recommendation is the most natural approach. For example, when recommending news articles or blog posts, it&rsquo;s natural to compare the textual content of the items. This approach also extends naturally to cases where item metadata is available (e.g., movie stars, book authors, and music genres).</p><p>One problem with deploying content-based recommendations arises when item similarity is not so easily defined. However, even when it is natural to measure similarity, content-based recommendations may end up being too homogeneous to be useful. Such recommendations may also be too static over time, thereby failing to adjust to changes in individual user tastes and other shifts in the underlying data.</p><p><strong>Social and demographic</strong> recommenders suggest items that are liked by friends, friends of friends, and demographically-similar people. Such recommenders don&rsquo;t need any preferences by the user to whom recommendations are made, making them very powerful. In my experience, even trivially-implemented approaches can be depressingly accurate. For example, just summing the number of Facebook likes by a person&rsquo;s close friends can often be enough to paint a pretty accurate picture of what that person likes.</p><p>Given this power of social and demographic recommenders, it isn&rsquo;t surprising that social networks don&rsquo;t easily give their data away. 
This means that for many practitioners, employing social/demographic recommendation algorithms is simply impossible. However, even when such data is available, it is not always easy to use without creeping users out. Further, privacy concerns need to be carefully addressed to ensure that users are comfortable with using the system.</p><p><strong>Contextual</strong> recommendation algorithms recommend items that match the user&rsquo;s current context. This allows them to be more flexible and adaptive to current user needs than methods that ignore context (essentially giving the same weight to all of the user&rsquo;s history). Hence, contextual algorithms are more likely to elicit a response than approaches that are based only on historical data.</p><p>The key limitations of contextual recommenders are similar to those of social and demographic recommenders – contextual data may not always be available, and there&rsquo;s a risk of creeping out the user. For example, <a href=https://en.wikipedia.org/wiki/Behavioral_retargeting target=_blank rel=noopener>ad retargeting</a> can be seen as a form of contextual recommendation that follows users around the web and across devices, without having the explicit consent of the users to being tracked in this manner.</p><h3 id=five-common-myths-about-recommender-systems>Five common myths about recommender systems<a hidden class=anchor aria-hidden=true href=#five-common-myths-about-recommender-systems>#</a></h3><p>There are some common myths and misconceptions surrounding recommender systems. I&rsquo;ve picked five to address in this post. 
If you disagree, agree, or have more to add, I would love to hear from you either <a href=https://yanirseroussi.com/about/>privately</a> or in the comment section.</p><p class=highlight-box><b>The accuracy myth</b><br>Offline optimisation of an accuracy measure is sufficient for creating a successful recommender<br><b>Reality</b><br>Users don't really care about accuracy</p><p>This is perhaps the most prevalent myth of all, as evidenced by Wikipedia&rsquo;s definition of recommender systems. It&rsquo;s somewhat surprising that it still persists, as it&rsquo;s been almost ten years since <a href="http://dl.acm.org/citation.cfm?id=1125659" target=_blank rel=noopener>McNee et al.&rsquo;s influential paper on the damage the focus on accuracy measures has done to the field</a>.</p><p>It is therefore worth asking where this myth came from. My theory is that it is a feedback loop between academia and industry. In academia it is pretty easy to publish papers with infinitesimal improvements to arbitrary accuracy measures on offline datasets (<a href=https://dl.dropboxusercontent.com/u/25632965/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>I&rsquo;m also guilty of doing just that</a>), while it&rsquo;s relatively hard to run experiments on live systems. However, one of the moves that significantly increased focus on offline predictive accuracy came from industry, in the form of the <a href=https://en.wikipedia.org/wiki/Netflix_Prize target=_blank rel=noopener>$1M Netflix prize</a>, where the goal was to improve the accuracy of Netflix&rsquo;s rating prediction algorithm by 10%.</p><p>Notably, most of the algorithms that came out of the three-year competition were never integrated into Netflix. 
As <a href=http://techblog.netflix.com/2012/04/netflix-recommendations-beyond-5-stars.html target=_blank rel=noopener>discussed on the Netflix blog</a>:</p><blockquote><p>You might be wondering what happened with the final Grand Prize ensemble that won the $1M two years later&mldr; We evaluated some of the new methods offline but the additional accuracy gains that we measured did not seem to justify the engineering effort needed to bring them into a production environment.</p><p>Our business objective is to maximize member satisfaction and month-to-month subscription retention&mldr; Now it is clear that the Netflix Prize objective, accurate prediction of a movie&rsquo;s rating, is just one of the many components of an effective recommendation system that optimizes our members&rsquo; enjoyment.</p></blockquote><p>The following chart says it all (taken from <a href=http://techblog.netflix.com/2012/06/netflix-recommendations-beyond-5-stars.html target=_blank rel=noopener>the second part of the blog post quoted above</a>):</p><figure><a href=netflix-rating-prediction-contribution.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/netflix-rating-prediction-contribution_hu4d59641bac20d42b471a6b4658d17e86_9091_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/netflix-rating-prediction-contribution_hu4d59641bac20d42b471a6b4658d17e86_9091_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/netflix-rating-prediction-contribution_hu4d59641bac20d42b471a6b4658d17e86_9091_720x0_resize_box_3.png 720w,
diff --git a/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html b/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html
index ec367a89b..b8c66f527 100644
--- a/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html
+++ b/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data business,data science,health,machine learning,nutrition,nutritionism,predictive modelling"><meta name=description content="Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling"><meta property="og:description" content="Nutritionism 
is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"><meta property="og:image" content="https://yanirseroussi.com/health-star-dish.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-10-19T00:02:32+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/health-star-dish.jpg"><meta name=twitter:title content="Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling"><meta name=twitter:description content="Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling","item":"https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling","name":"Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling","description":"Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work.","keywords":["data 
business","data science","health","machine learning","nutrition","nutritionism","predictive modelling"],"articleBody":"I recently finished reading the book In Defense of Food: An Eater’s Manifesto by Michael Pollan. The book criticises nutritionism – the idea that one should eat according to the sum of measured nutrients while ignoring the food that contains these nutrients. The key argument of the book is that since the knowledge derived using food science is still very limited, completely relying on the partial findings and tools provided by this science is likely to lead to health issues. Instead, the author says we should “Eat food. Not too much. Mostly plants.” One of the reasons I found the book interesting is that nutritionism is a special case of misinterpretation and miscommunication of scientific results. This is something many data scientists encounter in their everyday work – finding the balance between simple and complex models, the need to “sell” models and their results to non-technical stakeholders, and the requirement for well-performing models. This post explores these issues through the example of predicting human health based on diet.\nAs an aside, I generally agree with the book’s message, which is backed by fairly thorough research (though it is a bit dated, as the book was released in 2008). There are many commercial interests invested in persuading us to eat things that may be edible, but shouldn’t really be considered food. These food-like products tend to rely on health claims that dumb down the science. A common example can be found in various fat-free products, where healthy fat is replaced with unhealthy amounts of sugar to compensate for the loss of flavour. These products are then marketed as healthy due to their lack of fat. 
The book is full of such examples, and is definitely worth reading, especially if you live in the US or in a country that’s heavily influenced by American food culture.\nRunning example: Predicting a person’s health based on their diet Predicting health based on diet isn’t an easy problem. First, how do you quantify and measure health? You could use proxies like longevity and occurrence/duration of disease, but these are imperfect measures because you can have a long unhealthy life (thanks to modern medicine) and some diseases are more unbearable than others. Another issue is that there are many factors other than diet that contribute to health, such as genetics, age, lifestyle, access to healthcare, etc. Finally, even if you could reliably study the effect of diet in isolation from other factors, there’s the question of measuring the diet. Do you measure each nutrient separately or do you look at foods and consumption patterns? Do you group foods by time (e.g., looking at overall daily or monthly patterns)? If you just looked at the raw data of foods and nutrients consumed at certain points in time, every studied subject is likely to be an outlier (due to the curse of dimensionality). The raw data on foods consumed by individuals has to be grouped in some way to build a generalisable model, but groupings necessitate removal of some data.\nModelling real-world data is rarely straightforward. Many assumptions are embedded in the measurements and models. Good scientific papers are explicit about the shortcomings and limitations of the presented work. However, by the time scientific studies make it to the real world, shortcomings and limitations are removed to present palatable (and often wrong) conclusions to a general audience. This is illustrated nicely by the following comic:\nSelling your model with simple explanations People like simple explanations for complex phenomena. 
If you work as a data scientist, or if you are planning to become/hire one, you’ve probably seen storytelling listed as one of the key skills that data scientists should have. Unlike “real” scientists that work in academia and have to explain their results mostly to peers who can handle technical complexities, data scientists in industry have to deal with non-technical stakeholders who want to understand how the models work. However, these stakeholders rarely have the time or patience to understand how things truly work. What they want is a simple hand-wavy explanation to make them feel as if they understand the matter – they want a story, not a technical report (an aside: don’t feel too smug, there is a lot of knowledge out there and in matters that fall outside of our main interests we are all non-technical stakeholders who get fed simple stories).\nOne of the simplest stories that most people can understand is the story of correlation. Going back to the running example of predicting health based on diet, it is well-known that excessive consumption of certain fats under certain conditions is correlated with an increase in likelihood of certain diseases. This is simplified in some stories to “consuming more fat increases your chance of disease”, which leads to the conclusion that consuming no fat at all decreases the chance of disease to zero. While this may sound ridiculous, it’s the sad reality. According to a recent survey, while the image of fat has improved over the past few years, 42% of Americans still try to limit or avoid all fats.\nA slightly more involved story is that of linear models – looking at the effect of the most important factors, rather than presenting a single factor’s contribution. This storytelling technique is commonly used even with non-linear models, where the most important features are identified using various techniques. The problem is that people still tend to interpret this form of presentation as a simple linear relationship. 
Expanding on the previous example, this approach goes from a single-minded focus on fat to the need to consume less fat and sugar, but more calcium, protein and vitamin D. Unfortunately, even linear models with tens of variables are hard for people to use and follow. In the case of nutrition, few people really track the intake of all the nutrients covered by recommended daily intakes.\nFew interesting relationships are linear Complex phenomena tend to be explained by complex non-linear models. For example, it’s not enough to consume the “right” amount of calcium – you also need vitamin D to absorb it, but popping a few vitamin D pills isn’t going to work well if you don’t consume them with fat, though over-consumption of certain fats is likely to lead to health issues. This list of human-friendly rules can go on and on, but reality is much more complex. It is naive to think that it is possible to predict something as complex as human health with a simple linear model that is based on daily nutrient intake. That being said, some relationships do lend themselves to simple rules of thumb. For example, if you don’t have enough vitamin C, you’re very likely to get scurvy, and people who don’t consume enough vitamin B1 may contract beriberi. However, when it comes to cancers and other diseases that take years to develop, linear models are inadequate.\nAn accurate model to predict human health based on diet would be based on thousands to millions of variables, and would consider many non-linear relationships. It is fairly safe to assume that there is no magic bullet that simply explains how diet affects our health, and no superfood is going to save us from the complexity of our nutritional needs. It is likely that even if we had such a model, it would not be completely accurate. All models are wrong, but some models are useful. For example, the vitamin C versus scurvy model is very useful, but it is often wrong when it comes to predicting overall health. 
Predictions made by useful complex models can be very hard to reason about and explain, but it doesn’t mean we shouldn’t use them.\nThe ongoing quest for sellable complex models All of the above should be pretty obvious to any modern data scientist. The culture of preferring complex models with high predictive accuracy to simplistic models with questionable predictive power is now prevalent (see Leo Breiman’s 2001 paper for a discussion of these two cultures of statistical modelling). This is illustrated by the focus of many Kaggle competitions on producing accurate models and the recent successes of deep learning for computer vision. Especially with deep learning for vision, no one expects a handful of variables (pixels) to be predictive, so traditional explanations of variable importance are useless. This does lead to a general suspicion of such models, as they are too complex for us to reason about or fully explain. However, it is very hard to argue with the empirical success of accurate modelling techniques.\nNonetheless, many data scientists still work in environments that require simple explanations. This may lead some data scientists to settle for simple models that are easier to sell. In my opinion, it is better to make up a simple explanation for an accurate complex model than settle for a simple model that doesn’t really work. That being said, some situations do call for simple or inflexible models due to a lack of data or the need to enforce strong prior assumptions. In Albert Einstein’s words, “it can scarcely be denied that the supreme goal of all theory is to make the irreducible basic elements as simple and as few as possible without having to surrender the adequate representation of a single datum of experience”. 
Make things as simple as possible, but not simpler, and always consider the interests of people who try to sell you simplistic (or unnecessarily complex) explanations.\n","wordCount":"1569","inLanguage":"en","image":"https://yanirseroussi.com/health-star-dish.jpg","datePublished":"2015-10-19T00:02:32Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line 
x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling</h1><div class=post-meta><span title='2015-10-19 00:02:32 +0000 UTC'>October 19, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-10-19-nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg 1275w" 
sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg alt width=1275 height=553></figure><div class=post-content><p>I recently finished reading the book <a href=http://michaelpollan.com/books/in-defense-of-food/ target=_blank rel=noopener>In Defense of Food: An Eater&rsquo;s Manifesto</a> by Michael Pollan. The book criticises <a href=https://en.wikipedia.org/wiki/Nutritionism target=_blank rel=noopener>nutritionism</a> – the idea that one should eat according to the sum of measured nutrients while ignoring the food that contains these nutrients. The key argument of the book is that since the knowledge derived using food science is still very limited, completely relying on the partial findings and tools provided by this science is likely to lead to health issues. Instead, the author says we should &ldquo;<em>Eat food. Not too much. Mostly plants.</em>&rdquo; One of the reasons I found the book interesting is that nutritionism is a special case of misinterpretation and miscommunication of scientific results. This is something many data scientists encounter in their everyday work – finding the balance between simple and complex models, the need to &ldquo;sell&rdquo; models and their results to non-technical stakeholders, and the requirement for well-performing models. This post explores these issues through the example of predicting human health based on diet.</p><p>As an aside, I generally agree with the book&rsquo;s message, which is backed by fairly thorough research (though it is a bit dated, as the book was released in 2008). There are many commercial interests invested in persuading us to eat things that may be edible, but shouldn&rsquo;t really be considered food. These food-like products tend to rely on health claims that dumb down the science. 
A common example can be found in various fat-free products, where healthy fat is replaced with unhealthy amounts of sugar to compensate for the loss of flavour. These products are then marketed as healthy due to their lack of fat. The book is full of such examples, and is definitely worth reading, especially if you live in the US or in a country that&rsquo;s heavily influenced by American food culture.</p><h3 id=running-example-predicting-a-persons-health-based-on-their-diet>Running example: Predicting a person&rsquo;s health based on their diet<a hidden class=anchor aria-hidden=true href=#running-example-predicting-a-persons-health-based-on-their-diet>#</a></h3><p>Predicting health based on diet isn&rsquo;t an easy problem. First, how do you quantify and measure health? You could use proxies like longevity and occurrence/duration of disease, but these are imperfect measures because you can have a long unhealthy life (thanks to modern medicine) and some diseases are more unbearable than others. Another issue is that there are many factors other than diet that contribute to health, such as genetics, age, lifestyle, access to healthcare, etc. Finally, even if you could reliably study the effect of diet in isolation from other factors, there&rsquo;s the question of measuring the diet. Do you measure each nutrient separately or do you look at foods and consumption patterns? Do you group foods by time (e.g., looking at overall daily or monthly patterns)? If you just looked at the raw data of foods and nutrients consumed at certain points in time, every studied subject is likely to be an outlier (due to the <a href=https://en.wikipedia.org/wiki/Curse_of_dimensionality target=_blank rel=noopener>curse of dimensionality</a>). The raw data on foods consumed by individuals has to be grouped in some way to build a generalisable model, but groupings necessitate removal of some data.</p><p>Modelling real-world data is rarely straightforward. 
Many assumptions are embedded in the measurements and models. Good scientific papers are explicit about the shortcomings and limitations of the presented work. However, by the time scientific studies make it to the real world, shortcomings and limitations are removed to present palatable (and often wrong) conclusions to a general audience. This is illustrated nicely by the following comic:</p><figure><a href="http://www.phdcomics.com/comics.php?n=1174" target=_blank rel=noopener><img sizes="(min-width: 768px) 600px,
+<meta name=keywords content="data business,data science,health,machine learning,nutrition,nutritionism,predictive modelling"><meta name=description content="Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling"><meta property="og:description" content="Nutritionism 
is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"><meta property="og:image" content="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-10-19T00:02:32+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg"><meta name=twitter:title content="Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling"><meta name=twitter:description content="Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling","item":"https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling","name":"Miscommunicating science: Simplistic models, nutritionism, and the art of 
storytelling","description":"Nutritionism is a special case of misinterpretation and miscommunication of scientific results – something many data scientists encounter in their work.","keywords":["data business","data science","health","machine learning","nutrition","nutritionism","predictive modelling"],"articleBody":"I recently finished reading the book In Defense of Food: An Eater’s Manifesto by Michael Pollan. The book criticises nutritionism – the idea that one should eat according to the sum of measured nutrients while ignoring the food that contains these nutrients. The key argument of the book is that since the knowledge derived using food science is still very limited, completely relying on the partial findings and tools provided by this science is likely to lead to health issues. Instead, the author says we should “Eat food. Not too much. Mostly plants.” One of the reasons I found the book interesting is that nutritionism is a special case of misinterpretation and miscommunication of scientific results. This is something many data scientists encounter in their everyday work – finding the balance between simple and complex models, the need to “sell” models and their results to non-technical stakeholders, and the requirement for well-performing models. This post explores these issues through the example of predicting human health based on diet.\nAs an aside, I generally agree with the book’s message, which is backed by fairly thorough research (though it is a bit dated, as the book was released in 2008). There are many commercial interests invested in persuading us to eat things that may be edible, but shouldn’t really be considered food. These food-like products tend to rely on health claims that dumb down the science. A common example can be found in various fat-free products, where healthy fat is replaced with unhealthy amounts of sugar to compensate for the loss of flavour. These products are then marketed as healthy due to their lack of fat. 
The book is full of such examples, and is definitely worth reading, especially if you live in the US or in a country that’s heavily influenced by American food culture.\nRunning example: Predicting a person’s health based on their diet Predicting health based on diet isn’t an easy problem. First, how do you quantify and measure health? You could use proxies like longevity and occurrence/duration of disease, but these are imperfect measures because you can have a long unhealthy life (thanks to modern medicine) and some diseases are more unbearable than others. Another issue is that there are many factors other than diet that contribute to health, such as genetics, age, lifestyle, access to healthcare, etc. Finally, even if you could reliably study the effect of diet in isolation from other factors, there’s the question of measuring the diet. Do you measure each nutrient separately or do you look at foods and consumption patterns? Do you group foods by time (e.g., looking at overall daily or monthly patterns)? If you just looked at the raw data of foods and nutrients consumed at certain points in time, every studied subject is likely to be an outlier (due to the curse of dimensionality). The raw data on foods consumed by individuals has to be grouped in some way to build a generalisable model, but groupings necessitate removal of some data.\nModelling real-world data is rarely straightforward. Many assumptions are embedded in the measurements and models. Good scientific papers are explicit about the shortcomings and limitations of the presented work. However, by the time scientific studies make it to the real world, shortcomings and limitations are removed to present palatable (and often wrong) conclusions to a general audience. This is illustrated nicely by the following comic:\nSelling your model with simple explanations People like simple explanations for complex phenomena. 
If you work as a data scientist, or if you are planning to become/hire one, you’ve probably seen storytelling listed as one of the key skills that data scientists should have. Unlike “real” scientists that work in academia and have to explain their results mostly to peers who can handle technical complexities, data scientists in industry have to deal with non-technical stakeholders who want to understand how the models work. However, these stakeholders rarely have the time or patience to understand how things truly work. What they want is a simple hand-wavy explanation to make them feel as if they understand the matter – they want a story, not a technical report (an aside: don’t feel too smug, there is a lot of knowledge out there and in matters that fall outside of our main interests we are all non-technical stakeholders who get fed simple stories).\nOne of the simplest stories that most people can understand is the story of correlation. Going back to the running example of predicting health based on diet, it is well-known that excessive consumption of certain fats under certain conditions is correlated with an increase in likelihood of certain diseases. This is simplified in some stories to “consuming more fat increases your chance of disease”, which leads to the conclusion that consuming no fat at all decreases the chance of disease to zero. While this may sound ridiculous, it’s the sad reality. According to a recent survey, while the image of fat has improved over the past few years, 42% of Americans still try to limit or avoid all fats.\nA slightly more involved story is that of linear models – looking at the effect of the most important factors, rather than presenting a single factor’s contribution. This storytelling technique is commonly used even with non-linear models, where the most important features are identified using various techniques. The problem is that people still tend to interpret this form of presentation as a simple linear relationship. 
Expanding on the previous example, this approach goes from a single-minded focus on fat to the need to consume less fat and sugar, but more calcium, protein and vitamin D. Unfortunately, even linear models with tens of variables are hard for people to use and follow. In the case of nutrition, few people really track the intake of all the nutrients covered by recommended daily intakes.\nFew interesting relationships are linear Complex phenomena tend to be explained by complex non-linear models. For example, it’s not enough to consume the “right” amount of calcium – you also need vitamin D to absorb it, but popping a few vitamin D pills isn’t going to work well if you don’t consume them with fat, though over-consumption of certain fats is likely to lead to health issues. This list of human-friendly rules can go on and on, but reality is much more complex. It is naive to think that it is possible to predict something as complex as human health with a simple linear model that is based on daily nutrient intake. That being said, some relationships do lend themselves to simple rules of thumb. For example, if you don’t have enough vitamin C, you’re very likely to get scurvy, and people who don’t consume enough vitamin B1 may contract beriberi. However, when it comes to cancers and other diseases that take years to develop, linear models are inadequate.\nAn accurate model to predict human health based on diet would be based on thousands to millions of variables, and would consider many non-linear relationships. It is fairly safe to assume that there is no magic bullet that simply explains how diet affects our health, and no superfood is going to save us from the complexity of our nutritional needs. It is likely that even if we had such a model, it would not be completely accurate. All models are wrong, but some models are useful. For example, the vitamin C versus scurvy model is very useful, but it is often wrong when it comes to predicting overall health. 
Predictions made by useful complex models can be very hard to reason about and explain, but it doesn’t mean we shouldn’t use them.\nThe ongoing quest for sellable complex models All of the above should be pretty obvious to any modern data scientist. The culture of preferring complex models with high predictive accuracy to simplistic models with questionable predictive power is now prevalent (see Leo Breiman’s 2001 paper for a discussion of these two cultures of statistical modelling). This is illustrated by the focus of many Kaggle competitions on producing accurate models and the recent successes of deep learning for computer vision. Especially with deep learning for vision, no one expects a handful of variables (pixels) to be predictive, so traditional explanations of variable importance are useless. This does lead to a general suspicion of such models, as they are too complex for us to reason about or fully explain. However, it is very hard to argue with the empirical success of accurate modelling techniques.\nNonetheless, many data scientists still work in environments that require simple explanations. This may lead some data scientists to settle for simple models that are easier to sell. In my opinion, it is better to make up a simple explanation for an accurate complex model than settle for a simple model that doesn’t really work. That being said, some situations do call for simple or inflexible models due to a lack of data or the need to enforce strong prior assumptions. In Albert Einstein’s words, “it can scarcely be denied that the supreme goal of all theory is to make the irreducible basic elements as simple and as few as possible without having to surrender the adequate representation of a single datum of experience”. 
Make things as simple as possible, but not simpler, and always consider the interests of people who try to sell you simplistic (or unnecessarily complex) explanations.\n","wordCount":"1569","inLanguage":"en","image":"https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg","datePublished":"2015-10-19T00:02:32Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line 
x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling</h1><div class=post-meta><span title='2015-10-19 00:02:32 +0000 UTC'>October 19, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-10-19-nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish_huc4da638d7ca6a4b8f50402f897bdb27b_209676_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg 1275w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/health-star-dish.jpg alt width=1275 height=553></figure><div class=post-content><p>I recently finished reading the book <a href=http://michaelpollan.com/books/in-defense-of-food/ target=_blank rel=noopener>In Defense of Food: An Eater&rsquo;s Manifesto</a> by Michael Pollan. The book criticises <a href=https://en.wikipedia.org/wiki/Nutritionism target=_blank rel=noopener>nutritionism</a> – the idea that one should eat according to the sum of measured nutrients while ignoring the food that contains these nutrients. The key argument of the book is that since the knowledge derived using food science is still very limited, completely relying on the partial findings and tools provided by this science is likely to lead to health issues. Instead, the author says we should &ldquo;<em>Eat food. Not too much. Mostly plants.</em>&rdquo; One of the reasons I found the book interesting is that nutritionism is a special case of misinterpretation and miscommunication of scientific results. This is something many data scientists encounter in their everyday work – finding the balance between simple and complex models, the need to &ldquo;sell&rdquo; models and their results to non-technical stakeholders, and the requirement for well-performing models. This post explores these issues through the example of predicting human health based on diet.</p><p>As an aside, I generally agree with the book&rsquo;s message, which is backed by fairly thorough research (though it is a bit dated, as the book was released in 2008). There are many commercial interests invested in persuading us to eat things that may be edible, but shouldn&rsquo;t really be considered food. 
These food-like products tend to rely on health claims that dumb down the science. A common example can be found in various fat-free products, where healthy fat is replaced with unhealthy amounts of sugar to compensate for the loss of flavour. These products are then marketed as healthy due to their lack of fat. The book is full of such examples, and is definitely worth reading, especially if you live in the US or in a country that&rsquo;s heavily influenced by American food culture.</p><h3 id=running-example-predicting-a-persons-health-based-on-their-diet>Running example: Predicting a person&rsquo;s health based on their diet<a hidden class=anchor aria-hidden=true href=#running-example-predicting-a-persons-health-based-on-their-diet>#</a></h3><p>Predicting health based on diet isn&rsquo;t an easy problem. First, how do you quantify and measure health? You could use proxies like longevity and occurrence/duration of disease, but these are imperfect measures because you can have a long unhealthy life (thanks to modern medicine) and some diseases are more unbearable than others. Another issue is that there are many factors other than diet that contribute to health, such as genetics, age, lifestyle, access to healthcare, etc. Finally, even if you could reliably study the effect of diet in isolation from other factors, there&rsquo;s the question of measuring the diet. Do you measure each nutrient separately or do you look at foods and consumption patterns? Do you group foods by time (e.g., looking at overall daily or monthly patterns)? If you just looked at the raw data of foods and nutrients consumed at certain points in time, every studied subject is likely to be an outlier (due to the <a href=https://en.wikipedia.org/wiki/Curse_of_dimensionality target=_blank rel=noopener>curse of dimensionality</a>). 
The raw data on foods consumed by individuals has to be grouped in some way to build a generalisable model, but groupings necessitate removal of some data.</p><p>Modelling real-world data is rarely straightforward. Many assumptions are embedded in the measurements and models. Good scientific papers are explicit about the shortcomings and limitations of the presented work. However, by the time scientific studies make it to the real world, shortcomings and limitations are removed to present palatable (and often wrong) conclusions to a general audience. This is illustrated nicely by the following comic:</p><figure><a href="http://www.phdcomics.com/comics.php?n=1174" target=_blank rel=noopener><img sizes="(min-width: 768px) 600px,
 100vw" srcset="https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/phd-comics-science-news-cycle_hub589cb87d006926e3ce7b389284e329c_105126_360x0_resize_box_1.gif 360w,
 https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/phd-comics-science-news-cycle_hub589cb87d006926e3ce7b389284e329c_105126_480x0_resize_box_1.gif 480w,
 https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/phd-comics-science-news-cycle.gif 600w," src=https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/phd-comics-science-news-cycle.gif alt="PHD Comics: Science News Cycle" loading=lazy></a></figure><h3 id=selling-your-model-with-simple-explanations>Selling your model with simple explanations<a hidden class=anchor aria-hidden=true href=#selling-your-model-with-simple-explanations>#</a></h3><p>People like simple explanations for complex phenomena. If you work as a data scientist, or if you are planning to become/hire one, you&rsquo;ve probably seen <strong>storytelling</strong> listed as one of the key skills that data scientists should have. Unlike &ldquo;real&rdquo; scientists that work in academia and have to explain their results mostly to peers who can handle technical complexities, data scientists in industry have to deal with non-technical stakeholders who want to understand how the models work. However, these stakeholders rarely have the time or patience to understand how things truly work. What they want is a simple hand-wavy explanation to make them <em>feel</em> as if they understand the matter – they want a <em>story</em>, not a technical report (an aside: don&rsquo;t feel too smug, there is a lot of knowledge out there and in matters that fall outside of our main interests we are all non-technical stakeholders who get fed simple stories).</p><p>One of the simplest stories that most people can understand is the story of <strong>correlation</strong>. Going back to the running example of predicting health based on diet, it is well-known that excessive consumption of certain fats under certain conditions is correlated with an increase in likelihood of certain diseases. 
This is simplified in some stories to &ldquo;consuming more fat increases your chance of disease&rdquo;, which leads to the conclusion that consuming no fat at all decreases the chance of disease to zero. While this may sound ridiculous, it&rsquo;s the sad reality. According to <a href=http://www.foodinsight.org/2015-food-health-survey-consumer-research target=_blank rel=noopener>a recent survey</a>, while the image of fat has improved over the past few years, 42% of Americans still try to limit or avoid all fats.</p><p>A slightly more involved story is that of <strong>linear models</strong> – looking at the effect of the most important factors, rather than presenting a single factor&rsquo;s contribution. This storytelling technique is commonly used even with non-linear models, where the most important features are identified using various techniques. The problem is that people still tend to interpret this form of presentation as a simple linear relationship. Expanding on the previous example, this approach goes from a single-minded focus on fat to the need to consume less fat and sugar, but more calcium, protein and vitamin D. Unfortunately, even linear models with tens of variables are hard for people to use and follow. In the case of nutrition, few people really track the intake of all the nutrients covered by recommended daily intakes.</p><h3 id=few-interesting-relationships-are-linear>Few interesting relationships are linear<a hidden class=anchor aria-hidden=true href=#few-interesting-relationships-are-linear>#</a></h3><p>Complex phenomena tend to be explained by complex non-linear models. 
For example, it&rsquo;s not enough to consume the &ldquo;right&rdquo; amount of calcium – <a href=https://en.wikipedia.org/wiki/Calcium#Nutrition target=_blank rel=noopener>you also need vitamin D to absorb it</a>, but <a href=http://www.medicaldaily.com/vitamin-d-benefits-are-enhanced-if-meal-contains-fat-absorbing-more-supplements-311248 target=_blank rel=noopener>popping a few vitamin D pills isn&rsquo;t going to work well if you don&rsquo;t consume them with fat</a>, though <a href=https://en.wikipedia.org/wiki/Trans_fat#Health_risks target=_blank rel=noopener>over-consumption of certain fats is likely to lead to health issues</a>. This list of human-friendly rules can go on and on, but reality is much more complex. It is naive to think that it is possible to predict something as complex as human health with a simple linear model that is based on daily nutrient intake. That being said, some relationships do lend themselves to simple rules of thumb. For example, if you don&rsquo;t have enough vitamin C, you&rsquo;re very likely to get <a href=https://en.wikipedia.org/wiki/Scurvy target=_blank rel=noopener>scurvy</a>, and people who don&rsquo;t consume enough vitamin B1 may contract <a href=https://en.wikipedia.org/wiki/Beriberi target=_blank rel=noopener>beriberi</a>. However, when it comes to cancers and other diseases that take years to develop, linear models are inadequate.</p><p>An accurate model to predict human health based on diet would be based on thousands to millions of variables, and would consider many non-linear relationships. It is fairly safe to assume that there is no magic bullet that simply explains how diet affects our health, and no <a href=https://en.wikipedia.org/wiki/Superfood target=_blank rel=noopener>superfood</a> is going to save us from the complexity of our nutritional needs. It is likely that even if we had such a model, it would not be completely accurate. All models are wrong, but some models are useful. 
For example, the vitamin C versus scurvy model is very useful, but it is often wrong when it comes to predicting overall health. Predictions made by useful complex models can be very hard to reason about and explain, but it doesn&rsquo;t mean we shouldn&rsquo;t use them.</p><h3 id=the-ongoing-quest-for-sellable-complex-models>The ongoing quest for sellable complex models<a hidden class=anchor aria-hidden=true href=#the-ongoing-quest-for-sellable-complex-models>#</a></h3><p>All of the above should be pretty obvious to any modern data scientist. The culture of preferring complex models with high predictive accuracy to simplistic models with questionable predictive power is now prevalent (see <a href=http://projecteuclid.org/download/pdf_1/euclid.ss/1009213726 target=_blank rel=noopener>Leo Breiman&rsquo;s 2001 paper for a discussion of these two cultures of statistical modelling</a>). This is illustrated by the focus of many <a href=https://www.kaggle.com/ target=_blank rel=noopener>Kaggle</a> competitions on producing accurate models and the recent successes of <a href=https://en.wikipedia.org/wiki/Deep_learning#Image_recognition target=_blank rel=noopener>deep learning for computer vision</a>. Especially with deep learning for vision, no one expects a handful of variables (pixels) to be predictive, so traditional explanations of variable importance are useless. This does lead to a general suspicion of such models, as they are too complex for us to reason about or fully explain. However, it is very hard to argue with the empirical success of accurate modelling techniques.</p><p>Nonetheless, many data scientists still work in environments that require simple explanations. This may lead some data scientists to settle for simple models that are easier to sell. In my opinion, it is better to make up a simple explanation for an accurate complex model than settle for a simple model that doesn&rsquo;t really work. 
That being said, some situations do call for simple or inflexible models due to a lack of data or the need to enforce strong prior assumptions. In Albert Einstein&rsquo;s words, &ldquo;it can scarcely be denied that the supreme goal of all theory is to make the irreducible basic elements as simple and as few as possible without having to surrender the adequate representation of a single datum of experience&rdquo;. Make things as simple as possible, but not simpler, and always consider the interests of people who try to sell you simplistic (or unnecessarily complex) explanations.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-business/>data business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/health/>health</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/nutrition/>nutrition</a></li><li><a href=https://yanirseroussi.com/tags/nutritionism/>nutritionism</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on x" href="https://x.com/intent/tweet/?text=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f&amp;hashtags=databusiness%2cdatascience%2chealth%2cmachinelearning%2cnutrition%2cnutritionism%2cpredictivemodelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 
62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f&amp;title=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling&amp;summary=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling&amp;source=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on reddit" 
href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f&title=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating 
science: Simplistic models, nutritionism, and the art of storytelling on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on whatsapp" href="https://api.whatsapp.com/send?text=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling%20-%20https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 
65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on telegram" href="https://telegram.me/share/url?text=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling&amp;url=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener 
noreferrer" aria-label="share Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling on ycombinator" href="https://news.ycombinator.com/submitlink?t=Miscommunicating%20science%3a%20Simplistic%20models%2c%20nutritionism%2c%20and%20the%20art%20of%20storytelling&u=https%3a%2f%2fyanirseroussi.com%2f2015%2f10%2f19%2fnutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
diff --git a/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html b/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html
index f7f2f5758..bc3348f3c 100644
--- a/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html
+++ b/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Migrating a simple web application from MongoDB to Elasticsearch | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="BCRecommender,DevOps,Elasticsearch,MongoDB,software engineering"><meta name=description content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other benefits."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Migrating a simple web application from MongoDB to Elasticsearch"><meta property="og:description" content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible 
to offer a richer search experience to users at a similar cost, among other benefits."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"><meta property="og:image" content="https://yanirseroussi.com/mongodb-to-elasticsearch.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-11-04T03:53:18+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/mongodb-to-elasticsearch.png"><meta name=twitter:title content="Migrating a simple web application from MongoDB to Elasticsearch"><meta name=twitter:description content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other benefits."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Migrating a simple web application from MongoDB to Elasticsearch","item":"https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Migrating a simple web application from MongoDB to Elasticsearch","name":"Migrating a simple web application from MongoDB to Elasticsearch","description":"Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other benefits.","keywords":["BCRecommender","DevOps","Elasticsearch","MongoDB","software engineering"],"articleBody":"Bandcamp Recommender (BCRecommender) is a web 
application that serves music recommendations from Bandcamp. I recently switched BCRecommender’s data store from MongoDB to Elasticsearch. This has made it possible to offer a richer search experience to users at a similar cost. This post describes the migration process and discusses some of the advantages and disadvantages of using Elasticsearch instead of MongoDB.\nMotivation: Why swap MongoDB for Elasticsearch? I’ve written a few posts in the past on BCRecommender’s design and implementation. It is a fairly simple application with two main components: the backend worker that crawls data and generates recommendations in batch, and the webapp that serves the recommendations. Importantly, each of these components has its own data store, with the recommendations synced up from the worker to the webapp, and data like events and subscriptions synced down from the webapp to the worker. Recently, I migrated the webapp component from Parse to DigitalOcean, replacing Parse’s data store with MongoDB. Choosing MongoDB was meant to simplify the transition – Parse uses MongoDB behind the scenes, as does the backend worker. However, moving out of Parse’s sandboxed environment freed me to choose any data store, and Elasticsearch seemed like a good candidate that would make it possible to expose advanced search capabilities to end users.\nAdvanced search means different things to different people. In BCRecommender’s case what I had in mind was rather modest, at least for the initial stages. BCRecommender presents recommendations for two types of entities: fans and tralbums (tracks/albums). In both cases, the recommended items are tralbums. When the key is a fan, the recommendations are tralbums that they may like, and when the key is a tralbum, the recommendations are similar tralbums. Each tralbum has a title, an artist name, and a list of tags. Each fan has its Bandcamp username as a primary key, and a list of tags that is derived from the tralbums in the fan’s collection. 
Originally, “searching” required users to either enter the exact username of a Bandcamp fan, or the exact Bandcamp link of a tralbum – not the best user experience! Indeed, I was tracking the search terms and found that many people were unsuccessfully trying to use unstructured queries. My idea of advanced search was to move away from the original key-value approach to full-text search that considers tags, titles, artists, and other fields that may get added later.\nIt was clear that while it may be possible to provide advanced search with MongoDB, it wouldn’t be a smooth ride. While recent versions of MongoDB include support for full-text search, it isn’t as feature-rich as Elasticsearch. For example, MongoDB text indices do not store phrases or information about the proximity of words in the documents, making phrase queries run slowly unless the entire collection fits in memory. The names really say it all: MongoDB is a database with some search capabilities, and Elasticsearch is a search engine with some database capabilities. It seems pretty common to use MongoDB (or another database) as a data store and supply search through Elasticsearch, so I figured it isn’t a bad idea to apply this pattern to BCRecommender.\nIt is worth noting that if BCRecommender were a for-profit project, I would probably use Algolia rather than Elasticsearch. My experience with Algolia on a different project has been excellent – they make it easy for you to get started, have great customer service, and deliver good and fast results with minimal development and operational effort. The two main disadvantages of Algolia are its price and the fact that it’s a closed-source solution (see further discussion on Quora). At over two million records, the monthly cost of running Algolia for BCRecommender would be around US$649, which is more than what I’m willing to spend on this project. 
However, for a business this may be a reasonable cost because deploying and maintaining an Elasticsearch cluster may end up costing more. Nonetheless, many businesses use Elasticsearch successfully, which is why I have no doubt that it’s a great choice for my use case – it just requires more work than Algolia to get up and running.\nExecuting the migration plan The plan for migrating the webapp from MongoDB to Elasticsearch was pretty simple:\nRead the Elasticsearch manual to ensure it suits my needs Replace MongoDB with Elasticsearch without making any user-facing changes Expose full-text search to BCRecommender users Improve search performance based on user behaviour Implement more search features Reading the manual is not something I do for every piece of technology I use (there are just too many tools out there these days), but for Elasticsearch it seemed to be worth the effort. I’m not done reading yet, but covering the material in the Getting Started and Search in Depth sections gave me enough information to complete steps 2 \u0026 3. The main things I was worried about was Elasticsearch’s performance as a database and how memory-hungry it’d be. Reading the manual allowed me to avoid some memory-use pitfalls and gave me insights on the way MongoDB and Elasticsearch compare (see details below).\nSwitching from MongoDB to Elasticsearch as a simple database was pretty straightforward. Both are document-based, so there were no changes required to the data models, but I did use the opportunity to fix some issues. For example, I changed the sitemap generation process from dynamic to static to avoid having to scroll through the entire dataset to fetch deep sitemap pages. To support BCRecommender’s feature of browsing through random fans, I replaced MongoDB’s somewhat-hacky approach of returning random results with Elasticsearch’s cleaner method. 
As the webapp is implemented in Python, I originally used the elasticsearch-dsl package, but found it too hard to debug queries (e.g., figuring out how to rank results randomly was a bit of a nightmare). Instead, I ended up using the elasticsearch-py package, which is only a thin wrapper around the Elasticsearch API. This approach yields code that doesn’t look very Pythonic – rather than following the Zen of Python’s flat is better than nested aphorism, the API follows the more Java-esque belief of you can never have enough nesting (see image below for example). However, I prefer overly-nested structures that I can debug to flat code that doesn’t work. I may try using the DSL again in the future, once I’ve gained more experience with Elasticsearch.\nAs mentioned, one of my worries was that I would have to increase the amount of memory allocated to the machine where Elasticsearch runs. Since BCRecommender is a fairly low-budget project, I’m willing to sacrifice high availability to save a bit on operational costs. Therefore, the webapp and its data store run on the same DigitalOcean instance, which is enough to happily serve the current amount of traffic (around one request per second). By default, Elasticsearch indexes all the fields, and even includes an extra indexed _all field that is a concatenation of all string fields in a document. While indexing everything may be convenient, it wasn’t necessary for the first stage. Choosing the minimal index settings allowed me to keep using the same instance size as before (1GB RAM and 30GB SSD). In fact, due to the switch to static sitemaps and the removal of MongoDB’s random attribute hack, fewer indexes were required after the change.\nOnce I had all the code converted and working on my local Vagrant environment, it was time to deploy. The deployment was fairly straightforward and required no downtime, as I simply provisioned a new instance and switched over the floating IP once it was all tested and ready to go. 
I monitored response time and memory use closely and everything seemed to be working just fine – similarly to MongoDB. After a week of monitoring, it was time to take the next step and enable advanced search.\nEnabling full-text search is where things got interesting. This phase required adding a search result page (previously users were redirected to the queried page if it was found), and reindexing the data. For this phase, I tried to keep things as simple as possible, and just indexed the string fields (tags, artist, and title) using the standard analyser. I did some manual testing of search results based on common queries, and played a bit with improving precision and recall. Perhaps the most important tweak was allowing an item’s activity level to influence the ranking. For each tralbum, the activity level is the number of fans that have the tralbum in their collection, and for each fan, it is the size of the collection. For example, when searching for amanda, the top result is the fan with username amanda, followed by tralbums by the popular Amanda Palmer. Before I added the consideration of activity level, all tralbums and fans that contained the word amanda had the same ranking.\nI deployed full-text search earlier this week, and so far it’s looking pretty good. Elasticsearch seems to be coping well with having the same level of resources allocated as before, but it’s still too early to tell if this is sustainable over time. Most importantly, users are finally seeing results when they enter unstructured queries, which increases their engagement and retention. Woohoo!\nImproving search performance based on user behaviour is expected to be an ongoing effort. Despite having many ideas, I resisted the temptation of endless offline tinkering and opted to release a working search page quickly. With Google Analytics now set up to track site search, the plan is keep identifying gaps and tweak the search settings continuously. 
This will take a while, as the number of daily users is currently 200-300, and they don’t all use site search.\nImplementing more search features is another set of items on my to-do list that will be addressed over time. For example, it’d be great to have search auto-completion and a prettier result page. However, I have more ideas than time to implement them, and I’m not working on BCRecommender full-time. For now, I’m pretty happy with finally having the search function.\nElasticsearch versus MongoDB: Key findings Comparisons between tools should always be taken with a grain of salt. General comparisons may not address features that are important for your specific use case, or may overemphasise aspects that you don’t care about. In addition, actively developed tools are moving targets. Since I started the transition to Elasticsearch, version 2.0 has been released, and MongoDB 3.2 is expected very soon. The following list is derived from my experience and may not apply to you. You have been warned!\nWith the disclaimer out of the way, here are some of the advantages of Elasticsearch over MongoDB:\nBetter full-text search support (duh!). Enforceable schemas and type validation (note: some form of optional schema is expected in MongoDB 3.2). All fields are indexed by default, making it easy to explore unstructured data without worrying about adding indices. It appears that indexing is implemented in a more efficient way that doesn’t block the node. Slowness due to indexing operations seems to be a common issue with MongoDB, even with background index creation. It’s possible to query multiple indices and types (same as MongoDB databases \u0026 collections, respectively) in the same query. This is a huge advantage in my case as it makes it possible to efficiently search both fans and tralbums in a single query. Index aliases make it easy to change the indices without changing the application. Multi-get by IDs returns results in the order they were requested. 
This is not the case with MongoDB, where using $in doesn’t have any guarantees on the returned documents’ order. It’s easy to work around this issue, but it can be the source of subtle bugs. In my case, recommendations were unintentionally sorted in random order until I added an additional step to sort them correctly. Built-in support for random scoring (note: random sampling will finally be available in MondoDB 3.2 – the ticket for this has been open for 5 years). Built-in support for multiple types of analysis on the same field. Some disadvantages of Elasticsearch in comparison to MongoDB are:\nAll fields are indexed by default, making it easy to run into memory issues. Adjusting these default settings is strongly recommended if you know how you’re going to query the data. Documents are immutable, so every update requires deleting the original document and re-inserting it (in practice, it seems like this isn’t much of an issue). Sorting results by a field requires reading all the field’s values and sorting them in memory. The sorted results are cached, but this may cause issues if memory is too limited. In conclusion, my experience with Elasticsearch has been mostly positive so far and I’m glad I’ve made the switch. I’m looking forward to taking further advantage of advanced search features to improve user experience on BCRecommender. New posts on the topic may be published in the future, so please subscribe to be notified when this happens. 
As always, I’m happy to receive feedback through the comments or privately.\n","wordCount":"2165","inLanguage":"en","image":"https://yanirseroussi.com/mongodb-to-elasticsearch.png","datePublished":"2015-11-04T03:53:18Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" 
y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Migrating a simple web application from MongoDB to Elasticsearch</h1><div class=post-meta><span title='2015-11-04 03:53:18 +0000 UTC'>November 4, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-11-04-migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/mongodb-to-elasticsearch.png alt></figure><div class=post-content><p><a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp Recommender (BCRecommender)</a> is a web application that serves music recommendations from <a href=http://bandcamp.com target=_blank rel=noopener>Bandcamp</a>. I recently switched BCRecommender&rsquo;s data store from <a href=https://www.mongodb.com/ target=_blank rel=noopener>MongoDB</a> to <a href=https://www.elastic.co/products/elasticsearch target=_blank rel=noopener>Elasticsearch</a>. This has made it possible to offer a richer search experience to users at a similar cost. 
This post describes the migration process and discusses some of the advantages and disadvantages of using Elasticsearch instead of MongoDB.</p><h2 id=motivation-why-swap-mongodb-for-elasticsearch>Motivation: Why swap MongoDB for Elasticsearch?<a hidden class=anchor aria-hidden=true href=#motivation-why-swap-mongodb-for-elasticsearch>#</a></h2><p>I&rsquo;ve written a few posts in the past on BCRecommender&rsquo;s design and implementation. It is a fairly <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>simple application with two main components</a>: the backend worker that crawls data and generates recommendations in batch, and the webapp that serves the recommendations. Importantly, each of these components has its own data store, with the recommendations synced up from the worker to the webapp, and data like events and subscriptions synced down from the webapp to the worker. Recently, I <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>migrated the webapp component from Parse to DigitalOcean</a>, replacing Parse&rsquo;s data store with MongoDB. Choosing MongoDB was meant to simplify the transition – Parse uses MongoDB behind the scenes, as does the backend worker. However, moving out of Parse&rsquo;s sandboxed environment freed me to choose any data store, and Elasticsearch seemed like a good candidate that would make it possible to expose advanced search capabilities to end users.</p><p>Advanced search means different things to different people. In BCRecommender&rsquo;s case what I had in mind was rather modest, at least for the initial stages. BCRecommender presents recommendations for two types of entities: fans and tralbums (tracks/albums). In both cases, the recommended items are tralbums. 
<a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>When the key is a fan, the recommendations are tralbums that they may like, and when the key is a tralbum, the recommendations are similar tralbums</a>. Each tralbum has a title, an artist name, and a list of tags. Each fan has its Bandcamp username as a primary key, and a list of tags that is derived from the tralbums in the fan&rsquo;s collection. Originally, &ldquo;searching&rdquo; required users to either enter the exact username of a Bandcamp fan, or the exact Bandcamp link of a tralbum – not the best user experience! Indeed, I was tracking the search terms and found that many people were unsuccessfully trying to use unstructured queries. My idea of advanced search was to move away from the original key-value approach to full-text search that considers tags, titles, artists, and other fields that may get added later.</p><p>It was clear that while it may be possible to provide advanced search with MongoDB, it <a href=http://beletsky.net/2014/05/got-tired-of-mongodb-full-text.html target=_blank rel=noopener>wouldn&rsquo;t be a smooth ride</a>. While recent versions of MongoDB include support for full-text search, it isn&rsquo;t as feature-rich as Elasticsearch. For example, <a href=https://docs.mongodb.org/manual/core/index-text/#storage-requirements-and-performance-costs target=_blank rel=noopener>MongoDB text indices do not store phrases or information about the proximity of words in the documents</a>, making phrase queries run slowly unless the entire collection fits in memory. The names really say it all: MongoDB is a database with some search capabilities, and Elasticsearch is a search engine with some database capabilities. 
It <a href=https://www.compose.io/articles/mongoosastic-the-power-of-mongodb-and-elasticsearch-together/ target=_blank rel=noopener>seems pretty common</a> to use MongoDB (or another database) as a data store and supply search through Elasticsearch, so I figured it isn&rsquo;t a bad idea to apply this pattern to BCRecommender.</p><p>It is worth noting that if BCRecommender were a for-profit project, I would probably use <a href=https://www.algolia.com target=_blank rel=noopener>Algolia</a> rather than Elasticsearch. My experience with Algolia on a different project has been excellent – they make it easy for you to get started, have great customer service, and deliver good and fast results with minimal development and operational effort. The two main disadvantages of Algolia are its price and the fact that it&rsquo;s a closed-source solution (<a href=https://www.quora.com/How-does-Elasticsearch-relate-and-or-compare-to-Algolias-Search-as-a-Service target=_blank rel=noopener>see further discussion on Quora</a>). At over two million records, the monthly cost of running Algolia for BCRecommender would be around US$649, which is more than what I&rsquo;m willing to spend on this project. However, for a business this may be a reasonable cost because deploying and maintaining an Elasticsearch cluster may end up costing more. 
Nonetheless, many businesses use Elasticsearch successfully, which is why I have no doubt that it&rsquo;s a great choice for my use case – it just requires more work than Algolia to get up and running.</p><h2 id=executing-the-migration-plan>Executing the migration plan<a hidden class=anchor aria-hidden=true href=#executing-the-migration-plan>#</a></h2><p>The plan for migrating the webapp from MongoDB to Elasticsearch was pretty simple:</p><ol><li>Read the <a href=https://www.elastic.co/guide/en/elasticsearch/guide/current/index.html target=_blank rel=noopener>Elasticsearch manual</a> to ensure it suits my needs</li><li>Replace MongoDB with Elasticsearch without making any user-facing changes</li><li>Expose full-text search to BCRecommender users</li><li>Improve search performance based on user behaviour</li><li>Implement more search features</li></ol><p><strong>Reading the manual</strong> is not something I do for every piece of technology I use (there are just too many tools out there these days), but for Elasticsearch it seemed to be worth the effort. I&rsquo;m not done reading yet, but covering the material in the <em>Getting Started</em> and <em>Search in Depth</em> sections gave me enough information to complete steps 2 & 3. The main things I was worried about was Elasticsearch&rsquo;s performance as a database and how memory-hungry it&rsquo;d be. Reading the manual allowed me to avoid some memory-use pitfalls and gave me insights on the way MongoDB and Elasticsearch compare (<a href=#es-vs-mongo>see details below</a>).</p><p><strong>Switching from MongoDB to Elasticsearch as a simple database</strong> was pretty straightforward. Both are document-based, so there were no changes required to the data models, but I did use the opportunity to fix some issues. For example, I changed the sitemap generation process from dynamic to static to avoid having to scroll through the entire dataset to fetch deep sitemap pages. 
To support BCRecommender&rsquo;s feature of browsing through random fans, I replaced <a href=http://bdadam.com/blog/finding-a-random-document-in-mongodb.html target=_blank rel=noopener>MongoDB&rsquo;s somewhat-hacky approach of returning random results</a> with <a href=https://www.elastic.co/guide/en/elasticsearch/guide/current/random-scoring.html target=_blank rel=noopener>Elasticsearch&rsquo;s cleaner method</a>. As the webapp is implemented in Python, I originally used the <a href=http://elasticsearch-dsl.readthedocs.org/en/latest/ target=_blank rel=noopener>elasticsearch-dsl package</a>, but found it too hard to debug queries (e.g., figuring out how to rank results randomly was a bit of a nightmare). Instead, I ended up using the <a href=http://elasticsearch-py.readthedocs.org/en/latest/ target=_blank rel=noopener>elasticsearch-py package</a>, which is only a thin wrapper around the Elasticsearch API. This approach yields code that doesn&rsquo;t look very Pythonic – rather than following the <a href=https://www.python.org/dev/peps/pep-0020/ target=_blank rel=noopener>Zen of Python&rsquo;s</a> <em>flat is better than nested</em> aphorism, the API follows the more Java-esque belief of <em>you can never have enough nesting</em> (see image below for example). However, I prefer overly-nested structures that I can debug to flat code that doesn&rsquo;t work. I may try using the DSL again in the future, once I&rsquo;ve gained more experience with Elasticsearch.</p><figure><a href=elasticsearch-is-nesty.png target=_blank rel=noopener><img sizes="(min-width: 768px) 528px,
+<meta name=keywords content="BCRecommender,DevOps,Elasticsearch,MongoDB,software engineering"><meta name=description content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other benefits."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Migrating a simple web application from MongoDB to Elasticsearch"><meta property="og:description" content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible 
to offer a richer search experience to users at a similar cost, among other benefits."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"><meta property="og:image" content="https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/mongodb-to-elasticsearch.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-11-04T03:53:18+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/mongodb-to-elasticsearch.png"><meta name=twitter:title content="Migrating a simple web application from MongoDB to Elasticsearch"><meta name=twitter:description content="Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other benefits."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Migrating a simple web application from MongoDB to Elasticsearch","item":"https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Migrating a simple web application from MongoDB to Elasticsearch","name":"Migrating a simple web application from MongoDB to Elasticsearch","description":"Migrating BCRecommender from MongoDB to Elasticsearch made it possible to offer a richer search experience to users at a similar cost, among other 
benefits.","keywords":["BCRecommender","DevOps","Elasticsearch","MongoDB","software engineering"],"articleBody":"Bandcamp Recommender (BCRecommender) is a web application that serves music recommendations from Bandcamp. I recently switched BCRecommender’s data store from MongoDB to Elasticsearch. This has made it possible to offer a richer search experience to users at a similar cost. This post describes the migration process and discusses some of the advantages and disadvantages of using Elasticsearch instead of MongoDB.\nMotivation: Why swap MongoDB for Elasticsearch? I’ve written a few posts in the past on BCRecommender’s design and implementation. It is a fairly simple application with two main components: the backend worker that crawls data and generates recommendations in batch, and the webapp that serves the recommendations. Importantly, each of these components has its own data store, with the recommendations synced up from the worker to the webapp, and data like events and subscriptions synced down from the webapp to the worker. Recently, I migrated the webapp component from Parse to DigitalOcean, replacing Parse’s data store with MongoDB. Choosing MongoDB was meant to simplify the transition – Parse uses MongoDB behind the scenes, as does the backend worker. However, moving out of Parse’s sandboxed environment freed me to choose any data store, and Elasticsearch seemed like a good candidate that would make it possible to expose advanced search capabilities to end users.\nAdvanced search means different things to different people. In BCRecommender’s case what I had in mind was rather modest, at least for the initial stages. BCRecommender presents recommendations for two types of entities: fans and tralbums (tracks/albums). In both cases, the recommended items are tralbums. When the key is a fan, the recommendations are tralbums that they may like, and when the key is a tralbum, the recommendations are similar tralbums. 
Each tralbum has a title, an artist name, and a list of tags. Each fan has its Bandcamp username as a primary key, and a list of tags that is derived from the tralbums in the fan’s collection. Originally, “searching” required users to either enter the exact username of a Bandcamp fan, or the exact Bandcamp link of a tralbum – not the best user experience! Indeed, I was tracking the search terms and found that many people were unsuccessfully trying to use unstructured queries. My idea of advanced search was to move away from the original key-value approach to full-text search that considers tags, titles, artists, and other fields that may get added later.\nIt was clear that while it may be possible to provide advanced search with MongoDB, it wouldn’t be a smooth ride. While recent versions of MongoDB include support for full-text search, it isn’t as feature-rich as Elasticsearch. For example, MongoDB text indices do not store phrases or information about the proximity of words in the documents, making phrase queries run slowly unless the entire collection fits in memory. The names really say it all: MongoDB is a database with some search capabilities, and Elasticsearch is a search engine with some database capabilities. It seems pretty common to use MongoDB (or another database) as a data store and supply search through Elasticsearch, so I figured it isn’t a bad idea to apply this pattern to BCRecommender.\nIt is worth noting that if BCRecommender were a for-profit project, I would probably use Algolia rather than Elasticsearch. My experience with Algolia on a different project has been excellent – they make it easy for you to get started, have great customer service, and deliver good and fast results with minimal development and operational effort. The two main disadvantages of Algolia are its price and the fact that it’s a closed-source solution (see further discussion on Quora). 
At over two million records, the monthly cost of running Algolia for BCRecommender would be around US$649, which is more than what I’m willing to spend on this project. However, for a business this may be a reasonable cost because deploying and maintaining an Elasticsearch cluster may end up costing more. Nonetheless, many businesses use Elasticsearch successfully, which is why I have no doubt that it’s a great choice for my use case – it just requires more work than Algolia to get up and running.\nExecuting the migration plan The plan for migrating the webapp from MongoDB to Elasticsearch was pretty simple:\nRead the Elasticsearch manual to ensure it suits my needs Replace MongoDB with Elasticsearch without making any user-facing changes Expose full-text search to BCRecommender users Improve search performance based on user behaviour Implement more search features Reading the manual is not something I do for every piece of technology I use (there are just too many tools out there these days), but for Elasticsearch it seemed to be worth the effort. I’m not done reading yet, but covering the material in the Getting Started and Search in Depth sections gave me enough information to complete steps 2 \u0026 3. The main things I was worried about were Elasticsearch’s performance as a database and how memory-hungry it’d be. Reading the manual allowed me to avoid some memory-use pitfalls and gave me insights on the way MongoDB and Elasticsearch compare (see details below).\nSwitching from MongoDB to Elasticsearch as a simple database was pretty straightforward. Both are document-based, so there were no changes required to the data models, but I did use the opportunity to fix some issues. For example, I changed the sitemap generation process from dynamic to static to avoid having to scroll through the entire dataset to fetch deep sitemap pages. 
To support BCRecommender’s feature of browsing through random fans, I replaced MongoDB’s somewhat-hacky approach of returning random results with Elasticsearch’s cleaner method. As the webapp is implemented in Python, I originally used the elasticsearch-dsl package, but found it too hard to debug queries (e.g., figuring out how to rank results randomly was a bit of a nightmare). Instead, I ended up using the elasticsearch-py package, which is only a thin wrapper around the Elasticsearch API. This approach yields code that doesn’t look very Pythonic – rather than following the Zen of Python’s flat is better than nested aphorism, the API follows the more Java-esque belief of you can never have enough nesting (see image below for example). However, I prefer overly-nested structures that I can debug to flat code that doesn’t work. I may try using the DSL again in the future, once I’ve gained more experience with Elasticsearch.\nAs mentioned, one of my worries was that I would have to increase the amount of memory allocated to the machine where Elasticsearch runs. Since BCRecommender is a fairly low-budget project, I’m willing to sacrifice high availability to save a bit on operational costs. Therefore, the webapp and its data store run on the same DigitalOcean instance, which is enough to happily serve the current amount of traffic (around one request per second). By default, Elasticsearch indexes all the fields, and even includes an extra indexed _all field that is a concatenation of all string fields in a document. While indexing everything may be convenient, it wasn’t necessary for the first stage. Choosing the minimal index settings allowed me to keep using the same instance size as before (1GB RAM and 30GB SSD). In fact, due to the switch to static sitemaps and the removal of MongoDB’s random attribute hack, fewer indexes were required after the change.\nOnce I had all the code converted and working on my local Vagrant environment, it was time to deploy. 
The deployment was fairly straightforward and required no downtime, as I simply provisioned a new instance and switched over the floating IP once it was all tested and ready to go. I monitored response time and memory use closely and everything seemed to be working just fine – similarly to MongoDB. After a week of monitoring, it was time to take the next step and enable advanced search.\nEnabling full-text search is where things got interesting. This phase required adding a search result page (previously users were redirected to the queried page if it was found), and reindexing the data. For this phase, I tried to keep things as simple as possible, and just indexed the string fields (tags, artist, and title) using the standard analyser. I did some manual testing of search results based on common queries, and played a bit with improving precision and recall. Perhaps the most important tweak was allowing an item’s activity level to influence the ranking. For each tralbum, the activity level is the number of fans that have the tralbum in their collection, and for each fan, it is the size of the collection. For example, when searching for amanda, the top result is the fan with username amanda, followed by tralbums by the popular Amanda Palmer. Before I added the consideration of activity level, all tralbums and fans that contained the word amanda had the same ranking.\nI deployed full-text search earlier this week, and so far it’s looking pretty good. Elasticsearch seems to be coping well with having the same level of resources allocated as before, but it’s still too early to tell if this is sustainable over time. Most importantly, users are finally seeing results when they enter unstructured queries, which increases their engagement and retention. Woohoo!\nImproving search performance based on user behaviour is expected to be an ongoing effort. 
Despite having many ideas, I resisted the temptation of endless offline tinkering and opted to release a working search page quickly. With Google Analytics now set up to track site search, the plan is to keep identifying gaps and tweak the search settings continuously. This will take a while, as the number of daily users is currently 200-300, and they don’t all use site search.\nImplementing more search features is another set of items on my to-do list that will be addressed over time. For example, it’d be great to have search auto-completion and a prettier result page. However, I have more ideas than time to implement them, and I’m not working on BCRecommender full-time. For now, I’m pretty happy with finally having the search function.\nElasticsearch versus MongoDB: Key findings Comparisons between tools should always be taken with a grain of salt. General comparisons may not address features that are important for your specific use case, or may overemphasise aspects that you don’t care about. In addition, actively developed tools are moving targets. Since I started the transition to Elasticsearch, version 2.0 has been released, and MongoDB 3.2 is expected very soon. The following list is derived from my experience and may not apply to you. You have been warned!\nWith the disclaimer out of the way, here are some of the advantages of Elasticsearch over MongoDB:\nBetter full-text search support (duh!). Enforceable schemas and type validation (note: some form of optional schema is expected in MongoDB 3.2). All fields are indexed by default, making it easy to explore unstructured data without worrying about adding indices. It appears that indexing is implemented in a more efficient way that doesn’t block the node. Slowness due to indexing operations seems to be a common issue with MongoDB, even with background index creation. It’s possible to query multiple indices and types (same as MongoDB databases \u0026 collections, respectively) in the same query. 
This is a huge advantage in my case as it makes it possible to efficiently search both fans and tralbums in a single query. Index aliases make it easy to change the indices without changing the application. Multi-get by IDs returns results in the order they were requested. This is not the case with MongoDB, where using $in doesn’t have any guarantees on the returned documents’ order. It’s easy to work around this issue, but it can be the source of subtle bugs. In my case, recommendations were unintentionally sorted in random order until I added an additional step to sort them correctly. Built-in support for random scoring (note: random sampling will finally be available in MongoDB 3.2 – the ticket for this has been open for 5 years). Built-in support for multiple types of analysis on the same field. Some disadvantages of Elasticsearch in comparison to MongoDB are:\nAll fields are indexed by default, making it easy to run into memory issues. Adjusting these default settings is strongly recommended if you know how you’re going to query the data. Documents are immutable, so every update requires deleting the original document and re-inserting it (in practice, it seems like this isn’t much of an issue). Sorting results by a field requires reading all the field’s values and sorting them in memory. The sorted results are cached, but this may cause issues if memory is too limited. In conclusion, my experience with Elasticsearch has been mostly positive so far and I’m glad I’ve made the switch. I’m looking forward to taking further advantage of advanced search features to improve user experience on BCRecommender. New posts on the topic may be published in the future, so please subscribe to be notified when this happens. 
As always, I’m happy to receive feedback through the comments or privately.\n","wordCount":"2165","inLanguage":"en","image":"https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/mongodb-to-elasticsearch.png","datePublished":"2015-11-04T03:53:18Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" 
y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Migrating a simple web application from MongoDB to Elasticsearch</h1><div class=post-meta><span title='2015-11-04 03:53:18 +0000 UTC'>November 4, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-11-04-migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/mongodb-to-elasticsearch.png alt></figure><div class=post-content><p><a href=http://www.bcrecommender.com target=_blank rel=noopener>Bandcamp Recommender (BCRecommender)</a> is a web application that serves music recommendations from <a href=http://bandcamp.com target=_blank rel=noopener>Bandcamp</a>. I recently switched BCRecommender&rsquo;s data store from <a href=https://www.mongodb.com/ target=_blank rel=noopener>MongoDB</a> to <a href=https://www.elastic.co/products/elasticsearch target=_blank rel=noopener>Elasticsearch</a>. This has made it possible to offer a richer search experience to users at a similar cost. 
This post describes the migration process and discusses some of the advantages and disadvantages of using Elasticsearch instead of MongoDB.</p><h2 id=motivation-why-swap-mongodb-for-elasticsearch>Motivation: Why swap MongoDB for Elasticsearch?<a hidden class=anchor aria-hidden=true href=#motivation-why-swap-mongodb-for-elasticsearch>#</a></h2><p>I&rsquo;ve written a few posts in the past on BCRecommender&rsquo;s design and implementation. It is a fairly <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>simple application with two main components</a>: the backend worker that crawls data and generates recommendations in batch, and the webapp that serves the recommendations. Importantly, each of these components has its own data store, with the recommendations synced up from the worker to the webapp, and data like events and subscriptions synced down from the webapp to the worker. Recently, I <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>migrated the webapp component from Parse to DigitalOcean</a>, replacing Parse&rsquo;s data store with MongoDB. Choosing MongoDB was meant to simplify the transition – Parse uses MongoDB behind the scenes, as does the backend worker. However, moving out of Parse&rsquo;s sandboxed environment freed me to choose any data store, and Elasticsearch seemed like a good candidate that would make it possible to expose advanced search capabilities to end users.</p><p>Advanced search means different things to different people. In BCRecommender&rsquo;s case what I had in mind was rather modest, at least for the initial stages. BCRecommender presents recommendations for two types of entities: fans and tralbums (tracks/albums). In both cases, the recommended items are tralbums. 
<a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>When the key is a fan, the recommendations are tralbums that they may like, and when the key is a tralbum, the recommendations are similar tralbums</a>. Each tralbum has a title, an artist name, and a list of tags. Each fan has its Bandcamp username as a primary key, and a list of tags that is derived from the tralbums in the fan&rsquo;s collection. Originally, &ldquo;searching&rdquo; required users to either enter the exact username of a Bandcamp fan, or the exact Bandcamp link of a tralbum – not the best user experience! Indeed, I was tracking the search terms and found that many people were unsuccessfully trying to use unstructured queries. My idea of advanced search was to move away from the original key-value approach to full-text search that considers tags, titles, artists, and other fields that may get added later.</p><p>It was clear that while it may be possible to provide advanced search with MongoDB, it <a href=http://beletsky.net/2014/05/got-tired-of-mongodb-full-text.html target=_blank rel=noopener>wouldn&rsquo;t be a smooth ride</a>. While recent versions of MongoDB include support for full-text search, it isn&rsquo;t as feature-rich as Elasticsearch. For example, <a href=https://docs.mongodb.org/manual/core/index-text/#storage-requirements-and-performance-costs target=_blank rel=noopener>MongoDB text indices do not store phrases or information about the proximity of words in the documents</a>, making phrase queries run slowly unless the entire collection fits in memory. The names really say it all: MongoDB is a database with some search capabilities, and Elasticsearch is a search engine with some database capabilities. 
It <a href=https://www.compose.io/articles/mongoosastic-the-power-of-mongodb-and-elasticsearch-together/ target=_blank rel=noopener>seems pretty common</a> to use MongoDB (or another database) as a data store and supply search through Elasticsearch, so I figured it isn&rsquo;t a bad idea to apply this pattern to BCRecommender.</p><p>It is worth noting that if BCRecommender were a for-profit project, I would probably use <a href=https://www.algolia.com target=_blank rel=noopener>Algolia</a> rather than Elasticsearch. My experience with Algolia on a different project has been excellent – they make it easy for you to get started, have great customer service, and deliver good and fast results with minimal development and operational effort. The two main disadvantages of Algolia are its price and the fact that it&rsquo;s a closed-source solution (<a href=https://www.quora.com/How-does-Elasticsearch-relate-and-or-compare-to-Algolias-Search-as-a-Service target=_blank rel=noopener>see further discussion on Quora</a>). At over two million records, the monthly cost of running Algolia for BCRecommender would be around US$649, which is more than what I&rsquo;m willing to spend on this project. However, for a business this may be a reasonable cost because deploying and maintaining an Elasticsearch cluster may end up costing more. 
Nonetheless, many businesses use Elasticsearch successfully, which is why I have no doubt that it&rsquo;s a great choice for my use case – it just requires more work than Algolia to get up and running.</p><h2 id=executing-the-migration-plan>Executing the migration plan<a hidden class=anchor aria-hidden=true href=#executing-the-migration-plan>#</a></h2><p>The plan for migrating the webapp from MongoDB to Elasticsearch was pretty simple:</p><ol><li>Read the <a href=https://www.elastic.co/guide/en/elasticsearch/guide/current/index.html target=_blank rel=noopener>Elasticsearch manual</a> to ensure it suits my needs</li><li>Replace MongoDB with Elasticsearch without making any user-facing changes</li><li>Expose full-text search to BCRecommender users</li><li>Improve search performance based on user behaviour</li><li>Implement more search features</li></ol><p><strong>Reading the manual</strong> is not something I do for every piece of technology I use (there are just too many tools out there these days), but for Elasticsearch it seemed to be worth the effort. I&rsquo;m not done reading yet, but covering the material in the <em>Getting Started</em> and <em>Search in Depth</em> sections gave me enough information to complete steps 2 & 3. The main things I was worried about were Elasticsearch&rsquo;s performance as a database and how memory-hungry it&rsquo;d be. Reading the manual allowed me to avoid some memory-use pitfalls and gave me insights on the way MongoDB and Elasticsearch compare (<a href=#es-vs-mongo>see details below</a>).</p><p><strong>Switching from MongoDB to Elasticsearch as a simple database</strong> was pretty straightforward. Both are document-based, so there were no changes required to the data models, but I did use the opportunity to fix some issues. For example, I changed the sitemap generation process from dynamic to static to avoid having to scroll through the entire dataset to fetch deep sitemap pages. 
To support BCRecommender&rsquo;s feature of browsing through random fans, I replaced <a href=http://bdadam.com/blog/finding-a-random-document-in-mongodb.html target=_blank rel=noopener>MongoDB&rsquo;s somewhat-hacky approach of returning random results</a> with <a href=https://www.elastic.co/guide/en/elasticsearch/guide/current/random-scoring.html target=_blank rel=noopener>Elasticsearch&rsquo;s cleaner method</a>. As the webapp is implemented in Python, I originally used the <a href=http://elasticsearch-dsl.readthedocs.org/en/latest/ target=_blank rel=noopener>elasticsearch-dsl package</a>, but found it too hard to debug queries (e.g., figuring out how to rank results randomly was a bit of a nightmare). Instead, I ended up using the <a href=http://elasticsearch-py.readthedocs.org/en/latest/ target=_blank rel=noopener>elasticsearch-py package</a>, which is only a thin wrapper around the Elasticsearch API. This approach yields code that doesn&rsquo;t look very Pythonic – rather than following the <a href=https://www.python.org/dev/peps/pep-0020/ target=_blank rel=noopener>Zen of Python&rsquo;s</a> <em>flat is better than nested</em> aphorism, the API follows the more Java-esque belief of <em>you can never have enough nesting</em> (see image below for example). However, I prefer overly-nested structures that I can debug to flat code that doesn&rsquo;t work. I may try using the DSL again in the future, once I&rsquo;ve gained more experience with Elasticsearch.</p><figure><a href=elasticsearch-is-nesty.png target=_blank rel=noopener><img sizes="(min-width: 768px) 528px,
 100vw" srcset="https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/elasticsearch-is-nesty_huc88b90071870452221fd99ee4be90e05_21660_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/elasticsearch-is-nesty_huc88b90071870452221fd99ee4be90e05_21660_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/elasticsearch-is-nesty.png 528w," src=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/elasticsearch-is-nesty.png alt="elasticsearch is nesty" loading=lazy></a></figure><p>As mentioned, one of my worries was that I would have to increase the amount of memory allocated to the machine where Elasticsearch runs. Since BCRecommender is a fairly low-budget project, I&rsquo;m willing to sacrifice high availability to save a bit on operational costs. Therefore, the webapp and its data store run on the same DigitalOcean instance, which is enough to happily serve the current amount of traffic (around one request per second). By default, Elasticsearch indexes all the fields, and even includes an extra indexed _all field that is a concatenation of all string fields in a document. While indexing everything may be convenient, it wasn&rsquo;t necessary for the first stage. Choosing the minimal index settings allowed me to keep using the same instance size as before (1GB RAM and 30GB SSD). In fact, due to the switch to static sitemaps and the removal of MongoDB&rsquo;s random attribute hack, fewer indexes were required after the change.</p><p>Once I had all the code converted and working on my local Vagrant environment, it was time to deploy. The deployment was fairly straightforward and required no downtime, as I simply provisioned a new instance and switched over the floating IP once it was all tested and ready to go. I monitored response time and memory use closely and everything seemed to be working just fine – similarly to MongoDB. After a week of monitoring, it was time to take the next step and enable advanced search.</p><p><strong>Enabling full-text search</strong> is where things got interesting. 
This phase required adding a search result page (previously users were redirected to the queried page if it was found), and reindexing the data. For this phase, I tried to keep things as simple as possible, and just indexed the string fields (tags, artist, and title) using the standard analyser. I did some manual testing of search results based on common queries, and played a bit with improving <a href=https://en.wikipedia.org/wiki/Precision_and_recall target=_blank rel=noopener>precision and recall</a>. Perhaps the most important tweak was allowing an item&rsquo;s activity level to influence the ranking. For each tralbum, the activity level is the number of fans that have the tralbum in their collection, and for each fan, it is the size of the collection. For example, <a href="http://www.bcrecommender.com/search?q=amanda" target=_blank rel=noopener>when searching for <em>amanda</em></a>, the top result is the fan with username <em>amanda</em>, followed by tralbums by the popular Amanda Palmer. Before I added the consideration of activity level, all tralbums and fans that contained the word <em>amanda</em> had the same ranking.</p><figure><a href=bcrecommender-search-amanda.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
diff --git a/2015/11/23/the-hardest-parts-of-data-science/index.html b/2015/11/23/the-hardest-parts-of-data-science/index.html
index bae8c4397..220f65f4e 100644
--- a/2015/11/23/the-hardest-parts-of-data-science/index.html
+++ b/2015/11/23/the-hardest-parts-of-data-science/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The hardest parts of data science | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="climate change,data science,Kaggle,predictive modelling,science communication"><meta name=description content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The hardest parts of data science"><meta property="og:description" content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models 
or obtaining clean data."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"><meta property="og:image" content="https://yanirseroussi.com/foggy-random-forest.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-11-23T04:14:21+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/foggy-random-forest.jpg"><meta name=twitter:title content="The hardest parts of data science"><meta name=twitter:description content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The hardest parts of data science","item":"https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The hardest parts of data science","name":"The hardest parts of data science","description":"Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data.","keywords":["climate change","data science","Kaggle","predictive modelling","science communication"],"articleBody":"Contrary to common belief, the hardest part of data science isn’t building an accurate model or obtaining good, clean data. It is much harder to define feasible problems and come up with reasonable ways of measuring solutions. 
This post discusses some examples of these issues and how they can be addressed.\nThe not-so-hard parts Before discussing the hardest parts of data science, it’s worth quickly addressing the two main contenders: model fitting and data collection/cleaning.\nModel fitting is seen by some as particularly hard, or as real data science. This belief is fuelled in part by the success of Kaggle, that calls itself the home of data science. Most Kaggle competitions are focused on model fitting: Participants are given a well-defined problem, a dataset, and a measure to optimise, and they compete to produce the most accurate model. Coupling Kaggle’s excellent marketing with their competition setup leads many people to believe that data science is all about fitting models. In reality, building reasonably-accurate models is not that hard, because many model-building phases can easily be automated. Indeed, there are many companies that offer model fitting as a service (e.g., Microsoft, Amazon, Google and others). Even Ben Hamner, CTO of Kaggle, has said that he is “surprised at the number of ‘black box machine learning in the cloud’ services emerging: model fitting is easy. Problem definition and data collection are not.”\nData collection/cleaning is the essential part that everyone loves to hate. DJ Patil (US Chief Data Scientist) is quoted as saying that “the hardest part of data science is getting good, clean data. Cleaning data is often 80% of the work.” While I agree that collecting data and cleaning it can be a lot of work, I don’t think of this part as particularly hard. It’s definitely important and may require careful planning, but in many cases it just isn’t very challenging. In addition, it is often the case that the data is already given, or is collected using previously-developed methods.\nProblem definition is hard There are many reasons why problem definition can be hard. 
It is sometimes due to stakeholders who don’t know what they want, and expect data scientists to solve all their data problems (either real or imagined). This type of situation is summarised by the following Dilbert strip. It is best handled by cleverly managing stakeholder expectations, while stirring them towards better-defined problems.\nWell-defined problems are great, for the obvious reason that they can actually be addressed. Examples of such problems include:\nBuild a model to predict the sales of a marketing campaign Create a system that runs campaigns that automatically adapt to customer feedback Identify key objects in images Improve click-through rates on search engine results, ads, or any other element Detect whale calls from underwater recordings to prevent collisions Often, it can be hard to get to the stage where the problem is agreed on, because this requires dealing with people who only have a fuzzy idea of what can be done with data science. Dilbertian situations aside, these people often have real problems that they care about, so exploring the core issues with them is time well-spent.\nSolution measurement is often harder than problem definition Many problems that actually matter have solutions that are really hard to measure. For example, improving the well-being of the population (e.g., a company’s customers or a country’s citizens) is an overarching problem that arises in many situations. However, this problem gives rise to the hard question of how well-being can be measured and aggregated. The following paragraphs discuss issues that occur in solution measurement, often making it the hardest part of data science.\nIdeally, we would always be able to run randomised controlled trials to measure treatment effects. 
However, the reality is that experimental data is often censored, there many constraints on running experiments (ethics, practicality, budget, etc.), and confounding factors may make it impossible to identify the true causal impact of interventions. These issues seriously influence many aspects of our lives. I’ve written a post on how these issues manifest themselves in research on the connection between nutrition and our health. Here, I’ll discuss another major example: the health effects of smoking and anthropogenic climate change.\nWhile smoking and anthropogenic climate change may seem unrelated, they actually have a lot in common. In both cases it is hard (or impossible) to perform experiments to determine causality, and in both cases this fact has been used to mislead the public by parties with commercial and ideological interests. In the case of smoking, due to ethical reasons, one can’t perform an experiment where a random control group is forced not to smoke, while a treatment group is forced to smoke. Further, since it can take many years for smoking-caused diseases to develop, it’d take a long time to obtain the results of such an experiment. Tobacco companies have exploited this fact for years, claiming that there may be some genetic factor that causes both smoking and a higher susceptibility to smoking-related diseases. Fortunately, we live in a world where these claims have been widely discredited, and it is now clear to most people that smoking is harmful. However, similar doubt-casting techniques are used by polluters and their supporters in the debate on anthropogenic climate change. While no serious climate scientist doubts the fact that human activities are causing climate change, this can’t be proved through experimentation on another Earth. In both cases, the answers should be clear when looking at the evidence and the mechanisms at play without an ideological bias. 
It doesn’t take a scientist to figure out that pumping your lungs full of smoke on a regular basis is likely to be harmful, as is pumping the atmosphere full of greenhouse gases that have been sequestered for millions of years. However, as said by Upton Sinclair, “it is difficult to get a man to understand something, when his salary depends upon his not understanding it.”\nAssuming that we have addressed the issues raised so far, there is the matter of choosing a measure or metric of success. How do we know that our solution works well? A common approach is to choose a single metric to focus on, such as increasing conversion rates. However, all metrics have their flaws, and there are quite a few problems with metric selection and its maintenance over time.\nFirst, focusing on a single metric can be harmful, because no metric is perfect. A classic example of this issue is the focus on growing the economy, as measured by gross domestic product (GDP). The article What is up with the GDP? by Frank Shostak summarises some of the problems with GDP:\nThe GDP framework cannot tell us whether final goods and services that were produced during a particular period of time are a reflection of real wealth expansion, or a reflection of capital consumption.\nFor instance, if a government embarks on the building of a pyramid, which adds absolutely nothing to the well-being of individuals, the GDP framework will regard this as economic growth. In reality, however, the building of the pyramid will divert real funding from wealth-generating activities, thereby stifling the production of wealth.\n[…]\nThe whole idea of GDP gives the impression that there is such a thing as the national output. In the real world, however, wealth is produced by someone and belongs to somebody. In other words, goods and services are not produced in totality and supervised by one supreme leader. This in turn means that the entire concept of GDP is devoid of any basis in reality. 
It is an empty concept.\nShostak’s criticism comes from a right-winged viewpoint – his argument is that the GDP is used as an excuse for unnecessary government intervention with the market. However, the focus on GDP growth is also heavily-criticised by the left due to the fact that it doesn’t consider environmental effects and inequalities in the distribution of wealth. It is a bit odd that GDP growth is still considered a worthwhile goal by many people, given that it can easily be skewed by a few powerful individuals who choose to build unnecessary pyramids (though perhaps this is the real reason why the GDP persists – wealthy individuals have an interest in keeping it this way).\nEven if we decide to use multiple metrics to evaluate our solution, our troubles aren’t over yet. Using multiple metrics often means that there are trade-offs between the different metrics. For example, with the precision and recall measures that are commonly used to evaluate the performance of search engines, it is rare to be able to increase both precision and recall at the same time. Precision is the percentage of relevant items out of those that have been returned, while recall is the percentage of relevant items that have been returned out of the overall number of relevant items. Hence, it is easy to artificially increase recall to 100% by always returning all the items in the database, but this would mean settling for near-zero precision. Similarly, one can increase precision by always returning a single item that the algorithm is very confident about, but this means that recall would suffer. Ultimately, the best balance between precision and recall depends on the application.\nAnother issue with choosing metrics is the impossibility of reliably evaluating our choices. This is summarised well by Scott Berkun in his book The Year Without Pants:\nAll metrics create temptations. 
Even with great intentions and smart minds, data runs you faster and faster into a stupid self-destructive circle. Data can’t decide things for you. It can help you see things more clearly if captured carefully, but that’s not the same as deciding. Just as there is an advice paradox, there is a data paradox: no matter how much data you have, you still depend on your intuition for deciding how to interpret and then apply the data.\nPut another way, there is no good KPI for measuring KPIs. There are no good metrics for evaluating metrics (or for evaluating metrics for evaluating metrics for evaluating metrics, and on it goes).\nOK, so we’ve picked some flawed measures that we can’t really evaluate, and we’ve accepted the imperfections of the evaluation process. Are we done yet? No. There’s still the small matter of Goodhart’s Law, which states that “when a measure becomes a target, it ceases to be a good measure.” This is often the case because people will tend to manipulate results and game the system (not necessarily maliciously) in order to hit measured goals. However, even without manipulation and gaming, we often deal with moving targets. Just because the measure we’ve chosen is suitable today, it doesn’t mean it will still be relevant in a few months or years because reality changes. For example, in the 1990s, the number of page views was a good measure of interaction with websites, but nowadays it is a pretty weak measure because many websites are single-page applications. Reality changes and so should our problems, solutions, measures, and goals.\nEmbracing ambiguity and uncertainty Personally, I find the complexities of measurement and problem definition quite interesting. However, many people aren’t that interested in this stuff – they just want working solutions and simple stories. 
As demonstrated by the examples throughout this article, over-simplification of complicated matters is a pervasive issue that goes beyond what’s commonly considered “data science”. This is why storytelling is seen as a key skill that data scientists should possess. I believe it’s also important to maintain one’s integrity and not just make up stories that people would buy, but it’d be naive to assume that this never happens. Either way, good data scientists embrace uncertainty and ambiguity, but can still tell a simple story if needed.\nNote: The ideas in this post were first presented at The Sydney Data Science Breakfast Meetup Group. The slides for that talk are available here.\n","wordCount":"1983","inLanguage":"en","image":"https://yanirseroussi.com/foggy-random-forest.jpg","datePublished":"2015-11-23T04:14:21Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 
9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The hardest parts of data science</h1><div class=post-meta><span title='2015-11-23 04:14:21 +0000 UTC'>November 23, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-11-23-the-hardest-parts-of-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg 1960w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg alt width=1960 height=597></figure><div class=post-content><p>Contrary to common belief, the hardest part of data science isn&rsquo;t building an accurate model or obtaining good, clean data. It is much harder to define feasible problems and come up with reasonable ways of measuring solutions. This post discusses some examples of these issues and how they can be addressed.</p><h2 id=the-not-so-hard-parts>The not-so-hard parts<a hidden class=anchor aria-hidden=true href=#the-not-so-hard-parts>#</a></h2><p>Before discussing the hardest parts of data science, it&rsquo;s worth quickly addressing the two main contenders: model fitting and data collection/cleaning.</p><p><strong>Model fitting</strong> is seen by some as particularly hard, or as <em>real</em> data science. This belief is fuelled in part by the success of <a href=https://www.kaggle.com/ target=_blank rel=noopener>Kaggle</a>, that calls itself <em>the home of data science</em>. Most Kaggle competitions are focused on model fitting: Participants are given a well-defined problem, a dataset, and a measure to optimise, and they compete to produce the most accurate model. Coupling Kaggle&rsquo;s excellent marketing with their competition setup leads many people to believe that data science is all about fitting models. In reality, building reasonably-accurate models is not that hard, because many model-building phases can easily be automated. 
Indeed, there are many companies that offer model fitting as a service (e.g., Microsoft, Amazon, Google and <a href=http://www.shivonzilis.com/machineintelligence target=_blank rel=noopener>others</a>). Even Ben Hamner, CTO of Kaggle, has said that he is &ldquo;surprised at the number of &lsquo;black box machine learning in the cloud&rsquo; services emerging: model fitting is easy. Problem definition and data collection are not.&rdquo;</p><figure><a href=https://twitter.com/benhamner/status/595850574999990274 target=_blank rel=noopener><img sizes="(min-width: 768px) 569px,
+<meta name=keywords content="climate change,data science,Kaggle,predictive modelling,science communication"><meta name=description content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The hardest parts of data science"><meta property="og:description" content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models 
or obtaining clean data."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"><meta property="og:image" content="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-11-23T04:14:21+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg"><meta name=twitter:title content="The hardest parts of data science"><meta name=twitter:description content="Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The hardest parts of data science","item":"https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The hardest parts of data science","name":"The hardest parts of data science","description":"Defining feasible problems and coming up with reasonable ways of measuring solutions is harder than building accurate models or obtaining clean data.","keywords":["climate change","data science","Kaggle","predictive modelling","science communication"],"articleBody":"Contrary to common belief, the hardest part of data science isn’t building an accurate model or obtaining good, clean data. 
It is much harder to define feasible problems and come up with reasonable ways of measuring solutions. This post discusses some examples of these issues and how they can be addressed.\nThe not-so-hard parts Before discussing the hardest parts of data science, it’s worth quickly addressing the two main contenders: model fitting and data collection/cleaning.\nModel fitting is seen by some as particularly hard, or as real data science. This belief is fuelled in part by the success of Kaggle, that calls itself the home of data science. Most Kaggle competitions are focused on model fitting: Participants are given a well-defined problem, a dataset, and a measure to optimise, and they compete to produce the most accurate model. Coupling Kaggle’s excellent marketing with their competition setup leads many people to believe that data science is all about fitting models. In reality, building reasonably-accurate models is not that hard, because many model-building phases can easily be automated. Indeed, there are many companies that offer model fitting as a service (e.g., Microsoft, Amazon, Google and others). Even Ben Hamner, CTO of Kaggle, has said that he is “surprised at the number of ‘black box machine learning in the cloud’ services emerging: model fitting is easy. Problem definition and data collection are not.”\nData collection/cleaning is the essential part that everyone loves to hate. DJ Patil (US Chief Data Scientist) is quoted as saying that “the hardest part of data science is getting good, clean data. Cleaning data is often 80% of the work.” While I agree that collecting data and cleaning it can be a lot of work, I don’t think of this part as particularly hard. It’s definitely important and may require careful planning, but in many cases it just isn’t very challenging. 
In addition, it is often the case that the data is already given, or is collected using previously-developed methods.\nProblem definition is hard There are many reasons why problem definition can be hard. It is sometimes due to stakeholders who don’t know what they want, and expect data scientists to solve all their data problems (either real or imagined). This type of situation is summarised by the following Dilbert strip. It is best handled by cleverly managing stakeholder expectations, while stirring them towards better-defined problems.\nWell-defined problems are great, for the obvious reason that they can actually be addressed. Examples of such problems include:\nBuild a model to predict the sales of a marketing campaign Create a system that runs campaigns that automatically adapt to customer feedback Identify key objects in images Improve click-through rates on search engine results, ads, or any other element Detect whale calls from underwater recordings to prevent collisions Often, it can be hard to get to the stage where the problem is agreed on, because this requires dealing with people who only have a fuzzy idea of what can be done with data science. Dilbertian situations aside, these people often have real problems that they care about, so exploring the core issues with them is time well-spent.\nSolution measurement is often harder than problem definition Many problems that actually matter have solutions that are really hard to measure. For example, improving the well-being of the population (e.g., a company’s customers or a country’s citizens) is an overarching problem that arises in many situations. However, this problem gives rise to the hard question of how well-being can be measured and aggregated. The following paragraphs discuss issues that occur in solution measurement, often making it the hardest part of data science.\nIdeally, we would always be able to run randomised controlled trials to measure treatment effects. 
However, the reality is that experimental data is often censored, there many constraints on running experiments (ethics, practicality, budget, etc.), and confounding factors may make it impossible to identify the true causal impact of interventions. These issues seriously influence many aspects of our lives. I’ve written a post on how these issues manifest themselves in research on the connection between nutrition and our health. Here, I’ll discuss another major example: the health effects of smoking and anthropogenic climate change.\nWhile smoking and anthropogenic climate change may seem unrelated, they actually have a lot in common. In both cases it is hard (or impossible) to perform experiments to determine causality, and in both cases this fact has been used to mislead the public by parties with commercial and ideological interests. In the case of smoking, due to ethical reasons, one can’t perform an experiment where a random control group is forced not to smoke, while a treatment group is forced to smoke. Further, since it can take many years for smoking-caused diseases to develop, it’d take a long time to obtain the results of such an experiment. Tobacco companies have exploited this fact for years, claiming that there may be some genetic factor that causes both smoking and a higher susceptibility to smoking-related diseases. Fortunately, we live in a world where these claims have been widely discredited, and it is now clear to most people that smoking is harmful. However, similar doubt-casting techniques are used by polluters and their supporters in the debate on anthropogenic climate change. While no serious climate scientist doubts the fact that human activities are causing climate change, this can’t be proved through experimentation on another Earth. In both cases, the answers should be clear when looking at the evidence and the mechanisms at play without an ideological bias. 
It doesn’t take a scientist to figure out that pumping your lungs full of smoke on a regular basis is likely to be harmful, as is pumping the atmosphere full of greenhouse gases that have been sequestered for millions of years. However, as said by Upton Sinclair, “it is difficult to get a man to understand something, when his salary depends upon his not understanding it.”\nAssuming that we have addressed the issues raised so far, there is the matter of choosing a measure or metric of success. How do we know that our solution works well? A common approach is to choose a single metric to focus on, such as increasing conversion rates. However, all metrics have their flaws, and there are quite a few problems with metric selection and its maintenance over time.\nFirst, focusing on a single metric can be harmful, because no metric is perfect. A classic example of this issue is the focus on growing the economy, as measured by gross domestic product (GDP). The article What is up with the GDP? by Frank Shostak summarises some of the problems with GDP:\nThe GDP framework cannot tell us whether final goods and services that were produced during a particular period of time are a reflection of real wealth expansion, or a reflection of capital consumption.\nFor instance, if a government embarks on the building of a pyramid, which adds absolutely nothing to the well-being of individuals, the GDP framework will regard this as economic growth. In reality, however, the building of the pyramid will divert real funding from wealth-generating activities, thereby stifling the production of wealth.\n[…]\nThe whole idea of GDP gives the impression that there is such a thing as the national output. In the real world, however, wealth is produced by someone and belongs to somebody. In other words, goods and services are not produced in totality and supervised by one supreme leader. This in turn means that the entire concept of GDP is devoid of any basis in reality. 
It is an empty concept.\nShostak’s criticism comes from a right-winged viewpoint – his argument is that the GDP is used as an excuse for unnecessary government intervention with the market. However, the focus on GDP growth is also heavily-criticised by the left due to the fact that it doesn’t consider environmental effects and inequalities in the distribution of wealth. It is a bit odd that GDP growth is still considered a worthwhile goal by many people, given that it can easily be skewed by a few powerful individuals who choose to build unnecessary pyramids (though perhaps this is the real reason why the GDP persists – wealthy individuals have an interest in keeping it this way).\nEven if we decide to use multiple metrics to evaluate our solution, our troubles aren’t over yet. Using multiple metrics often means that there are trade-offs between the different metrics. For example, with the precision and recall measures that are commonly used to evaluate the performance of search engines, it is rare to be able to increase both precision and recall at the same time. Precision is the percentage of relevant items out of those that have been returned, while recall is the percentage of relevant items that have been returned out of the overall number of relevant items. Hence, it is easy to artificially increase recall to 100% by always returning all the items in the database, but this would mean settling for near-zero precision. Similarly, one can increase precision by always returning a single item that the algorithm is very confident about, but this means that recall would suffer. Ultimately, the best balance between precision and recall depends on the application.\nAnother issue with choosing metrics is the impossibility of reliably evaluating our choices. This is summarised well by Scott Berkun in his book The Year Without Pants:\nAll metrics create temptations. 
Even with great intentions and smart minds, data runs you faster and faster into a stupid self-destructive circle. Data can’t decide things for you. It can help you see things more clearly if captured carefully, but that’s not the same as deciding. Just as there is an advice paradox, there is a data paradox: no matter how much data you have, you still depend on your intuition for deciding how to interpret and then apply the data.\nPut another way, there is no good KPI for measuring KPIs. There are no good metrics for evaluating metrics (or for evaluating metrics for evaluating metrics for evaluating metrics, and on it goes).\nOK, so we’ve picked some flawed measures that we can’t really evaluate, and we’ve accepted the imperfections of the evaluation process. Are we done yet? No. There’s still the small matter of Goodhart’s Law, which states that “when a measure becomes a target, it ceases to be a good measure.” This is often the case because people will tend to manipulate results and game the system (not necessarily maliciously) in order to hit measured goals. However, even without manipulation and gaming, we often deal with moving targets. Just because the measure we’ve chosen is suitable today, it doesn’t mean it will still be relevant in a few months or years because reality changes. For example, in the 1990s, the number of page views was a good measure of interaction with websites, but nowadays it is a pretty weak measure because many websites are single-page applications. Reality changes and so should our problems, solutions, measures, and goals.\nEmbracing ambiguity and uncertainty Personally, I find the complexities of measurement and problem definition quite interesting. However, many people aren’t that interested in this stuff – they just want working solutions and simple stories. 
As demonstrated by the examples throughout this article, over-simplification of complicated matters is a pervasive issue that goes beyond what’s commonly considered “data science”. This is why storytelling is seen as a key skill that data scientists should possess. I believe it’s also important to maintain one’s integrity and not just make up stories that people would buy, but it’d be naive to assume that this never happens. Either way, good data scientists embrace uncertainty and ambiguity, but can still tell a simple story if needed.\nNote: The ideas in this post were first presented at The Sydney Data Science Breakfast Meetup Group. The slides for that talk are available here.\n","wordCount":"1983","inLanguage":"en","image":"https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg","datePublished":"2015-11-23T04:14:21Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The hardest parts of data science</h1><div class=post-meta><span title='2015-11-23 04:14:21 +0000 UTC'>November 23, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-11-23-the-hardest-parts-of-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest_hub79e18ea0439364131cf541e77991fbc_191404_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg 1960w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/foggy-random-forest.jpg alt width=1960 height=597></figure><div class=post-content><p>Contrary to common belief, the hardest part of data science isn&rsquo;t building an accurate model or obtaining good, clean data. It is much harder to define feasible problems and come up with reasonable ways of measuring solutions. This post discusses some examples of these issues and how they can be addressed.</p><h2 id=the-not-so-hard-parts>The not-so-hard parts<a hidden class=anchor aria-hidden=true href=#the-not-so-hard-parts>#</a></h2><p>Before discussing the hardest parts of data science, it&rsquo;s worth quickly addressing the two main contenders: model fitting and data collection/cleaning.</p><p><strong>Model fitting</strong> is seen by some as particularly hard, or as <em>real</em> data science. This belief is fuelled in part by the success of <a href=https://www.kaggle.com/ target=_blank rel=noopener>Kaggle</a>, that calls itself <em>the home of data science</em>. Most Kaggle competitions are focused on model fitting: Participants are given a well-defined problem, a dataset, and a measure to optimise, and they compete to produce the most accurate model. Coupling Kaggle&rsquo;s excellent marketing with their competition setup leads many people to believe that data science is all about fitting models. In reality, building reasonably-accurate models is not that hard, because many model-building phases can easily be automated. 
Indeed, there are many companies that offer model fitting as a service (e.g., Microsoft, Amazon, Google and <a href=http://www.shivonzilis.com/machineintelligence target=_blank rel=noopener>others</a>). Even Ben Hamner, CTO of Kaggle, has said that he is &ldquo;surprised at the number of &lsquo;black box machine learning in the cloud&rsquo; services emerging: model fitting is easy. Problem definition and data collection are not.&rdquo;</p><figure><a href=https://twitter.com/benhamner/status/595850574999990274 target=_blank rel=noopener><img sizes="(min-width: 768px) 569px,
 100vw" srcset="https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/ben-hamner-black-box-ml_hue38d7e4cb07e1ecfcf4351af67252791_46703_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/ben-hamner-black-box-ml_hue38d7e4cb07e1ecfcf4351af67252791_46703_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/ben-hamner-black-box-ml.png 569w," src=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/ben-hamner-black-box-ml.png alt="Ben Hamner tweet on black box ML in the cloud" loading=lazy></a></figure><p><strong>Data collection/cleaning</strong> is the essential part that everyone loves to hate. DJ Patil (US Chief Data Scientist) is <a href=http://codingvc.com/talk-summary-building-great-data-products target=_blank rel=noopener>quoted as saying</a> that &ldquo;the hardest part of data science is getting good, clean data. Cleaning data is often 80% of the work.&rdquo; While I agree that collecting data and cleaning it can be a lot of work, I don&rsquo;t think of this part as particularly hard. It&rsquo;s definitely important and may require careful planning, but in many cases it just isn&rsquo;t very challenging. In addition, it is often the case that the data is already given, or is collected using previously-developed methods.</p><h2 id=problem-definition-is-hard>Problem definition is hard<a hidden class=anchor aria-hidden=true href=#problem-definition-is-hard>#</a></h2><p>There are many reasons why problem definition can be hard. It is sometimes due to stakeholders who don&rsquo;t know what they want, and <a href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/>expect data scientists to solve all their data problems (either real or imagined)</a>. This type of situation is summarised by <a href=http://dilbert.com/strip/2012-07-29 target=_blank rel=noopener>the following Dilbert strip</a>. It is best handled by cleverly managing stakeholder expectations, while stirring them towards better-defined problems.</p><figure><a href=dilbert-big-data.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
diff --git a/2015/12/08/this-holiday-season-give-me-real-insights/index.html b/2015/12/08/this-holiday-season-give-me-real-insights/index.html
index b2819a748..4898c409e 100644
--- a/2015/12/08/this-holiday-season-give-me-real-insights/index.html
+++ b/2015/12/08/this-holiday-season-give-me-real-insights/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>This holiday season, give me real insights | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,data science,Facebook,insights,LinkedIn,marketing,WordPress"><meta name=description content="Some companies present raw data or information as &ldquo;insights&rdquo;. This post surveys some examples, and discusses how they can be turned into real insights."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="This holiday season, give me real insights"><meta property="og:description" content="Some companies present raw data or information as &ldquo;insights&rdquo;. 
This post surveys some examples, and discusses how they can be turned into real insights."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"><meta property="og:image" content="https://yanirseroussi.com/dikw-pyramid.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-12-08T06:57:25+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/dikw-pyramid.jpg"><meta name=twitter:title content="This holiday season, give me real insights"><meta name=twitter:description content="Some companies present raw data or information as &ldquo;insights&rdquo;. This post surveys some examples, and discusses how they can be turned into real insights."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"This holiday season, give me real insights","item":"https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"This holiday season, give me real insights","name":"This holiday season, give me real insights","description":"Some companies present raw data or information as \u0026ldquo;insights\u0026rdquo;. This post surveys some examples, and discusses how they can be turned into real insights.","keywords":["analytics","data science","Facebook","insights","LinkedIn","marketing","WordPress"],"articleBody":"Merriam-Webster defines an insight as an understanding of the true nature of something. 
Many companies seem to define an insight as any piece of data or information, which I would call a pseudo-insight. This post surveys some examples of pseudo-insights, and discusses how these can be built upon to provide real insights.\nExhibit A: WordPress stats This website is hosted on wordpress.com. I’m generally happy with WordPress – though it’s not as exciting and shiny as newer competitors, it is rock-solid and very feature-rich. An example of a great WordPress feature is the new stats area (available under wordpress.com/stats if you have a WordPress website). This area includes an insights page, which is full of prime examples of pseudo-insights.\nAt the top of the insights page, there is a visualisation of posting activity. As the image below shows, this isn’t very interesting for websites like mine. I already know that I post irregularly, because writing a blog post is time-consuming. I suspect that this visualisation isn’t very useful even for more active multi-author blogs, as it is essentially just a different way of displaying the raw data of post dates. Without joining this data with other information, we won’t gain a better understanding of how the blog is performing and why it performs the way it does.\nAn attempt to extract more meaningful insights from posting times appears further down the page, in the form of a widget that tells you the most popular day and hour. The help text says that This is the day and hour when you have been getting the most Views on average. The best timing for publishing a post may be around this period. Unfortunately, I’m pretty certain that this isn’t true in my case. Monday happens to be the most popular day because that’s when I published two of my most popular posts, and I usually try to spread the word about a new post as soon as I publish it. 
Further, blog posts can become popular a long time after publication, so it is unlikely that the best timing for publishing a post is around Monday 3pm.\nWhat would real WordPress insights look like? If we stick to idea of exploring the effect of publication timing, I would be curious to know if there is indeed a link between when a post is published and its popularity. Automattic (the company behind WordPress) is in a position to test this, as they can explore data from millions of blogs. My gut feeling is that the time of publication has a negligible effect on popularity. Things that matter much more are a post’s title, content, and effective distribution channels. Given the amount of data that they have, Automattic data scientists can definitely explore all of these factors. This would allow them to surface insights that will help authors drive more quality traffic to their websites.\nExhibit B: Facebook page insights As anyone who manages a Facebook page probably knows, Facebook provides pretty rich analytics of pages on their platform. For example, you can see the likes you’ve received over time and how your posts perform, and slice and dice this information in various ways. This is a great feature, but again, calling it insights is a misuse of the word and somewhat of an insult for those of us who work to extract real insights from data. An analytics dashboard is not insights.\nWhat would real Facebook page insights look like? Working off the assumption that people manage a Facebook page to reach and engage their audience, real insights would enhance a page administrator’s understanding of their audience and improve their ability to engage them and reach new people. However, Facebook is famous for having a conflict of interest here, because they require you to pay to reach more people. For example, if a post you shared is performing better than usual, Facebook will send you a notification, asking you to pay to boost the post further. 
It would be better if they told you what has caused this post to reach more people, and how to reproduce this success with future posts (for free). But this is very unlikely to happen. In the words of CGP Grey: professional sharers cannot trust the platforms upon which they stand, audiences cannot trust the platform to show what they asked to see.\nExhibit C: LinkedIn profile views Who’s viewed your profile is a popular LinkedIn feature. A key part of this feature is a graph that includes your weekly profile views together with actions taken on LinkedIn. The official LinkedIn blog calls this graph the insights graph and provides some examples for its uses:\nSo, for example, if you are trying to attract new clients or business leads, you can see how many potential partners looked at your profile after you joined an important industry group. Or, if you’re looking for a new job, you can look at your insights graph to see whether adding a skill to your profile or endorsing a peer gave you a bigger bump in views by recruiters. No matter your goal, you’ll be able to see which actions lead to the most relevant profile views – then start reaching out and closing the sale or applying for your dream job.\nAs the examples show, the so-called insights graph merely provides information about past actions and profile views on the LinkedIn platform. It is up to you to come up with the insights, but this may be hard if you consider only the actions taken within the walled garden of LinkedIn. For example, as shown in the following graph, my profile views received a boost on the week starting November 23, which was mostly due to publishing a popular post on this website. In general, social networks such as LinkedIn, Twitter, and Facebook tend to have a very narrow view of the world – as if the only interesting things happen on the platform. 
In reality, most of the action happens off-platform, either within other digital assets or in the physical world.\nWhat would real LinkedIn insights look like? First, I think that the focus on profile views is somewhat misguided. It’s not that hard to artificially generate profile views – simply view other people’s profiles. There is no intrinsic value in someone having viewed your profile – the value comes from a connection that leads to an interesting offer or conversation. Second, LinkedIn is about professional networking that is based on real-world activity. As such, it only forms a small part of the world of professional networking by allowing people to have an online presence that makes them contactable by people they don’t already know. When it comes to insights, it’d be useful to know the true causal factors that lead to interesting connections – much more useful than suggestions such as add software development as a skill on your profile to get up to 3% more profile views.\nSummary: Real insights are about the why There are many other examples of pseudo-insights out there. The reason is probably that the field of analytics is becoming increasingly commoditised, and it is easier to rebrand an analytics dashboard as an insights dashboard than to provide real insights. Providing real insights requires moving up the DIKW pyramid from data and information to knowledge and wisdom – from describing the past to learning general lessons that allow you to influence the future. Providing real insights can be very hard, as it often requires inferring the causes of events – the why that comes after the what and how. 
More on this later – I have just started reading Samantha Kleinberg’s Why: A Guide to Finding and Using Causes and will report (hopefully real) insights on causality in future posts.\n","wordCount":"1296","inLanguage":"en","image":"https://yanirseroussi.com/dikw-pyramid.jpg","datePublished":"2015-12-08T06:57:25Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" 
y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">This holiday season, give me real insights</h1><div class=post-meta><span title='2015-12-08 06:57:25 +0000 UTC'>December 8, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-12-08-this-holiday-season-give-me-real-insights/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/dikw-pyramid.jpg alt></figure><div class=post-content><p>Merriam-Webster defines an <a href=http://www.merriam-webster.com/dictionary/insight target=_blank rel=noopener>insight</a> as <em>an understanding of the true nature of something</em>. Many companies seem to define an insight as <em>any piece of data or information</em>, which I would call a pseudo-insight. This post surveys some examples of pseudo-insights, and discusses how these can be built upon to provide real insights.</p><h2 id=exhibit-a-wordpress-stats>Exhibit A: WordPress stats<a hidden class=anchor aria-hidden=true href=#exhibit-a-wordpress-stats>#</a></h2><p>This website is hosted on <a href=http://wordpress.com target=_blank rel=noopener>wordpress.com</a>. I&rsquo;m generally happy with WordPress – though it&rsquo;s not as exciting and shiny as newer competitors, it is rock-solid and very feature-rich. 
An example of a great WordPress feature is the new stats area (available under <a href=https://wordpress.com/stats target=_blank rel=noopener>wordpress.com/stats</a> if you have a WordPress website). This area includes an insights page, which is full of prime examples of pseudo-insights.</p><p>At the top of the insights page, there is a visualisation of posting activity. As the image below shows, this isn&rsquo;t very interesting for websites like mine. I already know that I post irregularly, because writing a blog post is time-consuming. I suspect that this visualisation isn&rsquo;t very useful even for more active multi-author blogs, as it is essentially just a different way of displaying the raw data of post dates. Without joining this data with other information, we won&rsquo;t gain a better understanding of how the blog is performing and why it performs the way it does.</p><figure><a href=wordpress-insights-posting-activity.png target=_blank rel=noopener><img sizes="(min-width: 768px) 713px,
+<meta name=keywords content="analytics,data science,Facebook,insights,LinkedIn,marketing,WordPress"><meta name=description content="Some companies present raw data or information as &ldquo;insights&rdquo;. This post surveys some examples, and discusses how they can be turned into real insights."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="This holiday season, give me real insights"><meta property="og:description" content="Some companies present raw data or information as &ldquo;insights&rdquo;. 
This post surveys some examples, and discusses how they can be turned into real insights."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"><meta property="og:image" content="https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/dikw-pyramid.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-12-08T06:57:25+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/dikw-pyramid.jpg"><meta name=twitter:title content="This holiday season, give me real insights"><meta name=twitter:description content="Some companies present raw data or information as &ldquo;insights&rdquo;. This post surveys some examples, and discusses how they can be turned into real insights."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"This holiday season, give me real insights","item":"https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"This holiday season, give me real insights","name":"This holiday season, give me real insights","description":"Some companies present raw data or information as \u0026ldquo;insights\u0026rdquo;. 
This post surveys some examples, and discusses how they can be turned into real insights.","keywords":["analytics","data science","Facebook","insights","LinkedIn","marketing","WordPress"],"articleBody":"Merriam-Webster defines an insight as an understanding of the true nature of something. Many companies seem to define an insight as any piece of data or information, which I would call a pseudo-insight. This post surveys some examples of pseudo-insights, and discusses how these can be built upon to provide real insights.\nExhibit A: WordPress stats This website is hosted on wordpress.com. I’m generally happy with WordPress – though it’s not as exciting and shiny as newer competitors, it is rock-solid and very feature-rich. An example of a great WordPress feature is the new stats area (available under wordpress.com/stats if you have a WordPress website). This area includes an insights page, which is full of prime examples of pseudo-insights.\nAt the top of the insights page, there is a visualisation of posting activity. As the image below shows, this isn’t very interesting for websites like mine. I already know that I post irregularly, because writing a blog post is time-consuming. I suspect that this visualisation isn’t very useful even for more active multi-author blogs, as it is essentially just a different way of displaying the raw data of post dates. Without joining this data with other information, we won’t gain a better understanding of how the blog is performing and why it performs the way it does.\nAn attempt to extract more meaningful insights from posting times appears further down the page, in the form of a widget that tells you the most popular day and hour. The help text says that This is the day and hour when you have been getting the most Views on average. The best timing for publishing a post may be around this period. Unfortunately, I’m pretty certain that this isn’t true in my case. 
Monday happens to be the most popular day because that’s when I published two of my most popular posts, and I usually try to spread the word about a new post as soon as I publish it. Further, blog posts can become popular a long time after publication, so it is unlikely that the best timing for publishing a post is around Monday 3pm.\nWhat would real WordPress insights look like? If we stick to idea of exploring the effect of publication timing, I would be curious to know if there is indeed a link between when a post is published and its popularity. Automattic (the company behind WordPress) is in a position to test this, as they can explore data from millions of blogs. My gut feeling is that the time of publication has a negligible effect on popularity. Things that matter much more are a post’s title, content, and effective distribution channels. Given the amount of data that they have, Automattic data scientists can definitely explore all of these factors. This would allow them to surface insights that will help authors drive more quality traffic to their websites.\nExhibit B: Facebook page insights As anyone who manages a Facebook page probably knows, Facebook provides pretty rich analytics of pages on their platform. For example, you can see the likes you’ve received over time and how your posts perform, and slice and dice this information in various ways. This is a great feature, but again, calling it insights is a misuse of the word and somewhat of an insult for those of us who work to extract real insights from data. An analytics dashboard is not insights.\nWhat would real Facebook page insights look like? Working off the assumption that people manage a Facebook page to reach and engage their audience, real insights would enhance a page administrator’s understanding of their audience and improve their ability to engage them and reach new people. 
However, Facebook is famous for having a conflict of interest here, because they require you to pay to reach more people. For example, if a post you shared is performing better than usual, Facebook will send you a notification, asking you to pay to boost the post further. It would be better if they told you what has caused this post to reach more people, and how to reproduce this success with future posts (for free). But this is very unlikely to happen. In the words of CGP Grey: professional sharers cannot trust the platforms upon which they stand, audiences cannot trust the platform to show what they asked to see.\nExhibit C: LinkedIn profile views Who’s viewed your profile is a popular LinkedIn feature. A key part of this feature is a graph that includes your weekly profile views together with actions taken on LinkedIn. The official LinkedIn blog calls this graph the insights graph and provides some examples for its uses:\nSo, for example, if you are trying to attract new clients or business leads, you can see how many potential partners looked at your profile after you joined an important industry group. Or, if you’re looking for a new job, you can look at your insights graph to see whether adding a skill to your profile or endorsing a peer gave you a bigger bump in views by recruiters. No matter your goal, you’ll be able to see which actions lead to the most relevant profile views – then start reaching out and closing the sale or applying for your dream job.\nAs the examples show, the so-called insights graph merely provides information about past actions and profile views on the LinkedIn platform. It is up to you to come up with the insights, but this may be hard if you consider only the actions taken within the walled garden of LinkedIn. For example, as shown in the following graph, my profile views received a boost on the week starting November 23, which was mostly due to publishing a popular post on this website. 
In general, social networks such as LinkedIn, Twitter, and Facebook tend to have a very narrow view of the world – as if the only interesting things happen on the platform. In reality, most of the action happens off-platform, either within other digital assets or in the physical world.\nWhat would real LinkedIn insights look like? First, I think that the focus on profile views is somewhat misguided. It’s not that hard to artificially generate profile views – simply view other people’s profiles. There is no intrinsic value in someone having viewed your profile – the value comes from a connection that leads to an interesting offer or conversation. Second, LinkedIn is about professional networking that is based on real-world activity. As such, it only forms a small part of the world of professional networking by allowing people to have an online presence that makes them contactable by people they don’t already know. When it comes to insights, it’d be useful to know the true causal factors that lead to interesting connections – much more useful than suggestions such as add software development as a skill on your profile to get up to 3% more profile views.\nSummary: Real insights are about the why There are many other examples of pseudo-insights out there. The reason is probably that the field of analytics is becoming increasingly commoditised, and it is easier to rebrand an analytics dashboard as an insights dashboard than to provide real insights. Providing real insights requires moving up the DIKW pyramid from data and information to knowledge and wisdom – from describing the past to learning general lessons that allow you to influence the future. Providing real insights can be very hard, as it often requires inferring the causes of events – the why that comes after the what and how. 
More on this later – I have just started reading Samantha Kleinberg’s Why: A Guide to Finding and Using Causes and will report (hopefully real) insights on causality in future posts.\n","wordCount":"1296","inLanguage":"en","image":"https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/dikw-pyramid.jpg","datePublished":"2015-12-08T06:57:25Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" 
y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">This holiday season, give me real insights</h1><div class=post-meta><span title='2015-12-08 06:57:25 +0000 UTC'>December 8, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-12-08-this-holiday-season-give-me-real-insights/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/dikw-pyramid.jpg alt></figure><div class=post-content><p>Merriam-Webster defines an <a href=http://www.merriam-webster.com/dictionary/insight target=_blank rel=noopener>insight</a> as <em>an understanding of the true nature of something</em>. Many companies seem to define an insight as <em>any piece of data or information</em>, which I would call a pseudo-insight. This post surveys some examples of pseudo-insights, and discusses how these can be built upon to provide real insights.</p><h2 id=exhibit-a-wordpress-stats>Exhibit A: WordPress stats<a hidden class=anchor aria-hidden=true href=#exhibit-a-wordpress-stats>#</a></h2><p>This website is hosted on <a href=http://wordpress.com target=_blank rel=noopener>wordpress.com</a>. I&rsquo;m generally happy with WordPress – though it&rsquo;s not as exciting and shiny as newer competitors, it is rock-solid and very feature-rich. 
An example of a great WordPress feature is the new stats area (available under <a href=https://wordpress.com/stats target=_blank rel=noopener>wordpress.com/stats</a> if you have a WordPress website). This area includes an insights page, which is full of prime examples of pseudo-insights.</p><p>At the top of the insights page, there is a visualisation of posting activity. As the image below shows, this isn&rsquo;t very interesting for websites like mine. I already know that I post irregularly, because writing a blog post is time-consuming. I suspect that this visualisation isn&rsquo;t very useful even for more active multi-author blogs, as it is essentially just a different way of displaying the raw data of post dates. Without joining this data with other information, we won&rsquo;t gain a better understanding of how the blog is performing and why it performs the way it does.</p><figure><a href=wordpress-insights-posting-activity.png target=_blank rel=noopener><img sizes="(min-width: 768px) 713px,
 100vw" srcset="https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/wordpress-insights-posting-activity_huf00e3c8956018a008091108b9751156a_6414_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/wordpress-insights-posting-activity_huf00e3c8956018a008091108b9751156a_6414_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/wordpress-insights-posting-activity.png 713w," src=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/wordpress-insights-posting-activity.png alt="WordPress insights: posting activity" loading=lazy></a></figure><p>An attempt to extract more meaningful insights from posting times appears further down the page, in the form of a widget that tells you the most popular day and hour. The help text says that <em>This is the day and hour when you have been getting the most Views on average. The best timing for publishing a post may be around this period</em>. Unfortunately, I&rsquo;m pretty certain that this isn&rsquo;t true in my case. Monday happens to be the most popular day because that&rsquo;s when I published two of my most popular posts, and I usually try to spread the word about a new post as soon as I publish it. Further, blog posts can become popular a long time after publication, so it is unlikely that the best timing for publishing a post is around Monday 3pm.</p><figure><a href=wordpress-insights-popular-time.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
diff --git a/2016/01/24/the-joys-of-offline-data-collection/index.html b/2016/01/24/the-joys-of-offline-data-collection/index.html
index 613ff95f7..2f33a4536 100644
--- a/2016/01/24/the-joys-of-offline-data-collection/index.html
+++ b/2016/01/24/the-joys-of-offline-data-collection/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The joys of offline data collection | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,deep learning,environment,marine science,personal,predictive modelling,Reef Life Survey,scuba diving"><meta name=description content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The joys of offline data collection"><meta property="og:description" content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish 
with Reef Life Survey."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"><meta property="og:image" content="https://yanirseroussi.com/triaenodon-obesus-whitetip-reef-shark.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-01-24T00:32:25+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/triaenodon-obesus-whitetip-reef-shark.jpg"><meta name=twitter:title content="The joys of offline data collection"><meta name=twitter:description content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The joys of offline data collection","item":"https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The joys of offline data collection","name":"The joys of offline data collection","description":"Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey.","keywords":["data science","deep learning","environment","marine science","personal","predictive modelling","Reef Life Survey","scuba diving"],"articleBody":"Many modern data scientists don’t get to experience data collection in the offline world. Recently, I spent a month sailing down the northern Great Barrier Reef, collecting data for the Reef Life Survey project. 
In addition to being a great diving experience, the trip helped me obtain general insights on data collection and machine learning, which are shared in this article.\nThe Reef Life Survey project Reef Life Survey (RLS) is a citizen scientist project, led by a team from the University of Tasmania. The data collected by RLS volunteers is freely available on the RLS website, and has been used for producing various reports and scientific publications. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef’s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are automatically analysed to classify the type of substrate or growth (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record all the fish species and large swimming animals found in a 5 metre corridor from the line. The second swim (method 2) requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line, targeting invertebrates and cryptic animals. The RLS manual includes all the details on how surveys are performed.\nPerforming RLS surveys is not a trivial task. In the tropics, it is not uncommon to record around 100 fish species on method 1. The scientists running the project are very conscious of the importance of obtaining high-quality data, so training to become an RLS volunteer takes considerable effort and dedication. The process generally consists of doing surveys together with an experienced RLS diver, and comparing the data after each dive. Once the trainee’s data matches that of the experienced RLSer, they are considered good enough to perform surveys independently. 
However, retraining is often required when surveying new ecoregions (e.g., an RLSer trained in Sydney needs further training to survey the Great Barrier Reef).\nRLS requires a lot of hard work, but there are many reasons why it’s worth the effort. As someone who cares about marine conservation, I like the fact that RLS dives yield useful data that is used to drive environmental management decisions. As a scuba diver, I enjoy the opportunity to dive places that are rarely dived and the enhanced knowledge of the marine environment – doing surveys makes me notice things that I would otherwise overlook. Finally, as a data scientist, I find the exposure to the work of marine scientists very educational.\nPre-training and thoughts on supervised learning Doing surveys in the tropics is a completely different story from surveying temperate reefs, due to the substantially higher diversity and abundance of marine creatures. Producing high-quality results requires being able to identify most creatures underwater, while doing the survey. It is possible to write down descriptions and take photos of unidentified species, but doing this for a large number of species is impractical.\nTraining the neural network in my head to classify tropical fish by species was an interesting experience. The approach that worked best was making flashcards using reveal.js, photos scraped from various sources, and past survey data. As the image below shows, each flashcard consists of a single photo, and pressing the down arrow reveals the name of the creature. With some basic JavaScript, I made the presentation select a different subset of photos on each load. Originally, I tried to learn all the 1000+ species that were previously recorded in the northern Great Barrier Reef, but this proved to be too hard – I realised that a better strategy was needed. 
The strategy that I chose was to focus on the most frequently-recorded species: I started by memorising the most frequent ones (e.g., those recorded on more than 50% of surveys), and gradually made it more challenging by decreasing the frequency threshold (e.g., to 25% in 5% steps). This proved to be pretty effective – by the time I started diving I could identify about 50-100 species underwater, even though I had mostly been using static images. It’d be interesting to know whether this kind of approach would be effective in training neural networks (or other batch-trained models) in certain scenarios – spend a few epochs training with instances from a subset of the classes, and gradually increase the number of considered classes. This may be effective when errors on certain classes are more important than others, and may yield different results from simply weighting classes or instances. Please let me know if you know of anyone who has experimented with this idea (update: gwern from Reddit pointed me to the paper Curriculum Learning by Bengio et al., which discusses this idea).\nRLS flashcard example (Chaetodon lunulatus) While repeatedly looking at photos and their labels felt a lot like training an artificial neural network, as a human I have the advantage of being able to easily use information from multiple sources. For example, fish ID books such as Reef Fish Identification: Tropical Pacific provide concise descriptions of the identifying physical features of each fish (see the image below for the book’s entry for Chaetodon lunulatus – the butterflyfish from the flashcard above). Reading those descriptions made me learn more effectively, by helping me focus my attention on the parts that matter for classification. Learning only from static images can be hard when classifying creatures with highly variable colour schemes – using extraneous knowledge about what actually matters when it comes to classification is the way to go in practice. 
Further, features that are hard to decode from photos – like behaviour and habitat – are sometimes crucial to distinguishing different species. One interesting thought is that while photos can be seen as raw data, natural language descriptions are essentially models. Utilising such models is likely to be of benefit in many areas. For example, being able to tell a classifier what to look for in an image would make training a supervised classifier more similar to the way humans learn. This may be achieved using similar techniques to those used for generating image descriptions, except that the goal would be to use descriptions of the classes to improve classification accuracy.\nFish ID example (Chaetodon lunulatus). Source: Reef Fish Identification: Tropical Pacific Another difference between my learning and supervised machine learning is that if I found a creature hard to identify, I would go and look for more photos or videos of them. Videos were especially valuable, because in practice I rarely had to identify static creatures. This approach may be applicable in situations where labelled data is abundant. Sometimes, using all the labelled data makes model training too slow to be practical. An approach I used in the past to overcome this issue is to randomly sample the data, but it often makes sense to sample in a way that yields the best model, e.g., by sampling more instances from classes that are harder to classify.\nOne similarity to supervised machine learning that I encountered was the danger of overfitting. Due to the relatively small number of photos and the fact that I had to view each one of them multiple times, I found that in some cases I memorised the entire photo rather than the creature. This was especially the case with low-quality photos or ones that were missing key features. My regularisation approach consisted of trying to memorise the descriptions from the book, and collecting more photos. 
I wish more algorithms were this self-conscious about overfitting!\nCan’t this be automated? While doing surveys and studying species, I kept asking myself whether the whole thing can be automated. Thanks to deep learning, computers have recently gotten very good at classifying images, sometimes outperforming humans. It seems likely that at some point the survey methodology would be changed to just taking a video of the dive, and letting an algorithm do the hard job of identifying the creatures. Analysis of the bottom photos is automated, so it is reasonable to automate the other survey methods as well. However, there are quite a few challenges that need to be overcome before full automation can be implemented.\nIf the results of the LifeCLEF 2015 Fish Task are any indication, we are quite far from automating fish identification. The precision of the top methods in that challenge was around 80% for identifying 15 fish species from underwater videos, where the chosen species are quite distinct from each other. In tropical surveys it is not uncommon to record around 100 fish species along the 50 metre transect, with many species being similar to each other. It’s usually the case that it’s not same species on every dive (even at the same site), so replacing humans would require training a highly accurate classifier on thousands of species.\nDealing with high diversity isn’t the only challenge in automating RLS. The appearance of many species varies by gender and age, so the classifier would have to learn all those variations (see image below for an example). Getting good training data can be very challenging, since the labelling process is labour-intensive, and elements like colour and backscatter are highly dependent on dive site conditions and the quality of the camera. Another complication is that RLS data includes size estimates, which can be hard to obtain from videos and photos without knowing how far the camera was from the subject and the type of lens used. 
In addition, accounting for side information (geolocation, behaviour, depth, etc.) can make a huge difference in accurately identifying species, but it isn’t easy to integrate with some learning models. Finally, it is likely that some species will be missed when videos are taken without any identification done underwater, because RLSers tend to get good photos of species that they know will be hard to identify, even if it means spending more time at one spot or shining strobes under ledges.\nChlorurus sordidus variations. Source: Tropical Marine Fishes of Australia Another aspect of automating surveys is completely removing the need for human divers by sending robots down. This is an active research area, and is the only way of surveying deep waters. However, this approach still requires a boat-based crew to deploy the robots. It may also yield different data from RLS for cryptic species, though this depends on the type of robots used. In addition, there’s the issue of cost – RLS relies on volunteer scuba divers who are diving anyway, so the cost of getting RLSers to do surveys is rather low (especially for shore dives near a diver’s home, where there is no cost to RLS). Further, RLS’s mission is “to inspire and engage a global volunteer community to survey reefs using scientific methods and share knowledge about marine ecosystem health”. Engaging the community is a crucial part of RLS because robots do not care about the environment. Humans do.\nSmall data is valuable When compared to datasets commonly encountered online, RLS data is small. As the image below shows, fewer than 10,000 surveys have been conducted to date. However, this data is still valuable, as it provides a high-quality snapshot of the state of marine ecosystems in areas that wouldn’t be surveyed if it wasn’t for RLS volunteers. 
For example, in a recent Nature article, the authors used RLS data to assess the vulnerability of marine fauna to global warming.\nRLS surveys by Australian financial year (July-June). Source: RLS Foundation Annual Report 2015 Each RLS survey requires several hours of work. In addition to performing the survey itself, a lot of work goes into entering the data and verifying its quality. Getting to the survey sites is not always a trivial task, especially for remote sites such as some of those we dived on my recent trip. Spending a month diving the Great Barrier Reef is a good way of appreciating its greatness. As the map shows, the surveys we did covered only the top part of the reef’s 2300 kilometres, and we only sampled a few sites within that part. The Great Barrier Reef is very vast, and it is hard to convey its vastness with just words or a map. You have to be there to understand – it is quite humbling.\nIn summary, the RLS experience has given me a new appreciation for small data in the offline world. Offline data collection is often expensive and labour-intensive – you need to work hard to produce a few high-quality data points. But the size of your data doesn’t matter (though having more quality data is always good). What really matters is what you do with the data – and the RLS team and their collaborators have been doing quite a lot. The RLS experience also illustrates the importance of domain expertise: I’ve looked at the RLS datasets, but I have no idea what questions are worth asking and answering using those datasets. The RLS project is yet another example of how in science collecting data is time-consuming, and coming up with appropriate research questions is hard. 
It is a lot of fun, though.\n","wordCount":"2207","inLanguage":"en","image":"https://yanirseroussi.com/triaenodon-obesus-whitetip-reef-shark.jpg","datePublished":"2016-01-24T00:32:25Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul 
id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The joys of offline data collection</h1><div class=post-meta><span title='2016-01-24 00:32:25 +0000 UTC'>January 24, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-01-24-the-joys-of-offline-data-collection/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg 3220w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg alt width=3220 height=1310></figure><div class=post-content><p>Many modern data scientists don&rsquo;t get to experience data collection in the offline world. Recently, I spent a month sailing down the northern Great Barrier Reef, collecting data for the <a href=http://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey project</a>. In addition to being a great diving experience, the trip helped me obtain general insights on data collection and machine learning, which are shared in this article.</p><h2 id=the-reef-life-survey-project>The Reef Life Survey project<a hidden class=anchor aria-hidden=true href=#the-reef-life-survey-project>#</a></h2><p>Reef Life Survey (RLS) is a citizen scientist project, led by a team from the University of Tasmania. The <a href=http://reeflifesurvey.com/reef-life-survey/survey-data/ target=_blank rel=noopener>data collected by RLS volunteers is freely available on the RLS website</a>, and has been used for producing <a href=http://reeflifesurvey.com/scientific-papers/ target=_blank rel=noopener>various reports and scientific publications</a>. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef&rsquo;s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are automatically analysed to <a href=https://drive.google.com/file/d/0B9XQg8_HWQVPU2NweEFmcEJYQTQ/view target=_blank rel=noopener>classify the type of substrate or growth</a> (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record <strong>all</strong> the fish species and large swimming animals found in a 5 metre corridor from the line. 
The second swim (method 2) requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line, targeting invertebrates and cryptic animals. The <a href=http://reeflifesurvey.com/wp-content/uploads/2015/07/NEW-Methods-Manual_150815.pdf target=_blank rel=noopener>RLS manual</a> includes all the details on how surveys are performed.</p><p>Performing RLS surveys is not a trivial task. In the tropics, it is not uncommon to record around 100 fish species on method 1. The scientists running the project are very conscious of the importance of obtaining high-quality data, so training to become an RLS volunteer takes considerable effort and dedication. The process generally consists of doing surveys together with an experienced RLS diver, and comparing the data after each dive. Once the trainee&rsquo;s data matches that of the experienced RLSer, they are considered good enough to perform surveys independently. However, retraining is often required when surveying new ecoregions (e.g., an RLSer trained in Sydney needs further training to survey the Great Barrier Reef).</p><p>RLS requires a lot of hard work, but there are many reasons why it&rsquo;s worth the effort. As someone who cares about marine conservation, I like the fact that RLS dives yield useful data that is used to drive environmental management decisions. As a scuba diver, I enjoy the opportunity to dive places that are rarely dived and the enhanced knowledge of the marine environment – doing surveys makes me notice things that I would otherwise overlook. 
Finally, as a data scientist, I find the exposure to the work of marine scientists very educational.</p><h2 id=pre-training-and-thoughts-on-supervised-learning>Pre-training and thoughts on supervised learning<a hidden class=anchor aria-hidden=true href=#pre-training-and-thoughts-on-supervised-learning>#</a></h2><p>Doing surveys in the tropics is a completely different story from surveying temperate reefs, due to the substantially higher diversity and abundance of marine creatures. Producing high-quality results requires being able to identify most creatures underwater, while doing the survey. It is possible to write down descriptions and take photos of unidentified species, but doing this for a large number of species is impractical.</p><p>Training the neural network in my head to classify tropical fish by species was an interesting experience. The approach that worked best was making flashcards using <a href=http://lab.hakim.se/reveal-js/ target=_blank rel=noopener>reveal.js</a>, photos scraped from various sources, and past survey data. As the image below shows, each flashcard consists of a single photo, and pressing the down arrow reveals the name of the creature. With some basic JavaScript, I made the presentation select a different subset of photos on each load. Originally, I tried to learn all the 1000+ species that were previously recorded in the northern Great Barrier Reef, but this proved to be too hard – I realised that a better strategy was needed. The strategy that I chose was to focus on the most frequently-recorded species: I started by memorising the most frequent ones (e.g., those recorded on more than 50% of surveys), and gradually made it more challenging by decreasing the frequency threshold (e.g., to 25% in 5% steps). This proved to be pretty effective – by the time I started diving I could identify about 50-100 species underwater, even though I had mostly been using static images. 
It&rsquo;d be interesting to know whether this kind of approach would be effective in training neural networks (or other batch-trained models) in certain scenarios – spend a few epochs training with instances from a subset of the classes, and gradually increase the number of considered classes. This may be effective when errors on certain classes are more important than others, and may yield different results from simply weighting classes or instances. Please <a href=https://yanirseroussi.com/about/>let me know</a> if you know of anyone who has experimented with this idea (<strong>update:</strong> <a href=https://www.reddit.com/r/MachineLearning/comments/42dp7l/the_joys_of_offline_data_collection_including/cz9jqev target=_blank rel=noopener>gwern from Reddit</a> pointed me to the paper <a href=http://ronan.collobert.com/pub/matos/2009_curriculum_icml.pdf target=_blank rel=noopener>Curriculum Learning</a> by Bengio et al., which discusses this idea).</p><figure><a href=rls-flashcard.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,deep learning,environment,marine science,personal,predictive modelling,Reef Life Survey,scuba diving"><meta name=description content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The joys of offline data collection"><meta property="og:description" content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish 
with Reef Life Survey."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"><meta property="og:image" content="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-01-24T00:32:25+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg"><meta name=twitter:title content="The joys of offline data collection"><meta name=twitter:description content="Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The joys of offline data collection","item":"https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The joys of offline data collection","name":"The joys of offline data collection","description":"Insights on data collection and machine learning from spending a month sailing, diving, and counting fish with Reef Life Survey.","keywords":["data science","deep learning","environment","marine science","personal","predictive modelling","Reef Life Survey","scuba diving"],"articleBody":"Many modern data scientists don’t get to experience data collection in the offline world. 
Recently, I spent a month sailing down the northern Great Barrier Reef, collecting data for the Reef Life Survey project. In addition to being a great diving experience, the trip helped me obtain general insights on data collection and machine learning, which are shared in this article.\nThe Reef Life Survey project Reef Life Survey (RLS) is a citizen scientist project, led by a team from the University of Tasmania. The data collected by RLS volunteers is freely available on the RLS website, and has been used for producing various reports and scientific publications. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef’s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are automatically analysed to classify the type of substrate or growth (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record all the fish species and large swimming animals found in a 5 metre corridor from the line. The second swim (method 2) requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line, targeting invertebrates and cryptic animals. The RLS manual includes all the details on how surveys are performed.\nPerforming RLS surveys is not a trivial task. In the tropics, it is not uncommon to record around 100 fish species on method 1. The scientists running the project are very conscious of the importance of obtaining high-quality data, so training to become an RLS volunteer takes considerable effort and dedication. The process generally consists of doing surveys together with an experienced RLS diver, and comparing the data after each dive. Once the trainee’s data matches that of the experienced RLSer, they are considered good enough to perform surveys independently. 
However, retraining is often required when surveying new ecoregions (e.g., an RLSer trained in Sydney needs further training to survey the Great Barrier Reef).\nRLS requires a lot of hard work, but there are many reasons why it’s worth the effort. As someone who cares about marine conservation, I like the fact that RLS dives yield useful data that is used to drive environmental management decisions. As a scuba diver, I enjoy the opportunity to dive places that are rarely dived and the enhanced knowledge of the marine environment – doing surveys makes me notice things that I would otherwise overlook. Finally, as a data scientist, I find the exposure to the work of marine scientists very educational.\nPre-training and thoughts on supervised learning Doing surveys in the tropics is a completely different story from surveying temperate reefs, due to the substantially higher diversity and abundance of marine creatures. Producing high-quality results requires being able to identify most creatures underwater, while doing the survey. It is possible to write down descriptions and take photos of unidentified species, but doing this for a large number of species is impractical.\nTraining the neural network in my head to classify tropical fish by species was an interesting experience. The approach that worked best was making flashcards using reveal.js, photos scraped from various sources, and past survey data. As the image below shows, each flashcard consists of a single photo, and pressing the down arrow reveals the name of the creature. With some basic JavaScript, I made the presentation select a different subset of photos on each load. Originally, I tried to learn all the 1000+ species that were previously recorded in the northern Great Barrier Reef, but this proved to be too hard – I realised that a better strategy was needed. 
The strategy that I chose was to focus on the most frequently-recorded species: I started by memorising the most frequent ones (e.g., those recorded on more than 50% of surveys), and gradually made it more challenging by decreasing the frequency threshold (e.g., to 25% in 5% steps). This proved to be pretty effective – by the time I started diving I could identify about 50-100 species underwater, even though I had mostly been using static images. It’d be interesting to know whether this kind of approach would be effective in training neural networks (or other batch-trained models) in certain scenarios – spend a few epochs training with instances from a subset of the classes, and gradually increase the number of considered classes. This may be effective when errors on certain classes are more important than others, and may yield different results from simply weighting classes or instances. Please let me know if you know of anyone who has experimented with this idea (update: gwern from Reddit pointed me to the paper Curriculum Learning by Bengio et al., which discusses this idea).\nRLS flashcard example (Chaetodon lunulatus) While repeatedly looking at photos and their labels felt a lot like training an artificial neural network, as a human I have the advantage of being able to easily use information from multiple sources. For example, fish ID books such as Reef Fish Identification: Tropical Pacific provide concise descriptions of the identifying physical features of each fish (see the image below for the book’s entry for Chaetodon lunulatus – the butterflyfish from the flashcard above). Reading those descriptions made me learn more effectively, by helping me focus my attention on the parts that matter for classification. Learning only from static images can be hard when classifying creatures with highly variable colour schemes – using extraneous knowledge about what actually matters when it comes to classification is the way to go in practice. 
Further, features that are hard to decode from photos – like behaviour and habitat – are sometimes crucial to distinguishing different species. One interesting thought is that while photos can be seen as raw data, natural language descriptions are essentially models. Utilising such models is likely to be of benefit in many areas. For example, being able to tell a classifier what to look for in an image would make training a supervised classifier more similar to the way humans learn. This may be achieved using similar techniques to those used for generating image descriptions, except that the goal would be to use descriptions of the classes to improve classification accuracy.\nFish ID example (Chaetodon lunulatus). Source: Reef Fish Identification: Tropical Pacific Another difference between my learning and supervised machine learning is that if I found a creature hard to identify, I would go and look for more photos or videos of them. Videos were especially valuable, because in practice I rarely had to identify static creatures. This approach may be applicable in situations where labelled data is abundant. Sometimes, using all the labelled data makes model training too slow to be practical. An approach I used in the past to overcome this issue is to randomly sample the data, but it often makes sense to sample in a way that yields the best model, e.g., by sampling more instances from classes that are harder to classify.\nOne similarity to supervised machine learning that I encountered was the danger of overfitting. Due to the relatively small number of photos and the fact that I had to view each one of them multiple times, I found that in some cases I memorised the entire photo rather than the creature. This was especially the case with low-quality photos or ones that were missing key features. My regularisation approach consisted of trying to memorise the descriptions from the book, and collecting more photos. 
I wish more algorithms were this self-conscious about overfitting!\nCan’t this be automated? While doing surveys and studying species, I kept asking myself whether the whole thing can be automated. Thanks to deep learning, computers have recently gotten very good at classifying images, sometimes outperforming humans. It seems likely that at some point the survey methodology would be changed to just taking a video of the dive, and letting an algorithm do the hard job of identifying the creatures. Analysis of the bottom photos is automated, so it is reasonable to automate the other survey methods as well. However, there are quite a few challenges that need to be overcome before full automation can be implemented.\nIf the results of the LifeCLEF 2015 Fish Task are any indication, we are quite far from automating fish identification. The precision of the top methods in that challenge was around 80% for identifying 15 fish species from underwater videos, where the chosen species are quite distinct from each other. In tropical surveys it is not uncommon to record around 100 fish species along the 50 metre transect, with many species being similar to each other. It’s usually the case that it’s not the same species on every dive (even at the same site), so replacing humans would require training a highly accurate classifier on thousands of species.\nDealing with high diversity isn’t the only challenge in automating RLS. The appearance of many species varies by gender and age, so the classifier would have to learn all those variations (see image below for an example). Getting good training data can be very challenging, since the labelling process is labour-intensive, and elements like colour and backscatter are highly dependent on dive site conditions and the quality of the camera. Another complication is that RLS data includes size estimates, which can be hard to obtain from videos and photos without knowing how far the camera was from the subject and the type of lens used. 
In addition, accounting for side information (geolocation, behaviour, depth, etc.) can make a huge difference in accurately identifying species, but it isn’t easy to integrate with some learning models. Finally, it is likely that some species will be missed when videos are taken without any identification done underwater, because RLSers tend to get good photos of species that they know will be hard to identify, even if it means spending more time at one spot or shining strobes under ledges.\nChlorurus sordidus variations. Source: Tropical Marine Fishes of Australia Another aspect of automating surveys is completely removing the need for human divers by sending robots down. This is an active research area, and is the only way of surveying deep waters. However, this approach still requires a boat-based crew to deploy the robots. It may also yield different data from RLS for cryptic species, though this depends on the type of robots used. In addition, there’s the issue of cost – RLS relies on volunteer scuba divers who are diving anyway, so the cost of getting RLSers to do surveys is rather low (especially for shore dives near a diver’s home, where there is no cost to RLS). Further, RLS’s mission is “to inspire and engage a global volunteer community to survey reefs using scientific methods and share knowledge about marine ecosystem health”. Engaging the community is a crucial part of RLS because robots do not care about the environment. Humans do.\nSmall data is valuable When compared to datasets commonly encountered online, RLS data is small. As the image below shows, fewer than 10,000 surveys have been conducted to date. However, this data is still valuable, as it provides a high-quality snapshot of the state of marine ecosystems in areas that wouldn’t be surveyed if it wasn’t for RLS volunteers. 
For example, in a recent Nature article, the authors used RLS data to assess the vulnerability of marine fauna to global warming.\nRLS surveys by Australian financial year (July-June). Source: RLS Foundation Annual Report 2015 Each RLS survey requires several hours of work. In addition to performing the survey itself, a lot of work goes into entering the data and verifying its quality. Getting to the survey sites is not always a trivial task, especially for remote sites such as some of those we dived on my recent trip. Spending a month diving the Great Barrier Reef is a good way of appreciating its greatness. As the map shows, the surveys we did covered only the top part of the reef’s 2300 kilometres, and we only sampled a few sites within that part. The Great Barrier Reef is very vast, and it is hard to convey its vastness with just words or a map. You have to be there to understand – it is quite humbling.\nIn summary, the RLS experience has given me a new appreciation for small data in the offline world. Offline data collection is often expensive and labour-intensive – you need to work hard to produce a few high-quality data points. But the size of your data doesn’t matter (though having more quality data is always good). What really matters is what you do with the data – and the RLS team and their collaborators have been doing quite a lot. The RLS experience also illustrates the importance of domain expertise: I’ve looked at the RLS datasets, but I have no idea what questions are worth asking and answering using those datasets. The RLS project is yet another example of how in science collecting data is time-consuming, and coming up with appropriate research questions is hard. 
It is a lot of fun, though.\n","wordCount":"2207","inLanguage":"en","image":"https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg","datePublished":"2016-01-24T00:32:25Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" 
x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The joys of offline data collection</h1><div class=post-meta><span title='2016-01-24 00:32:25 +0000 UTC'>January 24, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-01-24-the-joys-of-offline-data-collection/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark_hu5b48ea845b0512937c3ac1259641b3e3_859311_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg 3220w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/triaenodon-obesus-whitetip-reef-shark.jpg alt width=3220 height=1310></figure><div class=post-content><p>Many modern data scientists don&rsquo;t get to experience data collection in the offline world. Recently, I spent a month sailing down the northern Great Barrier Reef, collecting data for the <a href=http://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey project</a>. In addition to being a great diving experience, the trip helped me obtain general insights on data collection and machine learning, which are shared in this article.</p><h2 id=the-reef-life-survey-project>The Reef Life Survey project<a hidden class=anchor aria-hidden=true href=#the-reef-life-survey-project>#</a></h2><p>Reef Life Survey (RLS) is a citizen scientist project, led by a team from the University of Tasmania. The <a href=http://reeflifesurvey.com/reef-life-survey/survey-data/ target=_blank rel=noopener>data collected by RLS volunteers is freely available on the RLS website</a>, and has been used for producing <a href=http://reeflifesurvey.com/scientific-papers/ target=_blank rel=noopener>various reports and scientific publications</a>. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef&rsquo;s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are automatically analysed to <a href=https://drive.google.com/file/d/0B9XQg8_HWQVPU2NweEFmcEJYQTQ/view target=_blank rel=noopener>classify the type of substrate or growth</a> (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record <strong>all</strong> the fish species and large swimming animals found in a 5 metre corridor from the line. 
The second swim (method 2) requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line, targeting invertebrates and cryptic animals. The <a href=http://reeflifesurvey.com/wp-content/uploads/2015/07/NEW-Methods-Manual_150815.pdf target=_blank rel=noopener>RLS manual</a> includes all the details on how surveys are performed.</p><p>Performing RLS surveys is not a trivial task. In the tropics, it is not uncommon to record around 100 fish species on method 1. The scientists running the project are very conscious of the importance of obtaining high-quality data, so training to become an RLS volunteer takes considerable effort and dedication. The process generally consists of doing surveys together with an experienced RLS diver, and comparing the data after each dive. Once the trainee&rsquo;s data matches that of the experienced RLSer, they are considered good enough to perform surveys independently. However, retraining is often required when surveying new ecoregions (e.g., an RLSer trained in Sydney needs further training to survey the Great Barrier Reef).</p><p>RLS requires a lot of hard work, but there are many reasons why it&rsquo;s worth the effort. As someone who cares about marine conservation, I like the fact that RLS dives yield useful data that is used to drive environmental management decisions. As a scuba diver, I enjoy the opportunity to dive places that are rarely dived and the enhanced knowledge of the marine environment – doing surveys makes me notice things that I would otherwise overlook. 
Finally, as a data scientist, I find the exposure to the work of marine scientists very educational.</p><h2 id=pre-training-and-thoughts-on-supervised-learning>Pre-training and thoughts on supervised learning<a hidden class=anchor aria-hidden=true href=#pre-training-and-thoughts-on-supervised-learning>#</a></h2><p>Doing surveys in the tropics is a completely different story from surveying temperate reefs, due to the substantially higher diversity and abundance of marine creatures. Producing high-quality results requires being able to identify most creatures underwater, while doing the survey. It is possible to write down descriptions and take photos of unidentified species, but doing this for a large number of species is impractical.</p><p>Training the neural network in my head to classify tropical fish by species was an interesting experience. The approach that worked best was making flashcards using <a href=http://lab.hakim.se/reveal-js/ target=_blank rel=noopener>reveal.js</a>, photos scraped from various sources, and past survey data. As the image below shows, each flashcard consists of a single photo, and pressing the down arrow reveals the name of the creature. With some basic JavaScript, I made the presentation select a different subset of photos on each load. Originally, I tried to learn all the 1000+ species that were previously recorded in the northern Great Barrier Reef, but this proved to be too hard – I realised that a better strategy was needed. The strategy that I chose was to focus on the most frequently-recorded species: I started by memorising the most frequent ones (e.g., those recorded on more than 50% of surveys), and gradually made it more challenging by decreasing the frequency threshold (e.g., to 25% in 5% steps). This proved to be pretty effective – by the time I started diving I could identify about 50-100 species underwater, even though I had mostly been using static images. 
It&rsquo;d be interesting to know whether this kind of approach would be effective in training neural networks (or other batch-trained models) in certain scenarios – spend a few epochs training with instances from a subset of the classes, and gradually increase the number of considered classes. This may be effective when errors on certain classes are more important than others, and may yield different results from simply weighting classes or instances. Please <a href=https://yanirseroussi.com/about/>let me know</a> if you know of anyone who has experimented with this idea (<strong>update:</strong> <a href=https://www.reddit.com/r/MachineLearning/comments/42dp7l/the_joys_of_offline_data_collection_including/cz9jqev target=_blank rel=noopener>gwern from Reddit</a> pointed me to the paper <a href=http://ronan.collobert.com/pub/matos/2009_curriculum_icml.pdf target=_blank rel=noopener>Curriculum Learning</a> by Bengio et al., which discusses this idea).</p><figure><a href=rls-flashcard.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/rls-flashcard_hu3a07a6eda7f0dd6656303f37c93114ae_407004_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/rls-flashcard_hu3a07a6eda7f0dd6656303f37c93114ae_407004_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/rls-flashcard_hu3a07a6eda7f0dd6656303f37c93114ae_407004_720x0_resize_box_3.png 720w,
diff --git a/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html b/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html
index 7ee3a0238..1f6380249 100644
--- a/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html
+++ b/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Why you should stop worrying about deep learning and deepen your understanding of causality instead | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,causal inference,data science,deep learning,insights,machine learning,predictive modelling"><meta name=description content="Causality is often overlooked but is of much higher relevance to most data scientists than deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Why you should stop worrying about deep learning and deepen your understanding of causality instead"><meta property="og:description" content="Causality is 
often overlooked but is of much higher relevance to most data scientists than deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"><meta property="og:image" content="https://yanirseroussi.com/correlation-xkcd.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-02-14T11:04:11+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/correlation-xkcd.png"><meta name=twitter:title content="Why you should stop worrying about deep learning and deepen your understanding of causality instead"><meta name=twitter:description content="Causality is often overlooked but is of much higher relevance to most data scientists than deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Why you should stop worrying about deep learning and deepen your understanding of causality instead","item":"https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Why you should stop worrying about deep learning and deepen your understanding of causality instead","name":"Why you should stop worrying about deep learning and deepen your understanding of causality instead","description":"Causality is often overlooked but is of much higher relevance to most data scientists than deep learning.","keywords":["analytics","causal inference","data 
science","deep learning","insights","machine learning","predictive modelling"],"articleBody":"Everywhere you go these days, you hear about deep learning’s impressive advancements. New deep learning libraries, tools, and products get announced on a regular basis, making the average data scientist feel like they’re missing out if they don’t hop on the deep learning bandwagon. However, as Kamil Bartocha put it in his post The Inconvenient Truth About Data Science, 95% of tasks do not require deep learning. This is obviously a made up number, but it’s probably an accurate representation of the everyday reality of many data scientists. This post discusses an often-overlooked area of study that is of much higher relevance to most data scientists than deep learning: causality.\nCausality is everywhere An understanding of cause and effect is something that is not unique to humans. For example, the many videos of cats knocking things off tables appear to exemplify experimentation by animals. If you are not familiar with such videos, it can easily be fixed. The thing to notice is that cats appear genuinely curious about what happens when they push an object. And they tend to repeat the experiment to verify that if you push something off, it falls to the ground.\nHumans rely on much more complex causal analysis than that done by cats – an understanding of the long-term effects of one’s actions is crucial to survival. Science, as defined by Wikipedia, is a systematic enterprise that creates, builds and organizes knowledge in the form of testable explanations and predictions about the universe. Causal analysis is key to producing explanations and predictions that are valid and sound, which is why understanding causality is so important to data scientists, traditional scientists, and all humans.\nWhat is causality? It is surprisingly hard to define causality. Just like cats, we all have an intuitive sense of what causality is, but things get complicated on deeper inspection. 
For example, few people would disagree with the statement that smoking causes cancer. But does it cause cancer immediately? Would smoking a few cigarettes today and never again cause cancer? Do all smokers develop cancer eventually? What about light smokers who live in areas with heavy air pollution?\nSamantha Kleinberg summarises it very well in her book, Why: A Guide to Finding and Using Causes:\nWhile most definitions of causality are based on Hume’s work, none of the ones we can come up with cover all possible cases and each one has counterexamples another does not. For instance, a medication may lead to side effects in only a small fraction of users (so we can’t assume that a cause will always produce an effect), and seat belts normally prevent death but can cause it in some car accidents (so we need to allow for factors that can have mixed producer/preventer roles depending on context).\nThe question often boils down to whether we should see causes as a fundamental building block or force of the world (that can’t be further reduced to any other laws), or if this structure is something we impose. As with nearly every facet of causality, there is disagreement on this point (and even disagreement about whether particular theories are compatible with this notion, which is called causal realism). Some have felt that causes are so hard to find as for the search to be hopeless and, further, that once we have some physical laws, those are more useful than causes anyway. That is, “causes” may be a mere shorthand for things like triggers, pushes, repels, prevents, and so on, rather than a fundamental notion.\nIt is somewhat surprising, given how central the idea of causality is to our daily lives, but there is simply no unified philosophical theory of what causes are, and no single foolproof computational method for finding them with absolute certainty. 
What makes this even more challenging is that, depending on one’s definition of causality, different factors may be identified as causes in the same situation, and it may not be clear what the ground truth is.\nWhy study causality now? While it’s hard to conclusively prove, it seems to me like interest in formal causal analysis has increased in recent years. My hypothesis is that it’s just a natural progression along the levels of data’s hierarchy of needs. At the start of the big data boom, people were mostly concerned with storing and processing large amounts of data (e.g., using Hadoop, Elasticsearch, or your favourite NoSQL database). Just having your data flowing through pipelines is nice, but not very useful, so the focus switched to reporting and visualisation to extract insights about what happened (commonly known as business intelligence). While having a good picture of what happened is great, it isn’t enough – you can make better decisions if you can predict what’s going to happen, so the focus switched again to predictive analytics. Those who are familiar with predictive analytics know that models often end up relying on correlations between the features and the predicted labels. Using such models without considering the meaning of the variables can lead us to erroneous conclusions, and potentially harmful interventions. For example, based on the following graph we may make a recommendation that the US government decrease its spending on science to reduce the number of suicides by hanging.\nSource: Spurious Correlations by Tyler Vigen Causal analysis aims to identify factors that are independent of spurious correlations, allowing stakeholders to make well-informed decisions. It is all about getting to the top of the DIKW (data-information-knowledge-wisdom) pyramid by understanding why things happen and what we can do to change the world. However, finding true causes can be very hard, especially in cases where you can’t perform experiments. 
Judea Pearl explains it well:\nWe know, from first principles, that any causal conclusion drawn from observational studies must rest on untested causal assumptions. Cartwright (1989) named this principle ‘no causes in, no causes out,’ which follows formally from the theory of equivalent models (Verma and Pearl, 1990); for any model yielding a conclusion C, one can construct a statistically equivalent model that refutes C and fits the data equally well.\nWhat this means in practice is that you can’t, for example, conclusively prove that smoking causes cancer without making some reasonable assumptions about the mechanisms at play. For ethical reasons, we can’t perform a randomly controlled trial where a test group is forced to smoke for years while a control group is forced not to smoke. Therefore, our conclusions about the causal link between smoking and cancer are drawn from observational studies and an understanding of the mechanisms by which various cancers develop (e.g., the effect of cigarette smoke on individual cells can be studied without forcing people to smoke). Cancer Tobacco companies have exploited this fact for years, making the claim that the probability of both cancer and smoking is raised by some mysterious genetic factors. Fossil fuel and food companies use similar arguments to sell their products and block attempts to regulate their industries (as discussed in previous posts on the hardest parts of data science and nutritionism). Fighting against such arguments is an uphill battle, as it is easy to sow doubt with a few simplistic catchphrases, while proving and communicating causality to laypeople is much harder (or impossible when it comes to deeply-held irrational beliefs).\nMy causality journey is just beginning My interest in formal causal analysis was seeded a couple of years ago, with a reading group that was dedicated to Judea Pearl’s work. We didn’t get very far, as I was a bit disappointed with what causal calculus can and cannot do. 
This may have been because I didn’t come in with the right expectations – I expected a black box that automatically finds causes. Recently reading Samantha Kleinberg’s excellent book Why: A Guide to Finding and Using Causes has made my expectations somewhat more realistic:\nThousands of years after Aristotle’s seminal work on causality, hundreds of years after Hume gave us two definitions of it, and decades after automated inference became a possibility through powerful new computers, causality is still an unsolved problem. Humans are prone to seeing causality where it does not exist and our algorithms aren’t foolproof. Even worse, once we find a cause it’s still hard to use this information to prevent or produce an outcome because of limits on what information we can collect and how we can understand it. After looking at all the cases where methods haven’t worked and researchers and policy makers have gotten causality really wrong, you might wonder why you should bother.\n[…]\nRather than giving up on causality, what we need to give up on is the idea of having a black box that takes some data straight from its source and emits a stream of causes with no need for interpretation or human intervention. Causal inference is necessary and possible, but it is not perfect and, most importantly, it requires domain knowledge.\nKleinberg’s book is a great general intro to causality, but it intentionally omits the mathematical details behind the various methods. I am now ready to once again go deeper into causality, perhaps starting with Kleinberg’s more technical book, Causality, Probability, and Time. 
Other recommendations are very welcome!\nCover image source: xkcd: Correlation ","wordCount":"1532","inLanguage":"en","image":"https://yanirseroussi.com/correlation-xkcd.png","datePublished":"2016-02-14T11:04:11Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Why you should stop worrying about deep learning and deepen your understanding of causality instead</h1><div class=post-meta><span title='2016-02-14 11:04:11 +0000 UTC'>February 14, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-02-14-why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/correlation-xkcd.png alt></figure><div class=post-content><p>Everywhere you go these days, you hear about deep learning&rsquo;s impressive advancements. New deep learning libraries, tools, and products get announced on a regular basis, making the average data scientist feel like they&rsquo;re missing out if they don&rsquo;t <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/ target=_blank rel=noopener>hop on the deep learning bandwagon</a>. However, as Kamil Bartocha put it in his post <a href=https://www.linkedin.com/pulse/inconvenient-truth-data-science-kamil-bartocha target=_blank rel=noopener>The Inconvenient Truth About Data Science</a>, <em>95% of tasks do not require deep learning</em>. 
This is obviously <a href=http://dilbert.com/strip/2008-05-08 target=_blank rel=noopener>a made up number</a>, but it&rsquo;s probably an accurate representation of the everyday reality of many data scientists. This post discusses an often-overlooked area of study that is of much higher relevance to most data scientists than deep learning: <strong>causality</strong>.</p><h2 id=causality-is-everywhere>Causality is everywhere<a hidden class=anchor aria-hidden=true href=#causality-is-everywhere>#</a></h2><p>An understanding of cause and effect is something that is not unique to humans. For example, the many videos of cats knocking things off tables appear to exemplify experimentation by animals. If you are not familiar with such videos, <a href="https://www.youtube.com/results?search_query=cat+knocking+stuff+off" target=_blank rel=noopener>it can easily be fixed</a>. The thing to notice is that cats appear genuinely curious about what happens when they push an object. And they tend to repeat the experiment to verify that if you push something off, it falls to the ground.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/UoUEQYjYgf4 style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>Humans rely on much more complex causal analysis than that done by cats – an understanding of the long-term effects of one&rsquo;s actions is crucial to survival. <a href=https://en.wikipedia.org/wiki/Science target=_blank rel=noopener>Science, as defined by Wikipedia</a>, <em>is a systematic enterprise that creates, builds and organizes knowledge in the form of testable explanations and predictions about the universe</em>. 
Causal analysis is key to producing explanations and predictions that are valid and sound, which is why understanding causality is so important to data scientists, traditional scientists, and all humans.</p><h2 id=what-is-causality>What is causality?<a hidden class=anchor aria-hidden=true href=#what-is-causality>#</a></h2><p>It is surprisingly hard to define causality. Just like cats, we all have an intuitive sense of what causality is, but things get complicated on deeper inspection. For example, few people would disagree with the statement that <em>smoking causes cancer</em>. But does it cause cancer immediately? Would smoking a few cigarettes today and never again cause cancer? Do all smokers develop cancer eventually? What about light smokers who live in areas with heavy air pollution?</p><p>Samantha Kleinberg summarises it very well in her book, <a href=http://www.skleinberg.org/why/ target=_blank rel=noopener>Why: A Guide to Finding and Using Causes</a>:</p><blockquote><p>While most definitions of causality are based on <a href=https://en.wikipedia.org/wiki/David_Hume target=_blank rel=noopener>Hume&rsquo;s work</a>, none of the ones we can come up with cover all possible cases and each one has counterexamples another does not. For instance, a medication may lead to side effects in only a small fraction of users (so we can&rsquo;t assume that a cause will always produce an effect), and seat belts normally prevent death but can cause it in some car accidents (so we need to allow for factors that can have mixed producer/preventer roles depending on context).</p><p>The question often boils down to whether we should see causes as a fundamental building block or force of the world (that can&rsquo;t be further reduced to any other laws), or if this structure is something we impose. 
As with nearly every facet of causality, there is disagreement on this point (and even disagreement about whether particular theories are compatible with this notion, which is called causal realism). Some have felt that causes are so hard to find as for the search to be hopeless and, further, that once we have some physical laws, those are more useful than causes anyway. That is, &ldquo;causes&rdquo; may be a mere shorthand for things like triggers, pushes, repels, prevents, and so on, rather than a fundamental notion.</p><p>It is somewhat surprising, given how central the idea of causality is to our daily lives, but there is simply no unified philosophical theory of what causes are, and no single foolproof computational method for finding them with absolute certainty. What makes this even more challenging is that, depending on one’s definition of causality, different factors may be identified as causes in the same situation, and it may not be clear what the ground truth is.</p></blockquote><h2 id=why-study-causality-now>Why study causality now?<a hidden class=anchor aria-hidden=true href=#why-study-causality-now>#</a></h2><p>While it&rsquo;s hard to conclusively prove, it seems to me like interest in formal causal analysis has increased in recent years. My hypothesis is that it&rsquo;s just a natural progression along the levels of <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>data&rsquo;s hierarchy of needs</a>. At the start of the big data boom, people were mostly concerned with storing and processing large amounts of data (e.g., using Hadoop, Elasticsearch, or your favourite NoSQL database). Just having your data flowing through pipelines is nice, but not very useful, so the focus switched to reporting and visualisation to extract insights about what happened (commonly known as business intelligence). 
While having a good picture of what happened is great, it isn&rsquo;t enough – you can make better decisions if you can predict what&rsquo;s going to happen, so the focus switched again to predictive analytics. Those who are familiar with predictive analytics know that models often end up relying on correlations between the features and the predicted labels. Using such models without considering the meaning of the variables can lead us to erroneous conclusions, and potentially harmful interventions. For example, based on the following graph we may make a recommendation that the US government decrease its spending on science to reduce the number of suicides by hanging.</p><figure><a href=us-science-spending-versus-suicides.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="analytics,causal inference,data science,deep learning,insights,machine learning,predictive modelling"><meta name=description content="Causality is often overlooked but is of much higher relevance to most data scientists than deep learning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Why you should stop worrying about deep learning and deepen your understanding of causality instead"><meta property="og:description" content="Causality is 
often overlooked but is of much higher relevance to most data scientists than deep learning."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"><meta property="og:image" content="https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/correlation-xkcd.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-02-14T11:04:11+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/correlation-xkcd.png"><meta name=twitter:title content="Why you should stop worrying about deep learning and deepen your understanding of causality instead"><meta name=twitter:description content="Causality is often overlooked but is of much higher relevance to most data scientists than deep learning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Why you should stop worrying about deep learning and deepen your understanding of causality instead","item":"https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Why you should stop worrying about deep learning and deepen your understanding of causality instead","name":"Why you should stop worrying about deep learning and 
deepen your understanding of causality instead","description":"Causality is often overlooked but is of much higher relevance to most data scientists than deep learning.","keywords":["analytics","causal inference","data science","deep learning","insights","machine learning","predictive modelling"],"articleBody":"Everywhere you go these days, you hear about deep learning’s impressive advancements. New deep learning libraries, tools, and products get announced on a regular basis, making the average data scientist feel like they’re missing out if they don’t hop on the deep learning bandwagon. However, as Kamil Bartocha put it in his post The Inconvenient Truth About Data Science, 95% of tasks do not require deep learning. This is obviously a made up number, but it’s probably an accurate representation of the everyday reality of many data scientists. This post discusses an often-overlooked area of study that is of much higher relevance to most data scientists than deep learning: causality.\nCausality is everywhere An understanding of cause and effect is something that is not unique to humans. For example, the many videos of cats knocking things off tables appear to exemplify experimentation by animals. If you are not familiar with such videos, it can easily be fixed. The thing to notice is that cats appear genuinely curious about what happens when they push an object. And they tend to repeat the experiment to verify that if you push something off, it falls to the ground.\nHumans rely on much more complex causal analysis than that done by cats – an understanding of the long-term effects of one’s actions is crucial to survival. Science, as defined by Wikipedia, is a systematic enterprise that creates, builds and organizes knowledge in the form of testable explanations and predictions about the universe. 
Causal analysis is key to producing explanations and predictions that are valid and sound, which is why understanding causality is so important to data scientists, traditional scientists, and all humans.\nWhat is causality? It is surprisingly hard to define causality. Just like cats, we all have an intuitive sense of what causality is, but things get complicated on deeper inspection. For example, few people would disagree with the statement that smoking causes cancer. But does it cause cancer immediately? Would smoking a few cigarettes today and never again cause cancer? Do all smokers develop cancer eventually? What about light smokers who live in areas with heavy air pollution?\nSamantha Kleinberg summarises it very well in her book, Why: A Guide to Finding and Using Causes:\nWhile most definitions of causality are based on Hume’s work, none of the ones we can come up with cover all possible cases and each one has counterexamples another does not. For instance, a medication may lead to side effects in only a small fraction of users (so we can’t assume that a cause will always produce an effect), and seat belts normally prevent death but can cause it in some car accidents (so we need to allow for factors that can have mixed producer/preventer roles depending on context).\nThe question often boils down to whether we should see causes as a fundamental building block or force of the world (that can’t be further reduced to any other laws), or if this structure is something we impose. As with nearly every facet of causality, there is disagreement on this point (and even disagreement about whether particular theories are compatible with this notion, which is called causal realism). Some have felt that causes are so hard to find as for the search to be hopeless and, further, that once we have some physical laws, those are more useful than causes anyway. 
That is, “causes” may be a mere shorthand for things like triggers, pushes, repels, prevents, and so on, rather than a fundamental notion.\nIt is somewhat surprising, given how central the idea of causality is to our daily lives, but there is simply no unified philosophical theory of what causes are, and no single foolproof computational method for finding them with absolute certainty. What makes this even more challenging is that, depending on one’s definition of causality, different factors may be identified as causes in the same situation, and it may not be clear what the ground truth is.\nWhy study causality now? While it’s hard to conclusively prove, it seems to me like interest in formal causal analysis has increased in recent years. My hypothesis is that it’s just a natural progression along the levels of data’s hierarchy of needs. At the start of the big data boom, people were mostly concerned with storing and processing large amounts of data (e.g., using Hadoop, Elasticsearch, or your favourite NoSQL database). Just having your data flowing through pipelines is nice, but not very useful, so the focus switched to reporting and visualisation to extract insights about what happened (commonly known as business intelligence). While having a good picture of what happened is great, it isn’t enough – you can make better decisions if you can predict what’s going to happen, so the focus switched again to predictive analytics. Those who are familiar with predictive analytics know that models often end up relying on correlations between the features and the predicted labels. Using such models without considering the meaning of the variables can lead us to erroneous conclusions, and potentially harmful interventions. 
For example, based on the following graph we may make a recommendation that the US government decrease its spending on science to reduce the number of suicides by hanging.\nSource: Spurious Correlations by Tyler Vigen Causal analysis aims to identify factors that are independent of spurious correlations, allowing stakeholders to make well-informed decisions. It is all about getting to the top of the DIKW (data-information-knowledge-wisdom) pyramid by understanding why things happen and what we can do to change the world. However, finding true causes can be very hard, especially in cases where you can’t perform experiments. Judea Pearl explains it well:\nWe know, from first principles, that any causal conclusion drawn from observational studies must rest on untested causal assumptions. Cartwright (1989) named this principle ‘no causes in, no causes out,’ which follows formally from the theory of equivalent models (Verma and Pearl, 1990); for any model yielding a conclusion C, one can construct a statistically equivalent model that refutes C and fits the data equally well.\nWhat this means in practice is that you can’t, for example, conclusively prove that smoking causes cancer without making some reasonable assumptions about the mechanisms at play. For ethical reasons, we can’t perform a randomly controlled trial where a test group is forced to smoke for years while a control group is forced not to smoke. Therefore, our conclusions about the causal link between smoking and cancer are drawn from observational studies and an understanding of the mechanisms by which various cancers develop (e.g., the effect of cigarette smoke on individual cells can be studied without forcing people to smoke). Cancer Tobacco companies have exploited this fact for years, making the claim that the probability of both cancer and smoking is raised by some mysterious genetic factors. 
Fossil fuel and food companies use similar arguments to sell their products and block attempts to regulate their industries (as discussed in previous posts on the hardest parts of data science and nutritionism). Fighting against such arguments is an uphill battle, as it is easy to sow doubt with a few simplistic catchphrases, while proving and communicating causality to laypeople is much harder (or impossible when it comes to deeply-held irrational beliefs).\nMy causality journey is just beginning My interest in formal causal analysis was seeded a couple of years ago, with a reading group that was dedicated to Judea Pearl’s work. We didn’t get very far, as I was a bit disappointed with what causal calculus can and cannot do. This may have been because I didn’t come in with the right expectations – I expected a black box that automatically finds causes. Recently reading Samantha Kleinberg’s excellent book Why: A Guide to Finding and Using Causes has made my expectations somewhat more realistic:\nThousands of years after Aristotle’s seminal work on causality, hundreds of years after Hume gave us two definitions of it, and decades after automated inference became a possibility through powerful new computers, causality is still an unsolved problem. Humans are prone to seeing causality where it does not exist and our algorithms aren’t foolproof. Even worse, once we find a cause it’s still hard to use this information to prevent or produce an outcome because of limits on what information we can collect and how we can understand it. After looking at all the cases where methods haven’t worked and researchers and policy makers have gotten causality really wrong, you might wonder why you should bother.\n[…]\nRather than giving up on causality, what we need to give up on is the idea of having a black box that takes some data straight from its source and emits a stream of causes with no need for interpretation or human intervention. 
Causal inference is necessary and possible, but it is not perfect and, most importantly, it requires domain knowledge.\nKleinberg’s book is a great general intro to causality, but it intentionally omits the mathematical details behind the various methods. I am now ready to once again go deeper into causality, perhaps starting with Kleinberg’s more technical book, Causality, Probability, and Time. Other recommendations are very welcome!\nCover image source: xkcd: Correlation ","wordCount":"1532","inLanguage":"en","image":"https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/correlation-xkcd.png","datePublished":"2016-02-14T11:04:11Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 
12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Why you should stop worrying about deep learning and deepen your understanding of causality instead</h1><div class=post-meta><span title='2016-02-14 11:04:11 +0000 UTC'>February 14, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-02-14-why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/correlation-xkcd.png alt></figure><div class=post-content><p>Everywhere you go these days, you hear about deep learning&rsquo;s impressive advancements. 
New deep learning libraries, tools, and products get announced on a regular basis, making the average data scientist feel like they&rsquo;re missing out if they don&rsquo;t <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/ target=_blank rel=noopener>hop on the deep learning bandwagon</a>. However, as Kamil Bartocha put it in his post <a href=https://www.linkedin.com/pulse/inconvenient-truth-data-science-kamil-bartocha target=_blank rel=noopener>The Inconvenient Truth About Data Science</a>, <em>95% of tasks do not require deep learning</em>. This is obviously <a href=http://dilbert.com/strip/2008-05-08 target=_blank rel=noopener>a made up number</a>, but it&rsquo;s probably an accurate representation of the everyday reality of many data scientists. This post discusses an often-overlooked area of study that is of much higher relevance to most data scientists than deep learning: <strong>causality</strong>.</p><h2 id=causality-is-everywhere>Causality is everywhere<a hidden class=anchor aria-hidden=true href=#causality-is-everywhere>#</a></h2><p>An understanding of cause and effect is something that is not unique to humans. For example, the many videos of cats knocking things off tables appear to exemplify experimentation by animals. If you are not familiar with such videos, <a href="https://www.youtube.com/results?search_query=cat+knocking+stuff+off" target=_blank rel=noopener>it can easily be fixed</a>. The thing to notice is that cats appear genuinely curious about what happens when they push an object. 
And they tend to repeat the experiment to verify that if you push something off, it falls to the ground.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/UoUEQYjYgf4 style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>Humans rely on much more complex causal analysis than that done by cats – an understanding of the long-term effects of one&rsquo;s actions is crucial to survival. <a href=https://en.wikipedia.org/wiki/Science target=_blank rel=noopener>Science, as defined by Wikipedia</a>, <em>is a systematic enterprise that creates, builds and organizes knowledge in the form of testable explanations and predictions about the universe</em>. Causal analysis is key to producing explanations and predictions that are valid and sound, which is why understanding causality is so important to data scientists, traditional scientists, and all humans.</p><h2 id=what-is-causality>What is causality?<a hidden class=anchor aria-hidden=true href=#what-is-causality>#</a></h2><p>It is surprisingly hard to define causality. Just like cats, we all have an intuitive sense of what causality is, but things get complicated on deeper inspection. For example, few people would disagree with the statement that <em>smoking causes cancer</em>. But does it cause cancer immediately? Would smoking a few cigarettes today and never again cause cancer? Do all smokers develop cancer eventually? 
What about light smokers who live in areas with heavy air pollution?</p><p>Samantha Kleinberg summarises it very well in her book, <a href=http://www.skleinberg.org/why/ target=_blank rel=noopener>Why: A Guide to Finding and Using Causes</a>:</p><blockquote><p>While most definitions of causality are based on <a href=https://en.wikipedia.org/wiki/David_Hume target=_blank rel=noopener>Hume&rsquo;s work</a>, none of the ones we can come up with cover all possible cases and each one has counterexamples another does not. For instance, a medication may lead to side effects in only a small fraction of users (so we can&rsquo;t assume that a cause will always produce an effect), and seat belts normally prevent death but can cause it in some car accidents (so we need to allow for factors that can have mixed producer/preventer roles depending on context).</p><p>The question often boils down to whether we should see causes as a fundamental building block or force of the world (that can&rsquo;t be further reduced to any other laws), or if this structure is something we impose. As with nearly every facet of causality, there is disagreement on this point (and even disagreement about whether particular theories are compatible with this notion, which is called causal realism). Some have felt that causes are so hard to find as for the search to be hopeless and, further, that once we have some physical laws, those are more useful than causes anyway. That is, &ldquo;causes&rdquo; may be a mere shorthand for things like triggers, pushes, repels, prevents, and so on, rather than a fundamental notion.</p><p>It is somewhat surprising, given how central the idea of causality is to our daily lives, but there is simply no unified philosophical theory of what causes are, and no single foolproof computational method for finding them with absolute certainty. 
What makes this even more challenging is that, depending on one’s definition of causality, different factors may be identified as causes in the same situation, and it may not be clear what the ground truth is.</p></blockquote><h2 id=why-study-causality-now>Why study causality now?<a hidden class=anchor aria-hidden=true href=#why-study-causality-now>#</a></h2><p>While it&rsquo;s hard to conclusively prove, it seems to me like interest in formal causal analysis has increased in recent years. My hypothesis is that it&rsquo;s just a natural progression along the levels of <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>data&rsquo;s hierarchy of needs</a>. At the start of the big data boom, people were mostly concerned with storing and processing large amounts of data (e.g., using Hadoop, Elasticsearch, or your favourite NoSQL database). Just having your data flowing through pipelines is nice, but not very useful, so the focus switched to reporting and visualisation to extract insights about what happened (commonly known as business intelligence). While having a good picture of what happened is great, it isn&rsquo;t enough – you can make better decisions if you can predict what&rsquo;s going to happen, so the focus switched again to predictive analytics. Those who are familiar with predictive analytics know that models often end up relying on correlations between the features and the predicted labels. Using such models without considering the meaning of the variables can lead us to erroneous conclusions, and potentially harmful interventions. For example, based on the following graph we may make a recommendation that the US government decrease its spending on science to reduce the number of suicides by hanging.</p><figure><a href=us-science-spending-versus-suicides.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/us-science-spending-versus-suicides_hucb19a666efd495d868358d2e56c5c43f_82139_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/us-science-spending-versus-suicides_hucb19a666efd495d868358d2e56c5c43f_82139_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/us-science-spending-versus-suicides_hucb19a666efd495d868358d2e56c5c43f_82139_720x0_resize_box_3.png 720w,
diff --git a/2016/03/20/the-rise-of-greedy-robots/index.html b/2016/03/20/the-rise-of-greedy-robots/index.html
index 04290b407..9ebbac7db 100644
--- a/2016/03/20/the-rise-of-greedy-robots/index.html
+++ b/2016/03/20/the-rise-of-greedy-robots/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The rise of greedy robots | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,deep learning,economics,futurism,machine intelligence"><meta name=description content="Is artificial/machine intelligence a future threat? I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The rise of greedy robots"><meta property="og:description" content="Is artificial/machine intelligence a future threat? 
I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"><meta property="og:image" content="https://yanirseroussi.com/greedy-robot.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-03-20T20:33:43+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/greedy-robot.jpg"><meta name=twitter:title content="The rise of greedy robots"><meta name=twitter:description content="Is artificial/machine intelligence a future threat? I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The rise of greedy robots","item":"https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The rise of greedy robots","name":"The rise of greedy robots","description":"Is artificial/machine intelligence a future threat? I argue that it\u0026rsquo;s already here, with greedy robots already dominating our lives.","keywords":["data science","deep learning","economics","futurism","machine intelligence"],"articleBody":"Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. Some have even called for regulation of machine intelligence before it’s too late. 
My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.\nMachine intelligence or artificial intelligence? The problem with talking about artificial intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won’t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently can’t even fully simulate C. elegans, a 1mm worm with 302 neurons. However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as the ability to learn or understand things or to deal with new or difficult situations. Unlike artificial intelligence, there is no doubt that machine intelligence already exists.\nAirplanes provide a famous example: we don’t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can’t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.\nWho are those greedy robots? Greed is defined as a selfish desire to have more of something (especially money). It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. 
Greedy organisations “live” among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.\nHumans running for-profit organisations have little choice but to play by their rules. For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.\nThe relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the words of Jeffrey Hammerbacher: The best minds of my generation are thinking about how to make people click ads. Hammerbacher, an early Facebook employee, was referring to Facebook’s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook’s revenue. Indeed, Facebook has hired Yann LeCun (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun’s appointment will undoubtedly result in general research advancements, Facebook’s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.\nGreedy robots are all around us. Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. 
Retail companies like Amazon use machine intelligence to mine through people’s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn’t take much of a leap to imagine a world where greedy robots are even more dominant. Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.\nWill robots take your job? The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, that are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video Humans Need Not Apply. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.\nDespite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn’t necessarily benefit society. This isn’t a new insight, as the following quotes demonstrate:\n“Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. 
[…] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.” – Henry David Thoreau, Walden (1854) “I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. […] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. […] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.” – Bertrand Russell, In Praise of Idleness (1932) “In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century’s end that countries like Great Britain or the United States would achieve a 15-hour work week. There’s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. 
Yet virtually no one talks about it.” – David Graeber, On the Phenomenon of Bullshit Jobs (2013) This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking summarised it well recently:\nIf machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.\nWhere to from here? Many people believe that the existence of powerful greedy entities is good for society. Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. Clearly, some regulation of greedy entities is required to obtain the best results for society.\nWith machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. 
However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. The collective choice is ours to make, but is far from simple.\n","wordCount":"1644","inLanguage":"en","image":"https://yanirseroussi.com/greedy-robot.jpg","datePublished":"2016-03-20T20:33:43Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" 
y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The rise of greedy robots</h1><div class=post-meta><span title='2016-03-20 20:33:43 +0000 UTC'>March 20, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-03-20-the-rise-of-greedy-robots/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg 1920w" sizes="(min-width: 768px) 720px, 
100vw" src=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg alt width=1920 height=1064></figure><div class=post-content><p>Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. Some have even <a href=http://www.theguardian.com/technology/2014/oct/27/elon-musk-artificial-intelligence-ai-biggest-existential-threat target=_blank rel=noopener>called for regulation of machine intelligence before it&rsquo;s too late</a>. My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.</p><h2 id=machine-intelligence-or-artificial-intelligence>Machine intelligence or artificial intelligence?<a hidden class=anchor aria-hidden=true href=#machine-intelligence-or-artificial-intelligence>#</a></h2><p>The problem with talking about <em>artificial</em> intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won&rsquo;t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently <a href=http://www.openworm.org/ target=_blank rel=noopener>can&rsquo;t even fully simulate C. elegans, a 1mm worm with 302 neurons</a>. However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as <a href=http://www.merriam-webster.com/dictionary/intelligence target=_blank rel=noopener>the ability to learn or understand things or to deal with new or difficult situations</a>. 
Unlike artificial intelligence, there is no doubt that <em>machine</em> intelligence already exists.</p><p>Airplanes provide a famous example: we don&rsquo;t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can&rsquo;t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.</p><h2 id=who-are-those-greedy-robots>Who are those greedy robots?<a hidden class=anchor aria-hidden=true href=#who-are-those-greedy-robots>#</a></h2><p>Greed is defined as <a href=http://www.merriam-webster.com/dictionary/greed target=_blank rel=noopener>a selfish desire to have more of something (especially money)</a>. It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. Greedy organisations &ldquo;live&rdquo; among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.</p><p>Humans running for-profit organisations have little choice but to play by their rules. For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. 
Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.</p><p>The relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the <a href=http://www.fastcompany.com/3008436/takeaway/why-data-god-jeffrey-hammerbacher-left-facebook-found-cloudera target=_blank rel=noopener>words of Jeffrey Hammerbacher</a>: <em>The best minds of my generation are thinking about how to make people click ads.</em> Hammerbacher, an early Facebook employee, was referring to Facebook&rsquo;s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook&rsquo;s revenue. Indeed, Facebook has hired <a href=https://en.wikipedia.org/wiki/Yann_LeCun target=_blank rel=noopener>Yann LeCun</a> (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun&rsquo;s appointment will undoubtedly result in general research advancements, Facebook&rsquo;s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.</p><p>Greedy robots are all around us. Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. Retail companies like Amazon use machine intelligence to mine through people&rsquo;s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn&rsquo;t take much of a leap to imagine a world where greedy robots are even more dominant. 
Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.</p><h2 id=will-robots-take-your-job>Will robots take your job?<a hidden class=anchor aria-hidden=true href=#will-robots-take-your-job>#</a></h2><p>The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, that are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video <a href="https://www.youtube.com/watch?v=7Pq-S557XQU" target=_blank rel=noopener>Humans Need Not Apply</a>. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.</p><p>Despite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn&rsquo;t necessarily benefit society. This isn&rsquo;t a new insight, as the following quotes demonstrate:</p><ul><li><em>&ldquo;Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. 
[&mldr;] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.&rdquo;</em> – Henry David Thoreau, <a href=http://www.gutenberg.org/files/205/205-h/205-h.htm target=_blank rel=noopener>Walden</a> (<strong>1854</strong>)</li><li><em>&ldquo;I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. [&mldr;] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. [&mldr;] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.&rdquo;</em> – Bertrand Russell, <a href=http://www.zpub.com/notes/idle.html target=_blank rel=noopener>In Praise of Idleness</a> (<strong>1932</strong>)</li><li><em>&ldquo;In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century&rsquo;s end that countries like Great Britain or the United States would achieve a 15-hour work week. There&rsquo;s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. 
Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. Yet virtually no one talks about it.&rdquo;</em> – David Graeber, <a href=http://strikemag.org/bullshit-jobs/ target=_blank rel=noopener>On the Phenomenon of Bullshit Jobs</a> (<strong>2013</strong>)</li></ul><p>This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking <a href=https://www.reddit.com/r/science/comments/3nyn5i/science_ama_series_stephen_hawking_ama_answers/cvsdmkv target=_blank rel=noopener>summarised it well recently</a>:</p><blockquote><p>If machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.</p></blockquote><h2 id=where-to-from-here>Where to from here?<a hidden class=anchor aria-hidden=true href=#where-to-from-here>#</a></h2><p>Many people believe that the existence of powerful greedy entities is good for society. Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. 
However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. Clearly, some regulation of greedy entities is required to obtain the best results for society.</p><p>With machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. 
The collective choice is ours to make, but is far from simple.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/deep-learning/>deep learning</a></li><li><a href=https://yanirseroussi.com/tags/economics/>economics</a></li><li><a href=https://yanirseroussi.com/tags/futurism/>futurism</a></li><li><a href=https://yanirseroussi.com/tags/machine-intelligence/>machine intelligence</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on x" href="https://x.com/intent/tweet/?text=The%20rise%20of%20greedy%20robots&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&amp;hashtags=datascience%2cdeeplearning%2ceconomics%2cfuturism%2cmachineintelligence"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&amp;title=The%20rise%20of%20greedy%20robots&amp;summary=The%20rise%20of%20greedy%20robots&amp;source=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 
62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&title=The%20rise%20of%20greedy%20robots"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 
46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on whatsapp" href="https://api.whatsapp.com/send?text=The%20rise%20of%20greedy%20robots%20-%20https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on telegram" href="https://telegram.me/share/url?text=The%20rise%20of%20greedy%20robots&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 
2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20rise%20of%20greedy%20robots&u=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="data science,deep learning,economics,futurism,machine intelligence"><meta name=description content="Is artificial/machine intelligence a future threat? I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The rise of greedy robots"><meta property="og:description" content="Is artificial/machine intelligence a future threat? 
I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"><meta property="og:image" content="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-03-20T20:33:43+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg"><meta name=twitter:title content="The rise of greedy robots"><meta name=twitter:description content="Is artificial/machine intelligence a future threat? I argue that it&rsquo;s already here, with greedy robots already dominating our lives."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The rise of greedy robots","item":"https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The rise of greedy robots","name":"The rise of greedy robots","description":"Is artificial/machine intelligence a future threat? I argue that it\u0026rsquo;s already here, with greedy robots already dominating our lives.","keywords":["data science","deep learning","economics","futurism","machine intelligence"],"articleBody":"Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. 
Some have even called for regulation of machine intelligence before it’s too late. My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.\nMachine intelligence or artificial intelligence? The problem with talking about artificial intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won’t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently can’t even fully simulate C. elegans, a 1mm worm with 302 neurons. However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as the ability to learn or understand things or to deal with new or difficult situations. Unlike artificial intelligence, there is no doubt that machine intelligence already exists.\nAirplanes provide a famous example: we don’t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can’t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.\nWho are those greedy robots? Greed is defined as a selfish desire to have more of something (especially money). It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. 
If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. Greedy organisations “live” among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.\nHumans running for-profit organisations have little choice but to play by their rules. For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.\nThe relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the words of Jeffrey Hammerbacher: The best minds of my generation are thinking about how to make people click ads. Hammerbacher, an early Facebook employee, was referring to Facebook’s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook’s revenue. Indeed, Facebook has hired Yann LeCun (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun’s appointment will undoubtedly result in general research advancements, Facebook’s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.\nGreedy robots are all around us. 
Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. Retail companies like Amazon use machine intelligence to mine through people’s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn’t take much of a leap to imagine a world where greedy robots are even more dominant. Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.\nWill robots take your job? The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, that are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video Humans Need Not Apply. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.\nDespite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn’t necessarily benefit society. 
This isn’t a new insight, as the following quotes demonstrate:\n“Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. […] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.” – Henry David Thoreau, Walden (1854) “I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. […] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. […] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.” – Bertrand Russell, In Praise of Idleness (1932) “In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century’s end that countries like Great Britain or the United States would achieve a 15-hour work week. There’s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. 
The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. Yet virtually no one talks about it.” – David Graeber, On the Phenomenon of Bullshit Jobs (2013) This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking summarised it well recently:\nIf machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.\nWhere to from here? Many people believe that the existence of powerful greedy entities is good for society. Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. 
Clearly, some regulation of greedy entities is required to obtain the best results for society.\nWith machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. The collective choice is ours to make, but is far from simple.\n","wordCount":"1644","inLanguage":"en","image":"https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg","datePublished":"2016-03-20T20:33:43Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The rise of greedy robots</h1><div class=post-meta><span title='2016-03-20 20:33:43 +0000 UTC'>March 20, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-03-20-the-rise-of-greedy-robots/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot_hu400343414979e1c2dc8bafadfe0b6d4d_563587_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/greedy-robot.jpg alt width=1920 height=1064></figure><div class=post-content><p>Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. Some have even <a href=http://www.theguardian.com/technology/2014/oct/27/elon-musk-artificial-intelligence-ai-biggest-existential-threat target=_blank rel=noopener>called for regulation of machine intelligence before it&rsquo;s too late</a>. My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.</p><h2 id=machine-intelligence-or-artificial-intelligence>Machine intelligence or artificial intelligence?<a hidden class=anchor aria-hidden=true href=#machine-intelligence-or-artificial-intelligence>#</a></h2><p>The problem with talking about <em>artificial</em> intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won&rsquo;t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently <a href=http://www.openworm.org/ target=_blank rel=noopener>can&rsquo;t even fully simulate C. elegans, a 1mm worm with 302 neurons</a>. 
However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as <a href=http://www.merriam-webster.com/dictionary/intelligence target=_blank rel=noopener>the ability to learn or understand things or to deal with new or difficult situations</a>. Unlike artificial intelligence, there is no doubt that <em>machine</em> intelligence already exists.</p><p>Airplanes provide a famous example: we don&rsquo;t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can&rsquo;t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.</p><h2 id=who-are-those-greedy-robots>Who are those greedy robots?<a hidden class=anchor aria-hidden=true href=#who-are-those-greedy-robots>#</a></h2><p>Greed is defined as <a href=http://www.merriam-webster.com/dictionary/greed target=_blank rel=noopener>a selfish desire to have more of something (especially money)</a>. It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. Greedy organisations &ldquo;live&rdquo; among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.</p><p>Humans running for-profit organisations have little choice but to play by their rules. 
For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.</p><p>The relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the <a href=http://www.fastcompany.com/3008436/takeaway/why-data-god-jeffrey-hammerbacher-left-facebook-found-cloudera target=_blank rel=noopener>words of Jeffrey Hammerbacher</a>: <em>The best minds of my generation are thinking about how to make people click ads.</em> Hammerbacher, an early Facebook employee, was referring to Facebook&rsquo;s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook&rsquo;s revenue. Indeed, Facebook has hired <a href=https://en.wikipedia.org/wiki/Yann_LeCun target=_blank rel=noopener>Yann LeCun</a> (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun&rsquo;s appointment will undoubtedly result in general research advancements, Facebook&rsquo;s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.</p><p>Greedy robots are all around us. Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. 
Retail companies like Amazon use machine intelligence to mine through people&rsquo;s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn&rsquo;t take much of a leap to imagine a world where greedy robots are even more dominant. Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.</p><h2 id=will-robots-take-your-job>Will robots take your job?<a hidden class=anchor aria-hidden=true href=#will-robots-take-your-job>#</a></h2><p>The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, that are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video <a href="https://www.youtube.com/watch?v=7Pq-S557XQU" target=_blank rel=noopener>Humans Need Not Apply</a>. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.</p><p>Despite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn&rsquo;t necessarily benefit society. 
This isn&rsquo;t a new insight, as the following quotes demonstrate:</p><ul><li><em>&ldquo;Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. [&mldr;] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.&rdquo;</em> – Henry David Thoreau, <a href=http://www.gutenberg.org/files/205/205-h/205-h.htm target=_blank rel=noopener>Walden</a> (<strong>1854</strong>)</li><li><em>&ldquo;I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. [&mldr;] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. [&mldr;] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.&rdquo;</em> – Bertrand Russell, <a href=http://www.zpub.com/notes/idle.html target=_blank rel=noopener>In Praise of Idleness</a> (<strong>1932</strong>)</li><li><em>&ldquo;In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century&rsquo;s end that countries like Great Britain or the United States would achieve a 15-hour work week. There&rsquo;s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. 
Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. Yet virtually no one talks about it.&rdquo;</em> – David Graeber, <a href=http://strikemag.org/bullshit-jobs/ target=_blank rel=noopener>On the Phenomenon of Bullshit Jobs</a> (<strong>2013</strong>)</li></ul><p>This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking <a href=https://www.reddit.com/r/science/comments/3nyn5i/science_ama_series_stephen_hawking_ama_answers/cvsdmkv target=_blank rel=noopener>summarised it well recently</a>:</p><blockquote><p>If machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.</p></blockquote><h2 id=where-to-from-here>Where to from here?<a hidden class=anchor aria-hidden=true href=#where-to-from-here>#</a></h2><p>Many people believe that the existence of powerful greedy entities is good for society. 
Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. Clearly, some regulation of greedy entities is required to obtain the best results for society.</p><p>With machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. 
The collective choice is ours to make, but is far from simple.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/deep-learning/>deep learning</a></li><li><a href=https://yanirseroussi.com/tags/economics/>economics</a></li><li><a href=https://yanirseroussi.com/tags/futurism/>futurism</a></li><li><a href=https://yanirseroussi.com/tags/machine-intelligence/>machine intelligence</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on x" href="https://x.com/intent/tweet/?text=The%20rise%20of%20greedy%20robots&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&amp;hashtags=datascience%2cdeeplearning%2ceconomics%2cfuturism%2cmachineintelligence"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&amp;title=The%20rise%20of%20greedy%20robots&amp;summary=The%20rise%20of%20greedy%20robots&amp;source=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 
62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f&title=The%20rise%20of%20greedy%20robots"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 
46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on whatsapp" href="https://api.whatsapp.com/send?text=The%20rise%20of%20greedy%20robots%20-%20https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on telegram" href="https://telegram.me/share/url?text=The%20rise%20of%20greedy%20robots&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 
2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The rise of greedy robots on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20rise%20of%20greedy%20robots&u=https%3a%2f%2fyanirseroussi.com%2f2016%2f03%2f20%2fthe-rise-of-greedy-robots%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-1286><div class=comment-header><a href=#comment-1286><img class=comment-avatar src="https://www.gravatar.com/avatar/9cfd18615668b761e91d0a1253061bf2?s=50"><p class=comment-info><strong>SoftwareAsLife (@SoftDevLife)</strong><br><small>2016-08-17 11:02:44</small></p></a></div><div class="comment-body post-content"><p>Yes, the world has always been greedy. This reminds me of Dijkstra greedy algorithm which is used to find the shortest route. There is a lot of &ldquo;steps&rdquo; for an organization to become profitable. Greediness tries to find the most cost-efficient way to achieve the goal of being profitable. Let us assume that each road is a railway and trains traverse to their destinations. Each decision path will sacrifice other trains waiting to cross to their destination. If human stupidity does not overrule again, our scarce resource ultimately will be constrained by economics to one element only: time. Where do we want humans to allocate spending their time on?</p><p>Greediness will always thrive in the sense it is seen as a trait of growth by society. War, which in today&rsquo;s society we ultimately condemn, was viewed in the past as one way for one nation to gain growth. Growth was limited to the domain of a specific country and the rest treated as an enemy. The end of the war was enforced by the right of not interfering other one&rsquo;s own property. People had to find other means to gain growth. Thus, the concept of greediness was enforced. Greediness is using emotional appeal to manipulate other&rsquo;s people habits to a specific domain.</p><p>The problem with greediness is whether people evaluate the emotional appeal matching to something positive or not positive to self and society. 
The maximum capacity of getting that right is:</p><ol><li>The effort of people on having a multi-disciplinary knowledge of multiple domains</li><li>The effort of people of using their knowledge to their daily decisions.</li></ol><p>Most consumers are passive on the above two points due to society constraints. More specifically, if people focus learning from other domains, they have the risk of underperforming on their main domain having a competitive disadvantage on their prospect of their career. This limited domain in bayesian terms makes people have low confidence for many topics leaving others to influence our decision making. I think that low confidence is the main causation we see a high rise trend where people&rsquo;s decisions rely more upon push messages instead of pull messages. I haven&rsquo;t seen to this day sophisticated push messages where the user has a choice of options what to see except in the on boarding phase of a product. In addition, the on boarding phase are only reasons why you should use me. There will never be a phase on reasons when not to use me. If a specific domain of a product could say to the user based on the personalized information it gathered: &ldquo;Hey, you shouldn&rsquo;t be using me in this situation. Use Bob instead, it will make your life easier&rdquo; (This will be possible with the evolution of data). The problem is a specific domain will never explain alternative domains that can solve a user&rsquo;s individual problem better because there is not a commission fee of recommending one user to another domain with qualitative information. This causes a domain which consists a set of employees to not have an interest in researching alternative domains that can solve a specific problem better because the current system has not placed a platform to reward it with a commission fee. Instead, the only way for a specific domain to thrive is by copying others ideas or owning them through acquisitions. 
This demotivates innovation in great sum. So far, it is only people with consciousness, with value or no value, such as start-up entrepreneurs that leave old positions and people who contribute in open source correspondingly, that go the extra mile to innovate. My whole hypothesis is that our natural instincts are a machine learner, and our only task is to do progress on everything, even our own personal life.</p><p>If those two points happen, the rule of greediness will be overruled. People will consciously evaluate whether that emotional appeal makes sense in the big picture because their jobs will force them to associate their domain with alternatives to gain a commission fee. That will gain them a more robust interdisciplinary domain knowledge causing them to have more confidence on pulling than pushing information of other domains they start to know about. Passive consumers will be less passive. The value was based before by war, now greediness, later it will be all about evaluation.</p><p>Your point of people doing less work emphasizes a more passive society than it already is. I do not propose that as that will make our situation worse. The problem is the type of tasks people do, not the task itself. People need to do tasks that progress our society instead of being passive like the game of civilization. It is the only way that makes us happy and has a purpose. Like we humans create machine learning instances have an end goal purpose, so we as humans are machine learners for a purpose where we can handle any situation that becomes a problem. Our starter pack was human suffering, hunger, and death to solve problems. 
Now it becomes less so and we have to be motivated by it beyond extrinsic rewards.</p></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.html b/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.html
index c9b4909ad..c0b34df53 100644
--- a/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.html
+++ b/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="causal inference,data science,insights,predictive modelling"><meta name=description content="Discussing the need for untested assumptions and temporality in causal inference. Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions"><meta property="og:description" content="Discussing the need for untested assumptions and 
temporality in causal inference. Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"><meta property="og:image" content="https://yanirseroussi.com/freediving.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-05-14T19:57:03+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/freediving.jpg"><meta name=twitter:title content="Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions"><meta name=twitter:description content="Discussing the need for untested assumptions and temporality in causal inference. Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","item":"https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","name":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","description":"Discussing the need for untested assumptions and temporality in causal inference. 
Mostly based on Samantha Kleinberg\u0026rsquo;s Causality, Probability, and Time.","keywords":["causal inference","data science","insights","predictive modelling"],"articleBody":"Background: I have previously written about the need for real insights that address the why behind events, not only the what and how. This was followed by a fairly popular post on causality, which was heavily influenced by Samantha Kleinberg's book Why: A Guide to Finding and Using Causes. This post continues my exploration of the field, and is primarily based on Kleinberg's previous book: Causality, Probability, and Time.\nThe study of causality and causal inference is central to science in general and data science in particular. Being able to distinguish between correlation and causation is key to designing effective interventions in business, public policy, medicine, and many other fields. There are quite a few approaches to inferring causal relationships from data. In this post, I discuss some aspects of Judea Pearl’s graphical modelling approach, and how its limitations are addressed in recent work by Samantha Kleinberg. I then finish with a brief survey of the Bradford Hill criteria and their applicability to a key limitation of all causal inference methods: The need for untested assumptions.\nJudea Pearl Overcoming my Pearl bias First, I must disclose that I have a personal bias in favour of Pearl’s work. While I’ve never met him, Pearl is my academic grandfather – he was the PhD advisor of my main PhD supervisor (Ingrid Zukerman). My first serious exposure to his work was through a Sydney reading group, where we discussed parts of Pearl’s approach to causal inference. Recently, I refreshed my knowledge of Pearl causality by reading Causal inference in statistics: An overview. I am by no means an expert in Pearl’s huge body of work, but I think I understand enough of it to write something of use.\nPearl’s theory of causality employs Bayesian networks to represent causal structures. 
These are directed acyclic graphs, where each vertex represents a variable, and an edge from X to Y implies that X causes Y. Pearl also introduces the do(X) operator, which simulates interventions by removing all the causes of X, setting it to a constant. There is much more to this theory, but two of its main contributions are the formalisation of causal concepts that are often given only a verbal treatment, and the explicit encoding of causal assumptions. These assumptions must be made by the modeller based on background knowledge, and are encoded in the graph’s structure – a missing edge between two vertices indicates that there is no direct causal relationship between the two variables.\nMy main issue with Pearl’s treatment of causality is that he doesn’t explicitly handle time. While time can be encoded into Pearl’s models (e.g., via dynamic Bayesian networks), there is nothing that prevents creation of models where the future causes changes in the past. A closely-related issue is that Pearl’s causal models must be directed acyclic graphs, making it hard to model feedback loops. For example, Pearl says that “mud does not cause rain”, but this isn’t true – water from mud evaporates, causing rain (which causes mud). What’s true is that “mud now doesn’t cause rain now” or something along these lines, which is something that must be accounted for by adding temporal information to the models.\nNonetheless, Pearl’s theory is an important step forward in the study of causality. In his words, “in the bulk of the statistical literature before 2000, causal claims rarely appear in the mathematics. They surface only in the verbal interpretation that investigators occasionally attach to certain associations, and in the verbal description with which investigators justify assumptions.” The importance of formal causal analysis cannot be overstated, as it underlies many decisions that affect our lives. 
However, it seems to me like there’s still plenty of work to be done before causal analysis becomes as established as other statistical tools.\nSamantha Kleinberg Kleinberg: Addressing gaps in Pearl’s work I recently finished reading Samantha Kleinberg’s Causality, Probability, and Time. Kleinberg dedicates a good portion of the book to presenting the history of causality and discussing its many definitions. As hinted by the book’s title, Kleinberg believes that one cannot discuss causality without considering time. In her words: “One of the most critical pieces of information about causality, though – the time it takes for the cause to produce its effect – has been largely ignored by both philosophical theories and computational methods. If we do not know when the effect will occur, we have little hope of being able to act successfully using the causal relationship.” Following this assertion, Kleinberg presents a new approach to causal inference that is based on probabilistic computation tree logic (PCTL). With PCTL, one can concisely express probabilistic temporal statements. For example, if we observe a potential cause c occurring at time t, and a possible effect e occurring at time t’, we can use PCTL to state the hypothesis that in general, after c becomes true, it takes between one and |t’ – t| time units for e to become true with probability at least p, i.e., c leads to e:\nIt is obvious why PCTL may be a better fit than Bayesian networks for expressing causal statements. For example, with a Bayesian network, we can easily express the statement that smoking causes lung cancer with probability 0.3, but this isn’t that useful, as it doesn’t tell us how long it’ll take for cancer to develop. With PCTL, we can state that smoking causes lung cancer in 5-30 years with probability at least 0.3. 
This matches our knowledge that cancer doesn’t develop immediately – one cigarette won’t kill you.\nOne of the key concepts introduced by Kleinberg is that of causal significance. Calculating the causal significance of a cause c to an effect e relies on first identifying the set X of potential (or prima facie) causes of e. The set X contains all discrete variables x such that E[e|x]≠E[e] and x occurs earlier than e. Given the set X, the causal significance of c to e is the mean of E[e|c∧x] – E[e|¬c∧x] for all x≠c. The intuition is that if a cause c is significant, its causal significance value will be high when other potential causes are held fixed. For example, if c is heavy smoking and e is severity of lung cancer (with e=0 meaning no cancer), the expected value of e given c is likely to be higher than the expected value of e given ¬c, when conditioned on any other potential cause. Once causal significance has been measured, we can separate significant causes from insignificant causes by setting a threshold on causal significance values (this threshold can be inferred from the data). Significant causes are considered to be genuine if the data is stationary and the common causes of all pairs of variables have been included, which is a very strong condition that may be hard to fulfil in realistic scenarios. However, causal significance is an evolving concept – last year, Huang and Kleinberg introduced a new definition of causal significance that can be inferred faster and yield more accurate results. My general feeling is that this line of research will continue to yield many interesting and useful results in coming years.\nKleinberg’s work is not without its limitations. In addition to the assumptions that causal relationships are stationary and the requirement to identify all potential causes, the recently-introduced definition of causal significance also requires the relationships to be linear and additive (though this limitation may be relaxed in future work). 
Another issue is that most of the evaluation in the studies I’ve read was done on synthetic datasets. While there are some results on real-life health and finance data, I find it hard to judge the practicality of utilising Kleinberg’s methods without applying them to problems that I’m more familiar with. Finally, as with other work in the field of causal inference, we need to have some degree of belief in untested assumptions to reach useful conclusions. In Kleinberg’s words:\nThus, a just so cause is genuine in the case where all of the outlined assumptions hold (namely that all common causes are included, the structure is representative of the system and, when data is used, a formula satisfied by the data will be satisfied by the structure). Our belief in whether a cause is genuine, in the case where it is not certain that the assumptions hold, should be proportional to how much we believe that the assumptions are true.\nAustin Bradford Hill Hill: Testing untested assumptions To the best of my knowledge, all causal inference methods rely on untested assumptions. Specifically, we can never include all the variables in the universe in our models. Therefore, any conclusions drawn are reliant on deciding what, when, and how to measure potential causes and effects. Another issue is that no matter how good and believable our modelling is, we cannot use causal inference to convince unreasonable people. For example, some people may cite divine intervention as an unmeasurable cause of anything and everything. In addition, people with certain commercial interests often try to raise doubt about well-established causal mechanisms by making unreasonable claims for evidence of various hidden factors. 
For example, tobacco companies used to claim that both smoking and lung cancer were caused by a common hidden factor, making the link between smoking and lung cancer a mere association.\nAssuming that we are dealing with reasonable people, there’s still the question of where we should get our untested assumptions from. This question is fairly old, and has been partly answered in 1965 by Austin Bradford Hill, with nine criteria that he recommended should be considered before calling an association causal:\nStrength: How strong is the association? For example, lung cancer deaths of heavy smokers are 20-30 times greater than those of non-smokers. Consistency: Has the association been repeatedly observed in various circumstances? For example, many different populations have exhibited an association between smoking rates and cancer. Specificity: Can we pin down specific instances of the effect to specific instances of the cause? Hill sees this as a nice-to-have condition rather than a must-have – cases with multiple possible causes may not fulfil the specificity requirement. Temporality: Do we know that c leads to e or are we observing them together? This is a condition that isn’t always easy to fulfil, especially when dealing with feedback loops and slow processes. Biological gradient: Hill’s focus was on medicine, and this condition refers to the association exhibiting some dose-response curve. This can be generalised to other fields, as we can expect some regularity in the effect if it is a function of the cause (though it doesn’t have to be a linear function). Plausibility: Do we know of a mechanism that can explain how the cause brings about the effect? Coherence: Does the association conflict with our current knowledge? Even if it does, it isn’t enough to rule out causality, as our current knowledge may be incomplete or wrong. Experiment: If possible, running controlled experiments may yield very powerful evidence in favour of causation. 
Analogy: Do we know of any similar cause-and-effect relationships? Hill summarises the list of criteria (or viewpoints) with the following statements.\nHere then are nine different viewpoints from all of which we should study association before we cry causation. What I do not believe – and this has been suggested – is that we can usefully lay down some hard-and-fast rules of evidence that must be obeyed before we accept cause and effect. None of my nine viewpoints can bring indisputable evidence for or against the cause-and-effect hypothesis and none can be required as a sine qua non. What they can do, with greater or less strength, is to help us to make up our minds on the fundamental question – is there any other way of explaining the set of facts before us, is there any other answer equally, or more, likely than cause and effect?\nNo formal tests of significance can answer those questions. Such tests can, and should, remind us of the effects that the play of chance can create, and they will instruct us in the likely magnitude of those effects. Beyond that they contribute nothing to the ‘proof’ of our hypothesis.\nHill then goes on to criticise the increased focus on statistical significance as a condition for accepting scientific papers for publication. Remembering that this was over 50 years ago, it is a bit worrying that it has taken so long for the statistical community to formally acknowledge the fact that statistical significance does not imply scientific importance, or constitutes enough evidence to support a causal hypothesis.\nClosing thoughts This post has only scratched the surface of the vast field of study of causality. At this point, I feel like I’ve read quite a bit, and it is time to apply what I learned to real problems. I encounter questions of causality in my everyday work, but haven’t fully applied formal causal inference to any problem yet. 
My view is that everyone needs to at least be aware of the need to consider causality, and of what it’d take to truly prove causal impact. A large proportion of what many people need in practice may be addressed by Hill’s criteria, rather than by formal methods for causal analysis. Nonetheless, I will report back when I get a chance to apply formal causal inference to real datasets. Stay tuned!\n","wordCount":"2223","inLanguage":"en","image":"https://yanirseroussi.com/freediving.jpg","datePublished":"2016-05-14T19:57:03Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" 
r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions</h1><div class=post-meta><span title='2016-05-14 19:57:03 +0000 UTC'>May 14, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-05-15-diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg alt width=1920 height=672></figure><div class=post-content><p class=intro-note>Background: I have previously written about <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/>the need for real insights that address the why behind events, not only the what and how</a>. This was followed by a <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>fairly popular post on causality</a>, which was heavily influenced by Samantha Kleinberg's book <a href=http://www.skleinberg.org/why/ target=_blank rel=noopener>Why: A Guide to Finding and Using Causes</a>. This post continues my exploration of the field, and is primarily based on Kleinberg's previous book: <a href=http://www.skleinberg.org/causality_book/index.html target=_blank rel=noopener>Causality, Probability, and Time</a>.</p><p>The study of causality and causal inference is central to science in general and data science in particular. Being able to distinguish between correlation and causation is key to designing effective interventions in business, public policy, medicine, and many other fields. There are quite a few approaches to inferring causal relationships from data. 
In this post, I discuss some aspects of <a href=https://en.wikipedia.org/wiki/Judea_Pearl target=_blank rel=noopener>Judea Pearl&rsquo;s</a> graphical modelling approach, and how its limitations are addressed in recent work by <a href=http://www.skleinberg.org/ target=_blank rel=noopener>Samantha Kleinberg</a>. I then finish with a brief survey of the <a href=https://en.wikipedia.org/wiki/Bradford_Hill_criteria target=_blank rel=noopener>Bradford Hill criteria</a> and their applicability to a key limitation of all causal inference methods: The need for untested assumptions.</p><h2 id=hahahugoshortcode42s0hbhb-overcoming-my-pearl-bias><figure class=float-right><a href=judea-pearl.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 435px,
+<meta name=keywords content="causal inference,data science,insights,predictive modelling"><meta name=description content="Discussing the need for untested assumptions and temporality in causal inference. Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions"><meta property="og:description" content="Discussing the need for untested assumptions and 
temporality in causal inference. Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"><meta property="og:image" content="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-05-14T19:57:03+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg"><meta name=twitter:title content="Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions"><meta name=twitter:description content="Discussing the need for untested assumptions and temporality in causal inference. 
Mostly based on Samantha Kleinberg&rsquo;s Causality, Probability, and Time."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","item":"https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","name":"Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions","description":"Discussing the need for untested assumptions and temporality in causal inference. Mostly based on Samantha Kleinberg\u0026rsquo;s Causality, Probability, and Time.","keywords":["causal inference","data science","insights","predictive modelling"],"articleBody":"Background: I have previously written about the need for real insights that address the why behind events, not only the what and how. This was followed by a fairly popular post on causality, which was heavily influenced by Samantha Kleinberg's book Why: A Guide to Finding and Using Causes. This post continues my exploration of the field, and is primarily based on Kleinberg's previous book: Causality, Probability, and Time.\nThe study of causality and causal inference is central to science in general and data science in particular. Being able to distinguish between correlation and causation is key to designing effective interventions in business, public policy, medicine, and many other fields. There are quite a few approaches to inferring causal relationships from data. 
In this post, I discuss some aspects of Judea Pearl’s graphical modelling approach, and how its limitations are addressed in recent work by Samantha Kleinberg. I then finish with a brief survey of the Bradford Hill criteria and their applicability to a key limitation of all causal inference methods: The need for untested assumptions.\nJudea Pearl Overcoming my Pearl bias First, I must disclose that I have a personal bias in favour of Pearl’s work. While I’ve never met him, Pearl is my academic grandfather – he was the PhD advisor of my main PhD supervisor (Ingrid Zukerman). My first serious exposure to his work was through a Sydney reading group, where we discussed parts of Pearl’s approach to causal inference. Recently, I refreshed my knowledge of Pearl causality by reading Causal inference in statistics: An overview. I am by no means an expert in Pearl’s huge body of work, but I think I understand enough of it to write something of use.\nPearl’s theory of causality employs Bayesian networks to represent causal structures. These are directed acyclic graphs, where each vertex represents a variable, and an edge from X to Y implies that X causes Y. Pearl also introduces the do(X) operator, which simulates interventions by removing all the causes of X, setting it to a constant. There is much more to this theory, but two of its main contributions are the formalisation of causal concepts that are often given only a verbal treatment, and the explicit encoding of causal assumptions. These assumptions must be made by the modeller based on background knowledge, and are encoded in the graph’s structure – a missing edge between two vertices indicates that there is no direct causal relationship between the two variables.\nMy main issue with Pearl’s treatment of causality is that he doesn’t explicitly handle time. 
While time can be encoded into Pearl’s models (e.g., via dynamic Bayesian networks), there is nothing that prevents creation of models where the future causes changes in the past. A closely-related issue is that Pearl’s causal models must be directed acyclic graphs, making it hard to model feedback loops. For example, Pearl says that “mud does not cause rain”, but this isn’t true – water from mud evaporates, causing rain (which causes mud). What’s true is that “mud now doesn’t cause rain now” or something along these lines, which is something that must be accounted for by adding temporal information to the models.\nNonetheless, Pearl’s theory is an important step forward in the study of causality. In his words, “in the bulk of the statistical literature before 2000, causal claims rarely appear in the mathematics. They surface only in the verbal interpretation that investigators occasionally attach to certain associations, and in the verbal description with which investigators justify assumptions.” The importance of formal causal analysis cannot be overstated, as it underlies many decisions that affect our lives. However, it seems to me like there’s still plenty of work to be done before causal analysis becomes as established as other statistical tools.\nSamantha Kleinberg Kleinberg: Addressing gaps in Pearl’s work I recently finished reading Samantha Kleinberg’s Causality, Probability, and Time. Kleinberg dedicates a good portion of the book to presenting the history of causality and discussing its many definitions. As hinted by the book’s title, Kleinberg believes that one cannot discuss causality without considering time. In her words: “One of the most critical pieces of information about causality, though – the time it takes for the cause to produce its effect – has been largely ignored by both philosophical theories and computational methods. 
If we do not know when the effect will occur, we have little hope of being able to act successfully using the causal relationship.” Following this assertion, Kleinberg presents a new approach to causal inference that is based on probabilistic computation tree logic (PCTL). With PCTL, one can concisely express probabilistic temporal statements. For example, if we observe a potential cause c occurring at time t, and a possible effect e occurring at time t’, we can use PCTL to state the hypothesis that in general, after c becomes true, it takes between one and |t’ – t| time units for e to become true with probability at least p, i.e., c leads to e:\nIt is obvious why PCTL may be a better fit than Bayesian networks for expressing causal statements. For example, with a Bayesian network, we can easily express the statement that smoking causes lung cancer with probability 0.3, but this isn’t that useful, as it doesn’t tell us how long it’ll take for cancer to develop. With PCTL, we can state that smoking causes lung cancer in 5-30 years with probability at least 0.3. This matches our knowledge that cancer doesn’t develop immediately – one cigarette won’t kill you.\nOne of the key concepts introduced by Kleinberg is that of causal significance. Calculating the causal significance of a cause c to an effect e relies on first identifying the set X of potential (or prima facie) causes of e. The set X contains all discrete variables x such that E[e|x]≠E[e] and x occurs earlier than e. Given the set X, the causal significance of c to e is the mean of E[e|c∧x] – E[e|¬c∧x] for all x≠c. The intuition is that if a cause c is significant, its causal significance value will be high when other potential causes are held fixed. For example, if c is heavy smoking and e is severity of lung cancer (with e=0 meaning no cancer), the expected value of e given c is likely to be higher than the expected value of e given ¬c, when conditioned on any other potential cause. 
Once causal significance has been measured, we can separate significant causes from insignificant causes by setting a threshold on causal significance values (this threshold can be inferred from the data). Significant causes are considered to be genuine if the data is stationary and the common causes of all pairs of variables have been included, which is a very strong condition that may be hard to fulfil in realistic scenarios. However, causal significance is an evolving concept – last year, Huang and Kleinberg introduced a new definition of causal significance that can be inferred faster and yield more accurate results. My general feeling is that this line of research will continue to yield many interesting and useful results in coming years.\nKleinberg’s work is not without its limitations. In addition to the assumptions that causal relationships are stationary and the requirement to identify all potential causes, the recently-introduced definition of causal significance also requires the relationships to be linear and additive (though this limitation may be relaxed in future work). Another issue is that most of the evaluation in the studies I’ve read was done on synthetic datasets. While there are some results on real-life health and finance data, I find it hard to judge the practicality of utilising Kleinberg’s methods without applying them to problems that I’m more familiar with. Finally, as with other work in the field of causal inference, we need to have some degree of belief in untested assumptions to reach useful conclusions. In Kleinberg’s words:\nThus, a just so cause is genuine in the case where all of the outlined assumptions hold (namely that all common causes are included, the structure is representative of the system and, when data is used, a formula satisfied by the data will be satisfied by the structure). 
Our belief in whether a cause is genuine, in the case where it is not certain that the assumptions hold, should be proportional to how much we believe that the assumptions are true.\nAustin Bradford Hill Hill: Testing untested assumptions To the best of my knowledge, all causal inference methods rely on untested assumptions. Specifically, we can never include all the variables in the universe in our models. Therefore, any conclusions drawn are reliant on deciding what, when, and how to measure potential causes and effects. Another issue is that no matter how good and believable our modelling is, we cannot use causal inference to convince unreasonable people. For example, some people may cite divine intervention as an unmeasurable cause of anything and everything. In addition, people with certain commercial interests often try to raise doubt about well-established causal mechanisms by making unreasonable claims for evidence of various hidden factors. For example, tobacco companies used to claim that both smoking and lung cancer were caused by a common hidden factor, making the link between smoking and lung cancer a mere association.\nAssuming that we are dealing with reasonable people, there’s still the question of where we should get our untested assumptions from. This question is fairly old, and has been partly answered in 1965 by Austin Bradford Hill, with nine criteria that he recommended should be considered before calling an association causal:\nStrength: How strong is the association? For example, lung cancer deaths of heavy smokers are 20-30 times greater than those of non-smokers. Consistency: Has the association been repeatedly observed in various circumstances? For example, many different populations have exhibited an association between smoking rates and cancer. Specificity: Can we pin down specific instances of the effect to specific instances of the cause? 
Hill sees this as a nice-to-have condition rather than a must-have – cases with multiple possible causes may not fulfil the specificity requirement. Temporality: Do we know that c leads to e or are we observing them together? This is a condition that isn’t always easy to fulfil, especially when dealing with feedback loops and slow processes. Biological gradient: Hill’s focus was on medicine, and this condition refers to the association exhibiting some dose-response curve. This can be generalised to other fields, as we can expect some regularity in the effect if it is a function of the cause (though it doesn’t have to be a linear function). Plausibility: Do we know of a mechanism that can explain how the cause brings about the effect? Coherence: Does the association conflict with our current knowledge? Even if it does, it isn’t enough to rule out causality, as our current knowledge may be incomplete or wrong. Experiment: If possible, running controlled experiments may yield very powerful evidence in favour of causation. Analogy: Do we know of any similar cause-and-effect relationships? Hill summarises the list of criteria (or viewpoints) with the following statements.\nHere then are nine different viewpoints from all of which we should study association before we cry causation. What I do not believe – and this has been suggested – is that we can usefully lay down some hard-and-fast rules of evidence that must be obeyed before we accept cause and effect. None of my nine viewpoints can bring indisputable evidence for or against the cause-and-effect hypothesis and none can be required as a sine qua non. What they can do, with greater or less strength, is to help us to make up our minds on the fundamental question – is there any other way of explaining the set of facts before us, is there any other answer equally, or more, likely than cause and effect?\nNo formal tests of significance can answer those questions. 
Such tests can, and should, remind us of the effects that the play of chance can create, and they will instruct us in the likely magnitude of those effects. Beyond that they contribute nothing to the ‘proof’ of our hypothesis.\nHill then goes on to criticise the increased focus on statistical significance as a condition for accepting scientific papers for publication. Remembering that this was over 50 years ago, it is a bit worrying that it has taken so long for the statistical community to formally acknowledge the fact that statistical significance does not imply scientific importance, or constitutes enough evidence to support a causal hypothesis.\nClosing thoughts This post has only scratched the surface of the vast field of study of causality. At this point, I feel like I’ve read quite a bit, and it is time to apply what I learned to real problems. I encounter questions of causality in my everyday work, but haven’t fully applied formal causal inference to any problem yet. My view is that everyone needs to at least be aware of the need to consider causality, and of what it’d take to truly prove causal impact. A large proportion of what many people need in practice may be addressed by Hill’s criteria, rather than by formal methods for causal analysis. Nonetheless, I will report back when I get a chance to apply formal causal inference to real datasets. 
Stay tuned!\n","wordCount":"2223","inLanguage":"en","image":"https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg","datePublished":"2016-05-14T19:57:03Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions</h1><div class=post-meta><span title='2016-05-14 19:57:03 +0000 UTC'>May 14, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-05-15-diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving_hub5d72e3c45cdff9da93ca2e12cce16a2_673766_1500x0_resize_q75_box.jpg 1500w 
,https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/freediving.jpg alt width=1920 height=672></figure><div class=post-content><p class=intro-note>Background: I have previously written about <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/>the need for real insights that address the why behind events, not only the what and how</a>. This was followed by a <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>fairly popular post on causality</a>, which was heavily influenced by Samantha Kleinberg's book <a href=http://www.skleinberg.org/why/ target=_blank rel=noopener>Why: A Guide to Finding and Using Causes</a>. This post continues my exploration of the field, and is primarily based on Kleinberg's previous book: <a href=http://www.skleinberg.org/causality_book/index.html target=_blank rel=noopener>Causality, Probability, and Time</a>.</p><p>The study of causality and causal inference is central to science in general and data science in particular. Being able to distinguish between correlation and causation is key to designing effective interventions in business, public policy, medicine, and many other fields. There are quite a few approaches to inferring causal relationships from data. In this post, I discuss some aspects of <a href=https://en.wikipedia.org/wiki/Judea_Pearl target=_blank rel=noopener>Judea Pearl&rsquo;s</a> graphical modelling approach, and how its limitations are addressed in recent work by <a href=http://www.skleinberg.org/ target=_blank rel=noopener>Samantha Kleinberg</a>. 
I then finish with a brief survey of the <a href=https://en.wikipedia.org/wiki/Bradford_Hill_criteria target=_blank rel=noopener>Bradford Hill criteria</a> and their applicability to a key limitation of all causal inference methods: The need for untested assumptions.</p><h2 id=hahahugoshortcode42s0hbhb-overcoming-my-pearl-bias><figure class=float-right><a href=judea-pearl.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 435px,
 100vw" srcset="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/judea-pearl_hu9a8d9dce36ef378faf19e0843274044e_79154_360x0_resize_q75_box.jpg 360w,
 https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/judea-pearl.jpg 435w," src=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/judea-pearl.jpg alt="Judea Pearl" width=150 loading=lazy></a><figcaption><p>Judea Pearl</p></figcaption></figure>Overcoming my Pearl bias</h2><p>First, I must disclose that I have a personal bias in favour of Pearl&rsquo;s work. While I&rsquo;ve never met him, Pearl is my academic grandfather – he was the PhD advisor of my main PhD supervisor (Ingrid Zukerman). My first serious exposure to his work was through a Sydney reading group, where we discussed parts of Pearl&rsquo;s approach to causal inference. Recently, I refreshed my knowledge of Pearl causality by reading <a href=http://ftp.cs.ucla.edu/pub/stat_ser/r350.pdf target=_blank rel=noopener>Causal inference in statistics: An overview</a>. I am by no means an expert in Pearl&rsquo;s huge body of work, but I think I understand enough of it to write something of use.</p><p>Pearl&rsquo;s theory of causality employs Bayesian networks to represent causal structures. These are directed acyclic graphs, where each vertex represents a variable, and an edge from X to Y implies that X causes Y. Pearl also introduces the <code>do(X)</code> operator, which simulates interventions by removing all the causes of X, setting it to a constant. There is much more to this theory, but two of its main contributions are the formalisation of causal concepts that are often given only a verbal treatment, and the explicit encoding of causal assumptions. 
These assumptions must be made by the modeller based on background knowledge, and are encoded in the graph&rsquo;s structure – a missing edge between two vertices indicates that there is no direct causal relationship between the two variables.</p><p>My main issue with Pearl&rsquo;s treatment of causality is that he doesn&rsquo;t explicitly handle time. While time can be encoded into Pearl&rsquo;s models (e.g., via dynamic Bayesian networks), there is nothing that prevents creation of models where the future causes changes in the past. A closely-related issue is that Pearl&rsquo;s causal models must be directed <em>acyclic</em> graphs, making it hard to model feedback loops. For example, Pearl says that &ldquo;mud does not cause rain&rdquo;, but this isn&rsquo;t true – water from mud evaporates, causing rain (which causes mud). What&rsquo;s true is that &ldquo;mud now doesn&rsquo;t cause rain now&rdquo; or something along these lines, which is something that must be accounted for by adding temporal information to the models.</p><p>Nonetheless, Pearl&rsquo;s theory is an important step forward in the study of causality. In his words, &ldquo;<em>in the bulk of the statistical literature before 2000, causal claims rarely appear in the mathematics. They surface only in the verbal interpretation that investigators occasionally attach to certain associations, and in the verbal description with which investigators justify assumptions.</em>&rdquo; The importance of formal causal analysis cannot be overstated, as it underlies many decisions that affect our lives. However, it seems to me like there&rsquo;s still plenty of work to be done before causal analysis becomes as established as other statistical tools.</p><h2 id=hahahugoshortcode42s1hbhb-kleinberg-addressing-gaps-in-pearls-work><figure class=float-right><a href=samantha-kleinberg.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 586px,
 100vw" srcset="https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/samantha-kleinberg_hu3d03a01dcc18bc5be0e67db3d8d209a6_52017_360x0_resize_q75_box.jpg 360w,
diff --git a/2016/06/19/making-bayesian-ab-testing-more-accessible/index.html b/2016/06/19/making-bayesian-ab-testing-more-accessible/index.html
index 6071b12d4..b9bfa3d62 100644
--- a/2016/06/19/making-bayesian-ab-testing-more-accessible/index.html
+++ b/2016/06/19/making-bayesian-ab-testing-more-accessible/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Making Bayesian A/B testing more accessible | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="a/b testing,analytics,causal inference,data science,statistics"><meta name=description content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Making Bayesian A/B testing more accessible"><meta property="og:description" content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision 
rules."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"><meta property="og:image" content="https://yanirseroussi.com/bayesian-split-testing-calculator.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-06-19T10:32:15+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bayesian-split-testing-calculator.png"><meta name=twitter:title content="Making Bayesian A/B testing more accessible"><meta name=twitter:description content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Making Bayesian A/B testing more accessible","item":"https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Making Bayesian A/B testing more accessible","name":"Making Bayesian A\/B testing more accessible","description":"A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules.","keywords":["a/b testing","analytics","causal inference","data science","statistics"],"articleBody":"Much has been written in recent years on the pitfalls of using traditional hypothesis testing with online A/B tests. A key issue is that you’re likely to end up with many false positives if you repeatedly check your results and stop as soon as you reach statistical significance. 
One way of dealing with this issue is by following a Bayesian approach to deciding when the experiment should be stopped. While I find the Bayesian view of statistics much more intuitive than the frequentist view, it can be quite challenging to explain Bayesian concepts to laypeople. Hence, I decided to build a new Bayesian A/B testing calculator, which aims to make these concepts clear to any user. This post discusses the general problem and existing solutions, followed by a review of the new tool and how it can be improved further.\nThe problem The classic A/B testing problem is as follows. Suppose we run an experiment where we have a control group and a test group. Participants (typically website visitors) are allocated to groups randomly, and each group is presented with a different variant of the website or page (e.g., variant A is assigned to the control group and variant B is assigned to the test group). Our aim is to increase the overall number of binary successes, where success can be defined as clicking a button or opening a new account. Hence, we track the number of trials in each group together with the number of successes. For a given group, the number of successes divided by number of trials is the group’s raw success rate.\nGiven the results of an experiment (trials and successes for each group), there are a few questions we would typically like to answer:\nShould we choose variant A or variant B to maximise our success rate? How much would our success rate change if we chose one variant over the other? Do we have enough data or should we keep experimenting? It’s important to note some points that might be obvious, but are often overlooked. First, we run an experiment because we assume that it will help us uncover a causal link, where something about A or B is hypothesised to cause people to behave differently, thereby affecting the overall success rate. 
Second, we want to make a decision and choose either A or B, rather than maintain multiple variants and present the best variant depending on a participant’s features (a problem that’s addressed by contextual bandits, for example). Third, online A/B testing is different from traditional experiments in a lab, because we often have little control over the characteristics of our participants, and when, where, and how they choose to interact with our experiment. This is an important point, because it means that we may need to wait a long time until we get a representative sample of the population. In addition, the raw numbers of trials and successes can’t tell us whether the sample is representative.\nBayesian solutions Many blog posts have been written on how to use Bayesian statistics to answer the above questions, so I won’t get into too much detail here (see the posts by David Robinson, Maciej Kula, Chris Stucchio, and Evan Miller if you need more background). The general idea is that we assume that the success rates for the control and test variants are drawn from Beta(αA, βA) and Beta(αB, βB), respectively, where Beta(α, β) is the beta distribution with shape parameters α and β (which yields values in the [0, 1] interval). As the experiment runs, we update the parameters of the distributions – each success gets added to the group’s α, and each unsuccessful trial gets added to the group’s β. It is often reasonable to assume that the prior (i.e., initial) values of α and β are the same for both variants. 
If we denote the prior values of the parameters with α and β, and the number of successes and trials for group x with Sx and Tx respectively, we get that the success rates are distributed according to Beta(α + SA, β + TA – SA) for control and Beta(α + SB, β + TB – SB) for test.\nFor example, if α = β = 1, TA = 200, SA = 120, TB = 200, and SB = 100, plotting the probability density functions yields the following chart (A – blue, B – red):\nGiven these distributions, we can calculate the most probable range for the success rate of each variant, and estimate the difference in success rate between the variants. These can be calculated by deriving closed formulas, or by drawing samples from each distribution. In addition, it is important to note that the distributions change as we gather more data, even if the raw success rates don’t. For example, multiplying each count by 10 to obtain TA = 2000, SA = 1200, TB = 2000, and SB = 1000 doesn’t change the success rates, but it does change the distributions – they become much narrower:\nIn the second case we’ve gathered ten times the data, which made the distributions much more distinct. Intuitively, this means we can now be more confident that the success rate of A is higher than that of B. Quantifying this confidence and deciding when to conclude the experiment isn’t straightforward, and should depend on factors that aren’t fully captured by the raw counts. The way I chose to address this issue is presented below, after briefly discussing existing calculators and their limitations.\nExisting online calculators The beauty of frequentist tools for significance testing is that they always give you a simple answer. For example, if we plug the numbers from the first case above (TA = 200, SA = 120, TB = 200, and SB = 100) into Evan Miller’s calculator, we get:\nUnfortunately, both Bayesian calculators that I’m aware of have some limitations. 
Plugging the same numbers into the calculators by PeakConversion and Lyst would inform you that the probability of A being best is approximately 0.98, but it won’t tell you what’s the best way forward given this information. PeakConversion also outputs the 95% success rate intervals for A (between 53.1% and 66.7%) and B (between 43.1% and 56.9%), but it doesn’t let users set the prior values α and β (it uses α = β = 0.5). The ability to set priors based on what we know about our experimental setting is an important feature of Bayesian statistics that can help reduce the number of false positives. Hiding the priors in PeakConversion’s calculator makes it easier to use but less powerful than Lyst’s tool. In addition, Lyst’s calculator presents the distribution of differences between the success rates of A and B, i.e., the effect size. This is important because we may not bother implementing certain changes if the effect is negligible, even if the probability of one variant being better than the other is very close to 1.\nDespite being more powerful, I find Lyst’s calculator just a bit too technical. Specifically, setting the α and β priors requires some familiarity with the beta distribution, which many people don’t have. Also, the effect size distribution is important, but can be hard to get one’s head around. Therefore, I decided to extend Lyst’s calculator, aiming to release a new tool that is both powerful and easy to use.\nBuilding the new calculator The source code for Lyst’s calculator is available on GitHub, so I decided to use that as the foundation of the new calculator. The first step was to convert the code from HTML, CSS, and JavaScript to Jade, Sass, and CoffeeScript, and clean up some code duplication. As the calculator is served from my GitHub Pages domain, it was easiest to put all the code in that repository. 
Once I had an environment and codebase that I was happy with, it was time to make functional changes:\nChange the layout to be responsive, so it’d work well on mobile devices. Enable sharing of results by changing the URL when the input changes. Provide clear instructions, so that the calculator can be used by people who don’t necessarily have a strong background in statistics. Allow users to set priors based on more familiar figures than the beta distribution’s α and β priors. Make a clear and well-justified recommendation on how to proceed. While the first two changes were straightforward to implement, the other points were somewhat more challenging. Specifically, providing clear explanations that assume little background knowledge isn’t simple, and I still feel that the current version of the new calculator is a bit too wordy (this may be improved in the future based on user feedback – suggestions welcome). Life would be easier if everyone thought of observed values as being drawn from distributions, but in my experience this is not always the case. However, I believe it is important to communicate the reality of uncertainty, so I don’t want to hide it from users of the calculator, even at the price of more elaborate explanations.\nMaking the priors more intuitive was a bit tricky. At first, I thought I’d let users state their prior knowledge in terms of the mean and variance of past performance, relying on the fact that for Beta(α, β) the mean μ is α / (α + β), and the variance σ2 is αβ / (α + β)2(α + β + 1). The problem is that while the mean is simple to set, as it is always in the (0, 1) range, the upper bound for the variance depends on the mean. Specifically, it can be shown that the variance is in the range (0, μ(1 – μ)). Therefore, I decided to let users quantify their uncertainty about the mean as a number u in the range (0, 1), where σ2 = uμ(1 – μ). Having played with the calculator a bit, I think this makes it easier to set good informative priors. 
It is also worth noting that I considered allowing users to set different priors for the control and test group, but decided against it to reduce complexity. In addition, it makes sense to have the same prior for both groups – if you have a strong belief or knowledge on which one is going to perform better, you probably don’t need to run an experiment.\nOne of the main reasons I decided to build the calculator was because I wanted a tool that outputs a clear recommendation. This proved to be the most challenging (and interesting) part of this project, as there are quite a few options for Bayesian stopping rules. After reading David Robinson’s review of the limitations of a stopping rule based on the expected loss, and a few of the other resources mentioned in his post, I decided to go with a combination of the third and fourth rules tested by John Kruschke. These rules rely on a threshold of caring, which is the minimum effect size that is seen as significant by the user. For example, if we’re running experiments on the conversion rate of a landing page, we may decide that we don’t care if the absolute change in conversion rate is less than 0.1%. Given this threshold and data from the experiment, the following recommendations are possible:\nStop the experiment and implement either variant, because the difference between the variants is smaller than the threshold. Stop the experiment and implement the winning variant, because the difference between the variants is greater than the threshold. Keep running the experiment, because there isn’t enough data to make a decision. Formally, Kruschke’s rules work as follows. Given the minimum effect threshold t, we define a region of practical equivalence (ROPE) to zero difference as the interval [-t, t]. Then, we compare the ROPE to the 95% high density interval (HDI) of the distribution of differences between A and B. 
When comparing the ROPE and HDI, there are three options that correspond to the recommendations above:\nThe ROPE is completely contained in the HDI (stop the experiment and implement either variant). The intersection between the ROPE and HDI is empty (stop the experiment and implement the winning variant). The ROPE and HDI only partly overlap (keep running the experiment). Kruschke’s post shows that making the rule more restrictive by adding a notion of user-settable precision can reduce the rate of false positives. The idea is to stop only if the HDI is narrower than precision multiplied by the width of the ROPE. Intuitively, this forces the experimenter to collect more data because it makes the posterior distributions narrower (as shown by the charts above). I found it hard to explain the idea of precision, and didn’t want to confuse users by adding another parameter, so I decided to use a constant precision value of 0.8. If the ROPE and HDI don’t overlap, the tool makes a recommendation to stop, accompanied by a binary level of confidence: high if the precision condition is met, and low otherwise.\nPutting in the numbers from the running example (TA = 200, SA = 120, TB = 200, and SB = 100) together with a minimum effect of 1%, prior success rate of 50%, and 57.74% uncertainty (equivalent to α = β = 1), we get the following output:\nThe full results also include plots of the distributions and their high density intervals. I’m pretty happy with the richer information provided by the calculator, though it still has some limitations and areas that can be improved.\nLimitations and potential improvements As mentioned above, I’d love to reduce the wordiness of the calculator while keeping it self-contained, but I need some feedback to understand if any explanations are redundant. It’d also be great to reduce the reliance on magic numbers, such as the 95% HDI and 0.8 precision used for generating a recommendation. 
However, making these settable by users would increase the complexity of using the calculator, which is already harder to use than the frequentist alternative. Nonetheless, it’s important to remember that oversimplification is the reason why it’s easier to make the wrong decision when following the classical approach.\nOther potential changes include switching to a closed-form formula rather than draws from a distribution, comparing more than two variants, and improving Kruschke’s stopping rules by simulating more scenarios than those considered in his post. In addition, I’d like to go beyond binary responses (success/failure) to support continuous rewards (e.g., revenue), and allow users to specify different costs for the variants (e.g., implementing B may cost more than sticking with A).\nFinally, it is important to keep in mind that significance testing can’t tell you whether your sample is representative of the population. For example, if you run an experiment on a very popular website, you can get a sample of thousands of people within a few minutes. Concluding an experiment based on such a sample is probably a bad idea, as it is plausible that you would reach different conclusions if you kept running the experiment for a few days, to reduce the effect that the time of day has on the results. Similarly, a few days may not be enough if your user population behaves differently on weekends – you would need to run the experiment over a few weeks. This can be extended to months and years to rule out seasonal effects, but it is up to the experimenter to weigh the practicality of considering such factors versus the need to make decisions (see articles by Peep Laja, Martin Goodson, Sam Ju, and Kohavi et al. for more details). 
The main thing to remember is that you just cannot completely eliminate uncertainty and the need to consider background knowledge, which is why I believe that helping more people follow the Bayesian approach is a step in the right direction.\n","wordCount":"2637","inLanguage":"en","image":"https://yanirseroussi.com/bayesian-split-testing-calculator.png","datePublished":"2016-06-19T10:32:15Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" 
y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Making Bayesian A/B testing more accessible</h1><div class=post-meta><span title='2016-06-19 10:32:15 +0000 UTC'>June 19, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-06-19-making-bayesian-ab-testing-more-accessible/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png 1280w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png alt width=1280 height=600></figure><div class=post-content><p>Much has been written in recent years on the pitfalls of using traditional hypothesis testing with online A/B tests. A key issue is that <a href=http://www.evanmiller.org/how-not-to-run-an-ab-test.html target=_blank rel=noopener>you&rsquo;re likely to end up with many false positives if you repeatedly check your results and stop as soon as you reach statistical significance</a>. One way of dealing with this issue is by <a href=http://www.evanmiller.org/bayesian-ab-testing.html target=_blank rel=noopener>following a Bayesian approach</a> to deciding when the experiment should be stopped. While I find the Bayesian view of statistics much more intuitive than the frequentist view, it can be quite challenging to explain Bayesian concepts to laypeople. Hence, I decided to build a new <a href=https://yanirs.github.io/tools/split-test-calculator/ target=_blank rel=noopener>Bayesian A/B testing calculator</a>, which aims to make these concepts clear to any user. This post discusses the general problem and existing solutions, followed by a review of the new tool and how it can be improved further.</p><h2 id=the-problem>The problem<a hidden class=anchor aria-hidden=true href=#the-problem>#</a></h2><p>The classic A/B testing problem is as follows. Suppose we run an experiment where we have a control group and a test group. Participants (typically website visitors) are allocated to groups randomly, and each group is presented with a different variant of the website or page (e.g., variant A is assigned to the control group and variant B is assigned to the test group). Our aim is to increase the overall number of binary <em>successes</em>, where success can be defined as clicking a button or opening a new account. 
Hence, we track the number of <em>trials</em> in each group together with the number of successes. For a given group, the number of successes divided by number of trials is the group&rsquo;s raw success rate.</p><p>Given the results of an experiment (trials and successes for each group), there are a few questions we would typically like to answer:</p><ol><li>Should we choose variant A or variant B to maximise our success rate?</li><li>How much would our success rate change if we chose one variant over the other?</li><li>Do we have enough data or should we keep experimenting?</li></ol><p>It&rsquo;s important to note some points that might be obvious, but are often overlooked. First, we run an experiment because we assume that it will help us uncover a <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>causal link</a>, where something about A or B is hypothesised to cause people to behave differently, thereby affecting the overall success rate. Second, we <em>want</em> to make a decision and choose either A or B, rather than maintain multiple variants and present the best variant depending on a participant&rsquo;s features (a problem that&rsquo;s addressed by <a href=https://en.wikipedia.org/wiki/Multi-armed_bandit#Contextual_Bandit target=_blank rel=noopener>contextual bandits</a>, for example). Third, online A/B testing is different from traditional experiments in a lab, because we often have little control over the characteristics of our participants, and when, where, and how they choose to interact with our experiment. This is an important point, because it means that we may need to wait a long time until we get a representative sample of the population. 
In addition, the raw numbers of trials and successes can&rsquo;t tell us whether the sample is representative.</p><h2 id=bayesian-solutions>Bayesian solutions<a hidden class=anchor aria-hidden=true href=#bayesian-solutions>#</a></h2><p>Many blog posts have been written on how to use Bayesian statistics to answer the above questions, so I won&rsquo;t get into too much detail here (see the posts by <a href=http://varianceexplained.org/r/bayesian_ab_baseball/ target=_blank rel=noopener>David Robinson</a>, <a href=http://developers.lyst.com/2014/05/10/bayesian-ab-testing/ target=_blank rel=noopener>Maciej Kula</a>, <a href=https://www.chrisstucchio.com/blog/2014/bayesian_ab_decision_rule.html target=_blank rel=noopener>Chris Stucchio</a>, and <a href=http://www.evanmiller.org/bayesian-ab-testing.html target=_blank rel=noopener>Evan Miller</a> if you need more background). The general idea is that we assume that the success rates for the control and test variants are drawn from Beta(α<sub>A</sub>, β<sub>A</sub>) and Beta(α<sub>B</sub>, β<sub>B</sub>), respectively, where Beta(α, β) is the <a href=https://en.wikipedia.org/wiki/Beta_distribution target=_blank rel=noopener>beta distribution</a> with shape parameters α and β (which yields values in the [0, 1] interval). As the experiment runs, we update the parameters of the distributions – each success gets added to the group&rsquo;s α, and each unsuccessful trial gets added to the group&rsquo;s β. It is often reasonable to assume that the prior (i.e., initial) values of α and β are the same for both variants. 
If we denote the prior values of the parameters with α<sub></sub> and β<sub></sub>, and the number of successes and trials for group x with S<sub>x</sub> and T<sub>x</sub> respectively, we get that the success rates are distributed according to Beta(α<sub></sub> + S<sub>A</sub>, β<sub></sub> + T<sub>A</sub> – S<sub>A</sub>) for control and Beta(α<sub></sub> + S<sub>B</sub>, β<sub></sub> + T<sub>B</sub> – S<sub>B</sub>) for test.</p><p>For example, if α<sub></sub> = β<sub></sub> = 1, T<sub>A</sub> = 200, S<sub>A</sub> = 120, T<sub>B</sub> = 200, and S<sub>B</sub> = 100, plotting the probability density functions yields the following chart (A – blue, B – red):</p><figure><a href=beta-distributions-examples.png target=_blank rel=noopener><img sizes="(min-width: 768px) 614px,
+<meta name=keywords content="a/b testing,analytics,causal inference,data science,statistics"><meta name=description content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Making Bayesian A/B testing more accessible"><meta property="og:description" content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision 
rules."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"><meta property="og:image" content="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-06-19T10:32:15+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png"><meta name=twitter:title content="Making Bayesian A/B testing more accessible"><meta name=twitter:description content="A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Making Bayesian A/B testing more accessible","item":"https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Making Bayesian A/B testing more accessible","name":"Making Bayesian A\/B testing more accessible","description":"A web tool I built to interpret A/B test results in a Bayesian way, including prior specification, visualisations, and decision rules.","keywords":["a/b testing","analytics","causal inference","data science","statistics"],"articleBody":"Much has been written in recent years on the pitfalls of using traditional hypothesis testing with online A/B tests. 
A key issue is that you’re likely to end up with many false positives if you repeatedly check your results and stop as soon as you reach statistical significance. One way of dealing with this issue is by following a Bayesian approach to deciding when the experiment should be stopped. While I find the Bayesian view of statistics much more intuitive than the frequentist view, it can be quite challenging to explain Bayesian concepts to laypeople. Hence, I decided to build a new Bayesian A/B testing calculator, which aims to make these concepts clear to any user. This post discusses the general problem and existing solutions, followed by a review of the new tool and how it can be improved further.\nThe problem The classic A/B testing problem is as follows. Suppose we run an experiment where we have a control group and a test group. Participants (typically website visitors) are allocated to groups randomly, and each group is presented with a different variant of the website or page (e.g., variant A is assigned to the control group and variant B is assigned to the test group). Our aim is to increase the overall number of binary successes, where success can be defined as clicking a button or opening a new account. Hence, we track the number of trials in each group together with the number of successes. For a given group, the number of successes divided by number of trials is the group’s raw success rate.\nGiven the results of an experiment (trials and successes for each group), there are a few questions we would typically like to answer:\nShould we choose variant A or variant B to maximise our success rate? How much would our success rate change if we chose one variant over the other? Do we have enough data or should we keep experimenting? It’s important to note some points that might be obvious, but are often overlooked. 
First, we run an experiment because we assume that it will help us uncover a causal link, where something about A or B is hypothesised to cause people to behave differently, thereby affecting the overall success rate. Second, we want to make a decision and choose either A or B, rather than maintain multiple variants and present the best variant depending on a participant’s features (a problem that’s addressed by contextual bandits, for example). Third, online A/B testing is different from traditional experiments in a lab, because we often have little control over the characteristics of our participants, and when, where, and how they choose to interact with our experiment. This is an important point, because it means that we may need to wait a long time until we get a representative sample of the population. In addition, the raw numbers of trials and successes can’t tell us whether the sample is representative.\nBayesian solutions Many blog posts have been written on how to use Bayesian statistics to answer the above questions, so I won’t get into too much detail here (see the posts by David Robinson, Maciej Kula, Chris Stucchio, and Evan Miller if you need more background). The general idea is that we assume that the success rates for the control and test variants are drawn from Beta(αA, βA) and Beta(αB, βB), respectively, where Beta(α, β) is the beta distribution with shape parameters α and β (which yields values in the [0, 1] interval). As the experiment runs, we update the parameters of the distributions – each success gets added to the group’s α, and each unsuccessful trial gets added to the group’s β. It is often reasonable to assume that the prior (i.e., initial) values of α and β are the same for both variants. 
If we denote the prior values of the parameters with α and β, and the number of successes and trials for group x with Sx and Tx respectively, we get that the success rates are distributed according to Beta(α + SA, β + TA – SA) for control and Beta(α + SB, β + TB – SB) for test.\nFor example, if α = β = 1, TA = 200, SA = 120, TB = 200, and SB = 100, plotting the probability density functions yields the following chart (A – blue, B – red):\nGiven these distributions, we can calculate the most probable range for the success rate of each variant, and estimate the difference in success rate between the variants. These can be calculated by deriving closed formulas, or by drawing samples from each distribution. In addition, it is important to note that the distributions change as we gather more data, even if the raw success rates don’t. For example, multiplying each count by 10 to obtain TA = 2000, SA = 1200, TB = 2000, and SB = 1000 doesn’t change the success rates, but it does change the distributions – they become much narrower:\nIn the second case we’ve gathered ten times the data, which made the distributions much more distinct. Intuitively, this means we can now be more confident that the success rate of A is higher than that of B. Quantifying this confidence and deciding when to conclude the experiment isn’t straightforward, and should depend on factors that aren’t fully captured by the raw counts. The way I chose to address this issue is presented below, after briefly discussing existing calculators and their limitations.\nExisting online calculators The beauty of frequentist tools for significance testing is that they always give you a simple answer. For example, if we plug the numbers from the first case above (TA = 200, SA = 120, TB = 200, and SB = 100) into Evan Miller’s calculator, we get:\nUnfortunately, both Bayesian calculators that I’m aware of have some limitations. 
Plugging the same numbers into the calculators by PeakConversion and Lyst would inform you that the probability of A being best is approximately 0.98, but it won’t tell you what’s the best way forward given this information. PeakConversion also outputs the 95% success rate intervals for A (between 53.1% and 66.7%) and B (between 43.1% and 56.9%), but it doesn’t let users set the prior values α and β (it uses α = β = 0.5). The ability to set priors based on what we know about our experimental setting is an important feature of Bayesian statistics that can help reduce the number of false positives. Hiding the priors in PeakConversion’s calculator makes it easier to use but less powerful than Lyst’s tool. In addition, Lyst’s calculator presents the distribution of differences between the success rates of A and B, i.e., the effect size. This is important because we may not bother implementing certain changes if the effect is negligible, even if the probability of one variant being better than the other is very close to 1.\nDespite being more powerful, I find Lyst’s calculator just a bit too technical. Specifically, setting the α and β priors requires some familiarity with the beta distribution, which many people don’t have. Also, the effect size distribution is important, but can be hard to get one’s head around. Therefore, I decided to extend Lyst’s calculator, aiming to release a new tool that is both powerful and easy to use.\nBuilding the new calculator The source code for Lyst’s calculator is available on GitHub, so I decided to use that as the foundation of the new calculator. The first step was to convert the code from HTML, CSS, and JavaScript to Jade, Sass, and CoffeeScript, and clean up some code duplication. As the calculator is served from my GitHub Pages domain, it was easiest to put all the code in that repository. 
Once I had an environment and codebase that I was happy with, it was time to make functional changes:\nChange the layout to be responsive, so it’d work well on mobile devices. Enable sharing of results by changing the URL when the input changes. Provide clear instructions, so that the calculator can be used by people who don’t necessarily have a strong background in statistics. Allow users to set priors based on more familiar figures than the beta distribution’s α and β priors. Make a clear and well-justified recommendation on how to proceed. While the first two changes were straightforward to implement, the other points were somewhat more challenging. Specifically, providing clear explanations that assume little background knowledge isn’t simple, and I still feel that the current version of the new calculator is a bit too wordy (this may be improved in the future based on user feedback – suggestions welcome). Life would be easier if everyone thought of observed values as being drawn from distributions, but in my experience this is not always the case. However, I believe it is important to communicate the reality of uncertainty, so I don’t want to hide it from users of the calculator, even at the price of more elaborate explanations.\nMaking the priors more intuitive was a bit tricky. At first, I thought I’d let users state their prior knowledge in terms of the mean and variance of past performance, relying on the fact that for Beta(α, β) the mean μ is α / (α + β), and the variance σ2 is αβ / (α + β)2(α + β + 1). The problem is that while the mean is simple to set, as it is always in the (0, 1) range, the upper bound for the variance depends on the mean. Specifically, it can be shown that the variance is in the range (0, μ(1 – μ)). Therefore, I decided to let users quantify their uncertainty about the mean as a number u in the range (0, 1), where σ2 = uμ(1 – μ). Having played with the calculator a bit, I think this makes it easier to set good informative priors. 
It is also worth noting that I considered allowing users to set different priors for the control and test group, but decided against it to reduce complexity. In addition, it makes sense to have the same prior for both groups – if you have a strong belief or knowledge on which one is going to perform better, you probably don’t need to run an experiment.\nOne of the main reasons I decided to build the calculator was because I wanted a tool that outputs a clear recommendation. This proved to be the most challenging (and interesting) part of this project, as there are quite a few options for Bayesian stopping rules. After reading David Robinson’s review of the limitations of a stopping rule based on the expected loss, and a few of the other resources mentioned in his post, I decided to go with a combination of the third and fourth rules tested by John Kruschke. These rules rely on a threshold of caring, which is the minimum effect size that is seen as significant by the user. For example, if we’re running experiments on the conversion rate of a landing page, we may decide that we don’t care if the absolute change in conversion rate is less than 0.1%. Given this threshold and data from the experiment, the following recommendations are possible:\nStop the experiment and implement either variant, because the difference between the variants is smaller than the threshold. Stop the experiment and implement the winning variant, because the difference between the variants is greater than the threshold. Keep running the experiment, because there isn’t enough data to make a decision. Formally, Kruschke’s rules work as follows. Given the minimum effect threshold t, we define a region of practical equivalence (ROPE) to zero difference as the interval [-t, t]. Then, we compare the ROPE to the 95% high density interval (HDI) of the distribution of differences between A and B. 
When comparing the ROPE and HDI, there are three options that correspond to the recommendations above:\nThe ROPE is completely contained in the HDI (stop the experiment and implement either variant). The intersection between the ROPE and HDI is empty (stop the experiment and implement the winning variant). The ROPE and HDI only partly overlap (keep running the experiment). Kruschke’s post shows that making the rule more restrictive by adding a notion of user-settable precision can reduce the rate of false positives. The idea is to stop only if the HDI is narrower than precision multiplied by the width of the ROPE. Intuitively, this forces the experimenter to collect more data because it makes the posterior distributions narrower (as shown by the charts above). I found it hard to explain the idea of precision, and didn’t want to confuse users by adding another parameter, so I decided to use a constant precision value of 0.8. If the ROPE and HDI don’t overlap, the tool makes a recommendation to stop, accompanied by a binary level of confidence: high if the precision condition is met, and low otherwise.\nPutting in the numbers from the running example (TA = 200, SA = 120, TB = 200, and SB = 100) together with a minimum effect of 1%, prior success rate of 50%, and 57.74% uncertainty (equivalent to α = β = 1), we get the following output:\nThe full results also include plots of the distributions and their high density intervals. I’m pretty happy with the richer information provided by the calculator, though it still has some limitations and areas that can be improved.\nLimitations and potential improvements As mentioned above, I’d love to reduce the wordiness of the calculator while keeping it self-contained, but I need some feedback to understand if any explanations are redundant. It’d also be great to reduce the reliance on magic numbers, such as the 95% HDI and 0.8 precision used for generating a recommendation. 
However, making these settable by users would increase the complexity of using the calculator, which is already harder to use than the frequentist alternative. Nonetheless, it’s important to remember that oversimplification is the reason why it’s easier to make the wrong decision when following the classical approach.\nOther potential changes include switching to a closed-form formula rather than draws from a distribution, comparing more than two variants, and improving Kruschke’s stopping rules by simulating more scenarios than those considered in his post. In addition, I’d like to go beyond binary responses (success/failure) to support continuous rewards (e.g., revenue), and allow users to specify different costs for the variants (e.g., implementing B may cost more than sticking with A).\nFinally, it is important to keep in mind that significance testing can’t tell you whether your sample is representative of the population. For example, if you run an experiment on a very popular website, you can get a sample of thousands of people within a few minutes. Concluding an experiment based on such a sample is probably a bad idea, as it is plausible that you would reach different conclusions if you kept running the experiment for a few days, to reduce the effect that the time of day has on the results. Similarly, a few days may not be enough if your user population behaves differently on weekends – you would need to run the experiment over a few weeks. This can be extended to months and years to rule out seasonal effects, but it is up to the experimenter to weigh the practicality of considering such factors versus the need to make decisions (see articles by Peep Laja, Martin Goodson, Sam Ju, and Kohavi et al. for more details). 
The main thing to remember is that you just cannot completely eliminate uncertainty and the need to consider background knowledge, which is why I believe that helping more people follow the Bayesian approach is a step in the right direction.\n","wordCount":"2637","inLanguage":"en","image":"https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png","datePublished":"2016-06-19T10:32:15Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" 
x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Making Bayesian A/B testing more accessible</h1><div class=post-meta><span title='2016-06-19 10:32:15 +0000 UTC'>June 19, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-06-19-making-bayesian-ab-testing-more-accessible/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator_hu2128e6cfab878bae9a83560d8015bf85_45345_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png 1280w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/bayesian-split-testing-calculator.png alt width=1280 height=600></figure><div class=post-content><p>Much has been written in recent years on the pitfalls of using traditional hypothesis testing with online A/B tests. A key issue is that <a href=http://www.evanmiller.org/how-not-to-run-an-ab-test.html target=_blank rel=noopener>you&rsquo;re likely to end up with many false positives if you repeatedly check your results and stop as soon as you reach statistical significance</a>. One way of dealing with this issue is by <a href=http://www.evanmiller.org/bayesian-ab-testing.html target=_blank rel=noopener>following a Bayesian approach</a> to deciding when the experiment should be stopped. While I find the Bayesian view of statistics much more intuitive than the frequentist view, it can be quite challenging to explain Bayesian concepts to laypeople. Hence, I decided to build a new <a href=https://yanirs.github.io/tools/split-test-calculator/ target=_blank rel=noopener>Bayesian A/B testing calculator</a>, which aims to make these concepts clear to any user. This post discusses the general problem and existing solutions, followed by a review of the new tool and how it can be improved further.</p><h2 id=the-problem>The problem<a hidden class=anchor aria-hidden=true href=#the-problem>#</a></h2><p>The classic A/B testing problem is as follows. Suppose we run an experiment where we have a control group and a test group. Participants (typically website visitors) are allocated to groups randomly, and each group is presented with a different variant of the website or page (e.g., variant A is assigned to the control group and variant B is assigned to the test group). Our aim is to increase the overall number of binary <em>successes</em>, where success can be defined as clicking a button or opening a new account. 
Hence, we track the number of <em>trials</em> in each group together with the number of successes. For a given group, the number of successes divided by number of trials is the group&rsquo;s raw success rate.</p><p>Given the results of an experiment (trials and successes for each group), there are a few questions we would typically like to answer:</p><ol><li>Should we choose variant A or variant B to maximise our success rate?</li><li>How much would our success rate change if we chose one variant over the other?</li><li>Do we have enough data or should we keep experimenting?</li></ol><p>It&rsquo;s important to note some points that might be obvious, but are often overlooked. First, we run an experiment because we assume that it will help us uncover a <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>causal link</a>, where something about A or B is hypothesised to cause people to behave differently, thereby affecting the overall success rate. Second, we <em>want</em> to make a decision and choose either A or B, rather than maintain multiple variants and present the best variant depending on a participant&rsquo;s features (a problem that&rsquo;s addressed by <a href=https://en.wikipedia.org/wiki/Multi-armed_bandit#Contextual_Bandit target=_blank rel=noopener>contextual bandits</a>, for example). Third, online A/B testing is different from traditional experiments in a lab, because we often have little control over the characteristics of our participants, and when, where, and how they choose to interact with our experiment. This is an important point, because it means that we may need to wait a long time until we get a representative sample of the population. 
In addition, the raw numbers of trials and successes can&rsquo;t tell us whether the sample is representative.</p><h2 id=bayesian-solutions>Bayesian solutions<a hidden class=anchor aria-hidden=true href=#bayesian-solutions>#</a></h2><p>Many blog posts have been written on how to use Bayesian statistics to answer the above questions, so I won&rsquo;t get into too much detail here (see the posts by <a href=http://varianceexplained.org/r/bayesian_ab_baseball/ target=_blank rel=noopener>David Robinson</a>, <a href=http://developers.lyst.com/2014/05/10/bayesian-ab-testing/ target=_blank rel=noopener>Maciej Kula</a>, <a href=https://www.chrisstucchio.com/blog/2014/bayesian_ab_decision_rule.html target=_blank rel=noopener>Chris Stucchio</a>, and <a href=http://www.evanmiller.org/bayesian-ab-testing.html target=_blank rel=noopener>Evan Miller</a> if you need more background). The general idea is that we assume that the success rates for the control and test variants are drawn from Beta(α<sub>A</sub>, β<sub>A</sub>) and Beta(α<sub>B</sub>, β<sub>B</sub>), respectively, where Beta(α, β) is the <a href=https://en.wikipedia.org/wiki/Beta_distribution target=_blank rel=noopener>beta distribution</a> with shape parameters α and β (which yields values in the [0, 1] interval). As the experiment runs, we update the parameters of the distributions – each success gets added to the group&rsquo;s α, and each unsuccessful trial gets added to the group&rsquo;s β. It is often reasonable to assume that the prior (i.e., initial) values of α and β are the same for both variants. 
If we denote the prior values of the parameters with α<sub>0</sub> and β<sub>0</sub>, and the number of successes and trials for group x with S<sub>x</sub> and T<sub>x</sub> respectively, we get that the success rates are distributed according to Beta(α<sub>0</sub> + S<sub>A</sub>, β<sub>0</sub> + T<sub>A</sub> – S<sub>A</sub>) for control and Beta(α<sub>0</sub> + S<sub>B</sub>, β<sub>0</sub> + T<sub>B</sub> – S<sub>B</sub>) for test.</p><p>For example, if α<sub>0</sub> = β<sub>0</sub> = 1, T<sub>A</sub> = 200, S<sub>A</sub> = 120, T<sub>B</sub> = 200, and S<sub>B</sub> = 100, plotting the probability density functions yields the following chart (A – blue, B – red):</p><figure><a href=beta-distributions-examples.png target=_blank rel=noopener><img sizes="(min-width: 768px) 614px,
 100vw" srcset="https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/beta-distributions-examples_hu6083fd67121821db147a70ce91579621_12977_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/beta-distributions-examples_hu6083fd67121821db147a70ce91579621_12977_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/beta-distributions-examples.png 614w," src=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/beta-distributions-examples.png alt="Beta distributions examples" loading=lazy></a></figure><p>Given these distributions, we can calculate the most probable range for the success rate of each variant, and estimate the difference in success rate between the variants. These can be calculated by <a href=http://www.evanmiller.org/bayesian-ab-testing.html target=_blank rel=noopener>deriving closed formulas</a>, or by <a href=http://varianceexplained.org/r/bayesian_ab_baseball/ target=_blank rel=noopener>drawing samples from each distribution</a>. In addition, it is important to note that the distributions change as we gather more data, even if the raw success rates don&rsquo;t. For example, multiplying each count by 10 to obtain T<sub>A</sub> = 2000, S<sub>A</sub> = 1200, T<sub>B</sub> = 2000, and S<sub>B</sub> = 1000 doesn&rsquo;t change the success rates, but it does change the distributions – they become much narrower:</p><figure><a href=beta-distributions-examples-narrower.png target=_blank rel=noopener><img sizes="(min-width: 768px) 613px,
diff --git a/2016/08/04/is-data-scientist-a-useless-job-title/index.html b/2016/08/04/is-data-scientist-a-useless-job-title/index.html
index 105cce314..4c10982f4 100644
--- a/2016/08/04/is-data-scientist-a-useless-job-title/index.html
+++ b/2016/08/04/is-data-scientist-a-useless-job-title/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Is Data Scientist a useless job title? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="business,data science,marketing,software engineering,statistics"><meta name=description content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Is Data Scientist a useless job title?"><meta property="og:description" content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. 
The work they do can still be useful, though."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"><meta property="og:image" content="https://yanirseroussi.com/silly-data-scientist.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-08-04T22:26:03+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/silly-data-scientist.jpg"><meta name=twitter:title content="Is Data Scientist a useless job title?"><meta name=twitter:description content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Is Data Scientist a useless job title?","item":"https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Is Data Scientist a useless job title?","name":"Is Data Scientist a useless job title?","description":"It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though.","keywords":["business","data science","marketing","software engineering","statistics"],"articleBody":"Data science can be defined as either the intersection or union of software engineering and statistics. 
In recent years, the field seems to be gravitating towards the broader unifying definition, where everyone who touches data in some way can call themselves a data scientist. Hence, while many people whose job title is Data Scientist do very useful work, the title itself has become fairly useless as an indication of what the title holder actually does. This post briefly discusses how we got to this point, where I think the field is likely to go, and what data scientists can do to remain relevant.\nThe many definitions of data science About two years ago, I published a post discussing the definition of data scientist by Josh Wills, as a person who is better at statistics than any software engineer and better at software engineering than any statistician. I still quite like this definition, because it describes me well, as someone with education and experience in both areas. However, to be better at statistics than any software engineer and better at software engineering than any statistician, you have to be truly proficient in both areas, as some software engineers are comfortable running complex experiments, and some statisticians are capable of building solid software. Quite a few people who don’t meet Wills’s criteria have decided they wanted to be data scientists too, expanding the definition to be something along the lines of someone who is better at statistics than some software engineers (who’ve never done anything fancier than calculating a sample mean) and better at software engineering than some statisticians (who can’t code).\nIn addition to software engineering and statistics, data scientists are expected to deeply understand the domain in which they operate, and be excellent communicators. 
This leads to the proliferation of increasingly ridiculous Venn diagrams, such as the one by Stephan Kolassa:\nThe perfect data scientist from Kolassa’s Venn diagram is a mythical sexy unicorn ninja rockstar who can transform a business just by thinking about its problems. A more realistic (and less exciting) view of data scientists is offered by Rob Hyndman:\nI take the broad inclusive view. I am a data scientist because I do data analysis, and I do research on the methodology of data analysis. The way I would express it is that I’m a data scientist with a statistical perspective and training. Other data scientists will have different perspectives and different training.\nWe are comfortable with having medical specialists, and we will go to a GP, endocrinologist, physiotherapist, etc., when we have medical problems. We also need to take a team perspective on data science.\nNone of us can realistically cover the whole field, and so we specialise on certain problems and techniques. It is crazy to think that a doctor must know everything, and it is just as crazy to think a data scientist should be an expert in statistics, mathematics, computing, programming, the application discipline, etc. Instead, we need teams of data scientists with different skills, with each being aware of the boundary of their expertise, and who to call in for help when required.\nIndeed, data science is too broad for any data scientist to fully master all areas of expertise. Despite the misleading name of the field, it encompasses both science and engineering, which is why data scientists can be categorised into two types, as suggested by Michael Hochster:\nType A (analyst): focused on static data analysis. Essentially a statistician with coding skills. Type B (builder): focused on building data products. Essentially a software engineer with knowledge in machine learning and statistics. Type A is more of a scientist, and Type B is more of an engineer. 
Many people end up doing both, but it is pretty rare to have an even 50-50 split between the science and engineering sides, as they require different mindsets. This is illustrated by the following diagram, showing the information flow in science and engineering (source).\nWhy Data Scientist is a useless job title Given that a data scientist is someone who does data analysis, and/or a scientist, and/or an engineer, what does it mean for a person to hold a Data Scientist position? It can mean anything, as it depends on the company and industry. A job title like Data Scientist at Company is about as meaningful as Engineer at Organisation, Scientist at Institution, or Doctor at Hospital. It gives you a general idea what the person’s background is, but provides little clue as to what the person actually does on a day-to-day basis.\nDon’t believe me? Let’s look at a few examples. Noah Lorang (Basecamp) is OK with mostly doing arithmetic. David Robinson (Stack Overflow) builds machine learning features and internal R packages, and visualises data. Robert Chang (Twitter) helps surface product insights, create data pipelines, run A/B tests, and build predictive models. Rob Hyndman (Monash University) and Jake VanderPlas (University of Washington) are academic data scientists who contribute to major R and Python open-source libraries, respectively. From personal knowledge, data scientists in many Australian enterprises focus on generating reports and building dashboards. And in my current role at Car Next Door I do a little bit of everything, e.g., implement new features, fix bugs, set up data pipelines and dashboards, run experiments, build predictive models, and analyse data.\nTo be clear, the work done by many data scientists is very useful. The number of decisions made based on arbitrary thresholds and some means multiplied together on a spreadsheet can be horrifying to those of us with minimal knowledge of basic statistics. 
Having a good data scientist on board can have a transformative effect on a business. But it’s also very easy to end up with ineffective hires working on low-impact tasks if the business has no idea what their data scientists should be doing. This situation isn’t uncommon, given the wide range of activities that may be performed by data scientists, the lack of consensus on the definition of the field, and a general disagreement over who deserves to be called a real data scientist. We need to move beyond the hype towards clearer definitions that would help align the expectations of data scientists with those of their current and future employers.\nIt’s time to specialise Four years ago, I changed my LinkedIn title from software engineer with a research background to data scientist. Various offers started coming my way, and they haven’t stopped since. Many people have done the same. To be a data scientist, you just need to call yourself a data scientist. The dilution of the term means that as a job title, it is useless. Useless terms are unlikely to last, so if you’re seriously thinking of becoming a data scientist, you should also consider specialising. I believe we’ll see the emergence of new specific titles, such as Machine Learning Engineer. In addition, less “sexy” titles, such as Data Analyst, may end up making a comeback. In any case, those of us who invest in building their skills, delivering value in their job, and making sure people know about it don’t have much to worry about.\nWhat do you think? Is specialisation inevitable or are generalist data scientists here to stay? 
Please let me know privately, via Twitter, or in the comments section.\n","wordCount":"1213","inLanguage":"en","image":"https://yanirseroussi.com/silly-data-scientist.jpg","datePublished":"2016-08-04T22:26:03Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Is Data Scientist a useless job title?</h1><div class=post-meta><span title='2016-08-04 22:26:03 +0000 UTC'>August 4, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-08-04-is-data-scientist-a-useless-job-title/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg 1600w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg alt width=1600 height=960></figure><div 
class=post-content><p>Data science can be defined as either the <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>intersection</a> or <a href=http://robjhyndman.com/hyndsight/am-i-a-data-scientist/ target=_blank rel=noopener>union</a> of software engineering and statistics. In recent years, the field seems to be gravitating towards the broader unifying definition, where everyone who touches data in some way can call themselves a data scientist. Hence, while many people whose job title is Data Scientist do very useful work, the title itself has become fairly useless as an indication of what the title holder actually does. This post briefly discusses how we got to this point, where I think the field is likely to go, and what data scientists can do to remain relevant.</p><h2 id=the-many-definitions-of-data-science>The many definitions of data science<a hidden class=anchor aria-hidden=true href=#the-many-definitions-of-data-science>#</a></h2><p>About two years ago, I <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>published a post discussing the definition of data scientist by Josh Wills</a>, as a <em>person who is better at statistics than any software engineer and better at software engineering than any statistician</em>. I still quite like this definition, because it describes me well, as someone with education and experience in both areas. However, to be better at statistics than <strong>any</strong> software engineer and better at software engineering than <strong>any</strong> statistician, you have to be truly proficient in both areas, as some software engineers are <a href=https://code.facebook.com/posts/1072626246134461/introducing-fblearner-flow-facebook-s-ai-backbone/ target=_blank rel=noopener>comfortable running complex experiments</a>, and some statisticians <a href=https://www.r-project.org/contributors.html target=_blank rel=noopener>are capable of building solid software</a>. 
Quite a few people who don&rsquo;t meet Wills&rsquo;s criteria have decided they wanted to be data scientists too, expanding the definition to be something along the lines of <em>someone who is better at statistics than some software engineers (who&rsquo;ve never done anything fancier than calculating a sample mean) and better at software engineering than some statisticians (who can&rsquo;t code)</em>.</p><p>In addition to software engineering and statistics, data scientists are expected to deeply understand the domain in which they operate, and be excellent communicators. This leads to the proliferation of increasingly ridiculous Venn diagrams, such as the one by <a href=http://datascience.stackexchange.com/a/2406 target=_blank rel=noopener>Stephan Kolassa</a>:</p><figure><a href=perfect-data-scientist-venn-diagram.png target=_blank rel=noopener><img sizes="(min-width: 768px) 572px,
+<meta name=keywords content="business,data science,marketing,software engineering,statistics"><meta name=description content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Is Data Scientist a useless job title?"><meta property="og:description" content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. 
The work they do can still be useful, though."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"><meta property="og:image" content="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-08-04T22:26:03+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg"><meta name=twitter:title content="Is Data Scientist a useless job title?"><meta name=twitter:description content="It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Is Data Scientist a useless job title?","item":"https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Is Data Scientist a useless job title?","name":"Is Data Scientist a useless job title?","description":"It seems like anyone who touches data can call themselves a data scientist, which makes the title useless. The work they do can still be useful, though.","keywords":["business","data science","marketing","software engineering","statistics"],"articleBody":"Data science can be defined as either the intersection or union of software engineering and statistics. 
In recent years, the field seems to be gravitating towards the broader unifying definition, where everyone who touches data in some way can call themselves a data scientist. Hence, while many people whose job title is Data Scientist do very useful work, the title itself has become fairly useless as an indication of what the title holder actually does. This post briefly discusses how we got to this point, where I think the field is likely to go, and what data scientists can do to remain relevant.\nThe many definitions of data science About two years ago, I published a post discussing the definition of data scientist by Josh Wills, as a person who is better at statistics than any software engineer and better at software engineering than any statistician. I still quite like this definition, because it describes me well, as someone with education and experience in both areas. However, to be better at statistics than any software engineer and better at software engineering than any statistician, you have to be truly proficient in both areas, as some software engineers are comfortable running complex experiments, and some statisticians are capable of building solid software. Quite a few people who don’t meet Wills’s criteria have decided they wanted to be data scientists too, expanding the definition to be something along the lines of someone who is better at statistics than some software engineers (who’ve never done anything fancier than calculating a sample mean) and better at software engineering than some statisticians (who can’t code).\nIn addition to software engineering and statistics, data scientists are expected to deeply understand the domain in which they operate, and be excellent communicators. 
This leads to the proliferation of increasingly ridiculous Venn diagrams, such as the one by Stephan Kolassa:\nThe perfect data scientist from Kolassa’s Venn diagram is a mythical sexy unicorn ninja rockstar who can transform a business just by thinking about its problems. A more realistic (and less exciting) view of data scientists is offered by Rob Hyndman:\nI take the broad inclusive view. I am a data scientist because I do data analysis, and I do research on the methodology of data analysis. The way I would express it is that I’m a data scientist with a statistical perspective and training. Other data scientists will have different perspectives and different training.\nWe are comfortable with having medical specialists, and we will go to a GP, endocrinologist, physiotherapist, etc., when we have medical problems. We also need to take a team perspective on data science.\nNone of us can realistically cover the whole field, and so we specialise on certain problems and techniques. It is crazy to think that a doctor must know everything, and it is just as crazy to think a data scientist should be an expert in statistics, mathematics, computing, programming, the application discipline, etc. Instead, we need teams of data scientists with different skills, with each being aware of the boundary of their expertise, and who to call in for help when required.\nIndeed, data science is too broad for any data scientist to fully master all areas of expertise. Despite the misleading name of the field, it encompasses both science and engineering, which is why data scientists can be categorised into two types, as suggested by Michael Hochster:\nType A (analyst): focused on static data analysis. Essentially a statistician with coding skills. Type B (builder): focused on building data products. Essentially a software engineer with knowledge in machine learning and statistics. Type A is more of a scientist, and Type B is more of an engineer. 
Many people end up doing both, but it is pretty rare to have an even 50-50 split between the science and engineering sides, as they require different mindsets. This is illustrated by the following diagram, showing the information flow in science and engineering (source).\nWhy Data Scientist is a useless job title Given that a data scientist is someone who does data analysis, and/or a scientist, and/or an engineer, what does it mean for a person to hold a Data Scientist position? It can mean anything, as it depends on the company and industry. A job title like Data Scientist at Company is about as meaningful as Engineer at Organisation, Scientist at Institution, or Doctor at Hospital. It gives you a general idea what the person’s background is, but provides little clue as to what the person actually does on a day-to-day basis.\nDon’t believe me? Let’s look at a few examples. Noah Lorang (Basecamp) is OK with mostly doing arithmetic. David Robinson (Stack Overflow) builds machine learning features and internal R packages, and visualises data. Robert Chang (Twitter) helps surface product insights, create data pipelines, run A/B tests, and build predictive models. Rob Hyndman (Monash University) and Jake VanderPlas (University of Washington) are academic data scientists who contribute to major R and Python open-source libraries, respectively. From personal knowledge, data scientists in many Australian enterprises focus on generating reports and building dashboards. And in my current role at Car Next Door I do a little bit of everything, e.g., implement new features, fix bugs, set up data pipelines and dashboards, run experiments, build predictive models, and analyse data.\nTo be clear, the work done by many data scientists is very useful. The number of decisions made based on arbitrary thresholds and some means multiplied together on a spreadsheet can be horrifying to those of us with minimal knowledge of basic statistics. 
Having a good data scientist on board can have a transformative effect on a business. But it’s also very easy to end up with ineffective hires working on low-impact tasks if the business has no idea what their data scientists should be doing. This situation isn’t uncommon, given the wide range of activities that may be performed by data scientists, the lack of consensus on the definition of the field, and a general disagreement over who deserves to be called a real data scientist. We need to move beyond the hype towards clearer definitions that would help align the expectations of data scientists with those of their current and future employers.\nIt’s time to specialise Four years ago, I changed my LinkedIn title from software engineer with a research background to data scientist. Various offers started coming my way, and they haven’t stopped since. Many people have done the same. To be a data scientist, you just need to call yourself a data scientist. The dilution of the term means that as a job title, it is useless. Useless terms are unlikely to last, so if you’re seriously thinking of becoming a data scientist, you should also consider specialising. I believe we’ll see the emergence of new specific titles, such as Machine Learning Engineer. In addition, less “sexy” titles, such as Data Analyst, may end up making a comeback. In any case, those of us who invest in building their skills, delivering value in their job, and making sure people know about it don’t have much to worry about.\nWhat do you think? Is specialisation inevitable or are generalist data scientists here to stay? 
Please let me know privately, via Twitter, or in the comments section.\n","wordCount":"1213","inLanguage":"en","image":"https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg","datePublished":"2016-08-04T22:26:03Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Is Data Scientist a useless job title?</h1><div class=post-meta><span title='2016-08-04 22:26:03 +0000 UTC'>August 4, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-08-04-is-data-scientist-a-useless-job-title/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist_huea073ab29ab3b372a3741ce291b85548_111212_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg 1600w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/silly-data-scientist.jpg alt 
width=1600 height=960></figure><div class=post-content><p>Data science can be defined as either the <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>intersection</a> or <a href=http://robjhyndman.com/hyndsight/am-i-a-data-scientist/ target=_blank rel=noopener>union</a> of software engineering and statistics. In recent years, the field seems to be gravitating towards the broader unifying definition, where everyone who touches data in some way can call themselves a data scientist. Hence, while many people whose job title is Data Scientist do very useful work, the title itself has become fairly useless as an indication of what the title holder actually does. This post briefly discusses how we got to this point, where I think the field is likely to go, and what data scientists can do to remain relevant.</p><h2 id=the-many-definitions-of-data-science>The many definitions of data science<a hidden class=anchor aria-hidden=true href=#the-many-definitions-of-data-science>#</a></h2><p>About two years ago, I <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>published a post discussing the definition of data scientist by Josh Wills</a>, as a <em>person who is better at statistics than any software engineer and better at software engineering than any statistician</em>. I still quite like this definition, because it describes me well, as someone with education and experience in both areas. However, to be better at statistics than <strong>any</strong> software engineer and better at software engineering than <strong>any</strong> statistician, you have to be truly proficient in both areas, as some software engineers are <a href=https://code.facebook.com/posts/1072626246134461/introducing-fblearner-flow-facebook-s-ai-backbone/ target=_blank rel=noopener>comfortable running complex experiments</a>, and some statisticians <a href=https://www.r-project.org/contributors.html target=_blank rel=noopener>are capable of building solid software</a>. 
Quite a few people who don&rsquo;t meet Wills&rsquo;s criteria have decided they wanted to be data scientists too, expanding the definition to be something along the lines of <em>someone who is better at statistics than some software engineers (who&rsquo;ve never done anything fancier than calculating a sample mean) and better at software engineering than some statisticians (who can&rsquo;t code)</em>.</p><p>In addition to software engineering and statistics, data scientists are expected to deeply understand the domain in which they operate, and be excellent communicators. This leads to the proliferation of increasingly ridiculous Venn diagrams, such as the one by <a href=http://datascience.stackexchange.com/a/2406 target=_blank rel=noopener>Stephan Kolassa</a>:</p><figure><a href=perfect-data-scientist-venn-diagram.png target=_blank rel=noopener><img sizes="(min-width: 768px) 572px,
 100vw" srcset="https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/perfect-data-scientist-venn-diagram_hud98ced44a7d59059ed9f0909603cdd3f_139916_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/perfect-data-scientist-venn-diagram_hud98ced44a7d59059ed9f0909603cdd3f_139916_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/perfect-data-scientist-venn-diagram.png 572w," src=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/perfect-data-scientist-venn-diagram.png alt="Perfect data scientist Venn diagram" loading=lazy></a></figure><p>The perfect data scientist from Kolassa&rsquo;s Venn diagram is a mythical sexy unicorn ninja rockstar who can transform a business just by thinking about its problems. A more realistic (and less exciting) view of data scientists is <a href=http://robjhyndman.com/hyndsight/am-i-a-data-scientist/ target=_blank rel=noopener>offered by Rob Hyndman</a>:</p><blockquote><p>I take the broad inclusive view. I am a data scientist because I do data analysis, and I do research on the methodology of data analysis. The way I would express it is that I’m a data scientist with a statistical perspective and training. Other data scientists will have different perspectives and different training.</p><p>We are comfortable with having medical specialists, and we will go to a GP, endocrinologist, physiotherapist, etc., when we have medical problems. We also need to take a team perspective on data science.</p><p>None of us can realistically cover the whole field, and so we specialise on certain problems and techniques. It is crazy to think that a doctor must know everything, and it is just as crazy to think a data scientist should be an expert in statistics, mathematics, computing, programming, the application discipline, etc. Instead, we need teams of data scientists with different skills, with each being aware of the boundary of their expertise, and who to call in for help when required.</p></blockquote><p>Indeed, data science is too broad for any data scientist to fully master all areas of expertise. 
Despite the misleading name of the field, it encompasses both science and engineering, which is why data scientists can be categorised into two types, as <a href="https://www.quora.com/What-is-data-science/answer/Michael-Hochster?srid=2sK8&share=98226ca3" target=_blank rel=noopener>suggested by Michael Hochster</a>:</p><ul><li>Type A (analyst): focused on static data analysis. Essentially a statistician with coding skills.</li><li>Type B (builder): focused on building data products. Essentially a software engineer with knowledge in machine learning and statistics.</li></ul><p>Type A is more of a scientist, and Type B is more of an engineer. Many people end up doing both, but it is pretty rare to have an even 50-50 split between the science and engineering sides, as they require different mindsets. This is illustrated by the following diagram, showing the information flow in science and engineering (<a href=https://www.farnamstreetblog.com/2013/07/the-difference-between-science-and-engineering/ target=_blank rel=noopener>source</a>).</p><figure><a href=science-versus-engineering.png target=_blank rel=noopener><img sizes="(min-width: 768px) 581px,
diff --git a/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html b/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html
index d99942fce..f4930551b 100644
--- a/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html
+++ b/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>If you don’t pay attention, data can drive you off a cliff | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,business,data science,marketing,statistics"><meta name=description content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="If you don’t pay attention, data can drive you off a cliff"><meta property="og:description" content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved 
quantities."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"><meta property="og:image" content="https://yanirseroussi.com/data-driven-off-cliff.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-08-21T21:34:17+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/data-driven-off-cliff.jpg"><meta name=twitter:title content="If you don’t pay attention, data can drive you off a cliff"><meta name=twitter:description content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"If you don’t pay attention, data can drive you off a cliff","item":"https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"If you don’t pay attention, data can drive you off a cliff","name":"If you don’t pay attention, data can drive you off a cliff","description":"Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities.","keywords":["analytics","business","data science","marketing","statistics"],"articleBody":"You’re a hotshot manager. You love your dashboards and you keep your finger on the beating pulse of the business. You take pride in using data to drive your decisions rather than shooting from the hip like one of those old-school 1950s bosses. 
This is the 21st century, and data is king. You even hired a sexy statistician or data scientist, though you don’t really understand what they do. Never mind, you can proudly tell all your friends that you are leading a modern data-driven team. Nothing can go wrong, right? Incorrect. If you don’t pay attention, data can drive you off a cliff. This article discusses seven of the ways this can happen. Read on to ensure it doesn’t happen to you.\n1. Pretending uncertainty doesn’t exist Source: Standard error, Wikipedia Last month, your favourite metric was 5.2%. This month, it’s 5.5%. Looks like things are getting better – you must be doing something right! But is 5.5% really different from 5.2%? All things being equal, you should expect some variability in most of your metrics. The values you see are drawn from a distribution of possible values, which means you can’t be certain what value you’ll be seeing next. Fortunately, with more data you would be able to quantify this uncertainty and know which values are more likely. Don’t fear or ignore uncertainty. Embrace and study it, and you’ll be on the right track.\n2. Confusing observed and unobserved quantities Source: Estimates of Uncertainty around the RBA’s Forecasts Everyone agrees that the future is uncertain. We can generate forecasts with varying degrees of confidence, but we never know for sure what’s going to happen. However, some people tend to ignore uncertainty in forecasts, treating the unobserved future values as comparable to observed present values. For example, marketers often compare customer lifetime value with the cost of acquiring a customer. The problem is that customer lifetime value relies on a prediction of the net profit from a customer (so it’s largely unobserved and uncertain), while the business has much more control and certainty around the cost of acquiring a customer (though it’s not completely known). 
Treating the two values as if they’re observed and known is risky, as it can lead to major financial losses.\n3. Thinking that your data is correct Ask anyone who works with data, and they’ll tell you that it’s always messy. A well-known saying among data scientists is that 80% of the work is data cleaning and the other 20% is complaining about data cleaning. Hence, it’s likely that at least some of the figures you’re relying on to make decisions are somewhat inaccurate. However, it’s important to remember that this doesn’t make the data completely useless. But if something looks too good to be true, it probably isn’t true. Finally, it’s highly unlikely that the data is always correct when you like the results and always incorrect when the results aren’t favourable, so don’t use the “guy on the internet said our data isn’t 100% correct” excuse to push back on inconvenient truths.\n4. Believing that your data is complete No matter how big you are, your data doesn’t capture everything your customers do. Even Google and the NSA don’t have a full view of what people are up to in the non-digital world, and they can’t completely read our minds (yet). Most businesses have much less data than the big tech companies, and they look a bit silly trying to explain customer behaviour using only the data they have. At the end of the day, you have to work with the data you can access, but never underestimate the effectiveness of obtaining more (relevant) data.\n5. Measuring the wrong thing Source: Measuring what matters: How to pick a good metric Maybe you recently read an article emphasising the importance of real metrics, like daily active users, as opposed to vanity metrics like number of signups to your service. You therefore decide to track the daily active users of your product. But have you thought about whether this metric is relevant to what you’re trying to achieve? 
If you run a business like Airbnb, where transactions are inherently infrequent, do you really care if people don’t regularly log in? You probably don’t, as long as they use the product when they actually need it. Measuring and trying to optimise the wrong thing can be very risky. Indeed, deciding on metrics and their measurement can be seen as the hardest parts of data science.\n6. Not recognising your unconscious incompetence Source: Four stages of competence, Wikipedia To quote Bertrand Russell: “One of the painful things about our time is that those who feel certainty are stupid, and those with any imagination and understanding are filled with doubt and indecision.” Not recognising the extent of your ignorance when it comes to data is pretty common among those with no training in the field, which may lead to illusory superiority. This may be exacerbated by the fact that those who do know what they’re doing tend to talk a lot about uncertainty and how there are many things that are simply unknowable. My hope is that this short article would help people graduate from unconscious incompetence, where you don’t even recognise the importance of what you don’t know, to conscious incompetence, where you recognise the need to learn and rely on expert advice.\n7. Ignoring expert advice Once you’ve recognised your skill gaps, you may decide to hire a data scientist to help you get more value out of your data. However, despite the hype, data scientists are not magicians. In fact, because of the hype, the definition of data science is so diluted that some people say that the term itself has become useless. The truth is that dealing with data is hard, every organisation is somewhat different, and it takes time and commitment to get value out of data. The worst thing you can do is to hire an expensive expert to help you, and then ignore their advice when their findings are hard to digest. 
If you’re not ready to work with a data scientist, you might as well save yourself some money and remain in a state of blissful ignorance.\nNote: This article is not a portrayal of how things are with my current employer, Car Next Door. Views expressed are my own. In fact, if you want to work at a place where expert advice is acted on and uncertainty is seen as something to be studied rather than ignored, we’re hiring!\n","wordCount":"1091","inLanguage":"en","image":"https://yanirseroussi.com/data-driven-off-cliff.jpg","datePublished":"2016-08-21T21:34:17Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" 
r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">If you don’t pay attention, data can drive you off a cliff</h1><div class=post-meta><span title='2016-08-21 21:34:17 +0000 UTC'>August 21, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-08-21-seven-ways-to-be-data-driven-off-a-cliff/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg alt width=1920 height=657></figure><div class=post-content><p>You&rsquo;re a hotshot manager. You love your dashboards and you keep your finger on the beating pulse of the business. You take pride in using data to drive your decisions rather than shooting from the hip like one of those old-school 1950s bosses. This is the 21st century, and data is king. You even hired a <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>sexy statistician or data scientist</a>, though you don&rsquo;t really understand what they do. Never mind, you can proudly tell all your friends that you are leading a modern data-driven team. Nothing can go wrong, right? Incorrect. If you don&rsquo;t pay attention, data can drive you off a cliff. This article discusses seven of the ways this can happen. Read on to ensure it doesn&rsquo;t happen to you.</p><h2 id=1-pretending-uncertainty-doesnt-exist>1. Pretending uncertainty doesn&rsquo;t exist<a hidden class=anchor aria-hidden=true href=#1-pretending-uncertainty-doesnt-exist>#</a></h2><figure><a href=standard-deviation-diagram.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="analytics,business,data science,marketing,statistics"><meta name=description content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="If you don’t pay attention, data can drive you off a cliff"><meta property="og:description" content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved 
quantities."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"><meta property="og:image" content="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-08-21T21:34:17+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg"><meta name=twitter:title content="If you don’t pay attention, data can drive you off a cliff"><meta name=twitter:description content="Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"If you don’t pay attention, data can drive you off a cliff","item":"https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"If you don’t pay attention, data can drive you off a cliff","name":"If you don’t pay attention, data can drive you off a cliff","description":"Seven common mistakes to avoid when working with data, such as ignoring uncertainty and confusing observed and unobserved quantities.","keywords":["analytics","business","data science","marketing","statistics"],"articleBody":"You’re a hotshot manager. You love your dashboards and you keep your finger on the beating pulse of the business. 
You take pride in using data to drive your decisions rather than shooting from the hip like one of those old-school 1950s bosses. This is the 21st century, and data is king. You even hired a sexy statistician or data scientist, though you don’t really understand what they do. Never mind, you can proudly tell all your friends that you are leading a modern data-driven team. Nothing can go wrong, right? Incorrect. If you don’t pay attention, data can drive you off a cliff. This article discusses seven of the ways this can happen. Read on to ensure it doesn’t happen to you.\n1. Pretending uncertainty doesn’t exist Source: Standard error, Wikipedia Last month, your favourite metric was 5.2%. This month, it’s 5.5%. Looks like things are getting better – you must be doing something right! But is 5.5% really different from 5.2%? All things being equal, you should expect some variability in most of your metrics. The values you see are drawn from a distribution of possible values, which means you can’t be certain what value you’ll be seeing next. Fortunately, with more data you would be able to quantify this uncertainty and know which values are more likely. Don’t fear or ignore uncertainty. Embrace and study it, and you’ll be on the right track.\n2. Confusing observed and unobserved quantities Source: Estimates of Uncertainty around the RBA’s Forecasts Everyone agrees that the future is uncertain. We can generate forecasts with varying degrees of confidence, but we never know for sure what’s going to happen. However, some people tend to ignore uncertainty in forecasts, treating the unobserved future values as comparable to observed present values. For example, marketers often compare customer lifetime value with the cost of acquiring a customer. 
The problem is that customer lifetime value relies on a prediction of the net profit from a customer (so it’s largely unobserved and uncertain), while the business has much more control and certainty around the cost of acquiring a customer (though it’s not completely known). Treating the two values as if they’re observed and known is risky, as it can lead to major financial losses.\n3. Thinking that your data is correct Ask anyone who works with data, and they’ll tell you that it’s always messy. A well-known saying among data scientists is that 80% of the work is data cleaning and the other 20% is complaining about data cleaning. Hence, it’s likely that at least some of the figures you’re relying on to make decisions are somewhat inaccurate. However, it’s important to remember that this doesn’t make the data completely useless. But if something looks too good to be true, it probably isn’t true. Finally, it’s highly unlikely that the data is always correct when you like the results and always incorrect when the results aren’t favourable, so don’t use the “guy on the internet said our data isn’t 100% correct” excuse to push back on inconvenient truths.\n4. Believing that your data is complete No matter how big you are, your data doesn’t capture everything your customers do. Even Google and the NSA don’t have a full view of what people are up to in the non-digital world, and they can’t completely read our minds (yet). Most businesses have much less data than the big tech companies, and they look a bit silly trying to explain customer behaviour using only the data they have. At the end of the day, you have to work with the data you can access, but never underestimate the effectiveness of obtaining more (relevant) data.\n5. 
Measuring the wrong thing Source: Measuring what matters: How to pick a good metric Maybe you recently read an article emphasising the importance of real metrics, like daily active users, as opposed to vanity metrics like number of signups to your service. You therefore decide to track the daily active users of your product. But have you thought about whether this metric is relevant to what you’re trying to achieve? If you run a business like Airbnb, where transactions are inherently infrequent, do you really care if people don’t regularly log in? You probably don’t, as long as they use the product when they actually need it. Measuring and trying to optimise the wrong thing can be very risky. Indeed, deciding on metrics and their measurement can be seen as the hardest parts of data science.\n6. Not recognising your unconscious incompetence Source: Four stages of competence, Wikipedia To quote Bertrand Russell: “One of the painful things about our time is that those who feel certainty are stupid, and those with any imagination and understanding are filled with doubt and indecision.” Not recognising the extent of your ignorance when it comes to data is pretty common among those with no training in the field, which may lead to illusory superiority. This may be exacerbated by the fact that those who do know what they’re doing tend to talk a lot about uncertainty and how there are many things that are simply unknowable. My hope is that this short article would help people graduate from unconscious incompetence, where you don’t even recognise the importance of what you don’t know, to conscious incompetence, where you recognise the need to learn and rely on expert advice.\n7. Ignoring expert advice Once you’ve recognised your skill gaps, you may decide to hire a data scientist to help you get more value out of your data. However, despite the hype, data scientists are not magicians. 
In fact, because of the hype, the definition of data science is so diluted that some people say that the term itself has become useless. The truth is that dealing with data is hard, every organisation is somewhat different, and it takes time and commitment to get value out of data. The worst thing you can do is to hire an expensive expert to help you, and then ignore their advice when their findings are hard to digest. If you’re not ready to work with a data scientist, you might as well save yourself some money and remain in a state of blissful ignorance.\nNote: This article is not a portrayal of how things are with my current employer, Car Next Door. Views expressed are my own. In fact, if you want to work at a place where expert advice is acted on and uncertainty is seen as something to be studied rather than ignored, we’re hiring!\n","wordCount":"1091","inLanguage":"en","image":"https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg","datePublished":"2016-08-21T21:34:17Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt 
+ T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">If you don’t pay attention, data can drive you off a cliff</h1><div class=post-meta><span title='2016-08-21 21:34:17 +0000 UTC'>August 21, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-08-21-seven-ways-to-be-data-driven-off-a-cliff/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff_hu6382e6839197ee91ca183c977de9f45a_133028_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/data-driven-off-cliff.jpg alt width=1920 height=657></figure><div class=post-content><p>You&rsquo;re a hotshot manager. You love your dashboards and you keep your finger on the beating pulse of the business. You take pride in using data to drive your decisions rather than shooting from the hip like one of those old-school 1950s bosses. This is the 21st century, and data is king. You even hired a <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>sexy statistician or data scientist</a>, though you don&rsquo;t really understand what they do. Never mind, you can proudly tell all your friends that you are leading a modern data-driven team. Nothing can go wrong, right? Incorrect. If you don&rsquo;t pay attention, data can drive you off a cliff. This article discusses seven of the ways this can happen. Read on to ensure it doesn&rsquo;t happen to you.</p><h2 id=1-pretending-uncertainty-doesnt-exist>1. 
Pretending uncertainty doesn&rsquo;t exist<a hidden class=anchor aria-hidden=true href=#1-pretending-uncertainty-doesnt-exist>#</a></h2><figure><a href=standard-deviation-diagram.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/standard-deviation-diagram_hu8a11f999223fd4c4976332ce4d982b62_68903_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/standard-deviation-diagram_hu8a11f999223fd4c4976332ce4d982b62_68903_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/standard-deviation-diagram_hu8a11f999223fd4c4976332ce4d982b62_68903_720x0_resize_box_3.png 720w,
diff --git a/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html b/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html
index 3b61dfd08..abe8a893c 100644
--- a/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html
+++ b/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Ask Why! Finding motives, causes, and purpose in data science | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="causal inference,data science,insights,personal"><meta name=description content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Ask Why! 
Finding motives, causes, and purpose in data science"><meta property="og:description" content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"><meta property="og:image" content="https://yanirseroussi.com/why-brick-wall-large.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-09-19T21:28:44+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/why-brick-wall-large.jpg"><meta name=twitter:title content="Ask Why! Finding motives, causes, and purpose in data science"><meta name=twitter:description content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Ask Why! Finding motives, causes, and purpose in data science","item":"https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Ask Why! Finding motives, causes, and purpose in data science","name":"Ask Why! 
Finding motives, causes, and purpose in data science","description":"Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what \u0026amp; how of predictive modelling.","keywords":["causal inference","data science","insights","personal"],"articleBody":"Some people equate predictive modelling with data science, thinking that mastering various machine learning techniques is the key that unlocks the mysteries of the field. However, there is much more to data science than the What and How of predictive modelling. I recently gave a talk where I argued the importance of asking Why, touching on three different topics: stakeholder motives, cause-and-effect relationships, and finding a sense of purpose. A video of the talk is available below. Unfortunately, the videographer mostly focused on me pacing rather than on the screen, but you can check out the slides here (note that you need to use both the left/right and up/down arrows to see all the slides).\nIf you’re interested in the topics covered in the talk, here are a few posts you should read.\nStakeholders and their motives\nIf you don’t pay attention, data can drive you off a cliff The hardest parts of data science You don’t need a data scientist (yet) Causality and experimentation\nMaking Bayesian A/B testing more accessible Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions Why you should stop worrying about deep learning and deepen your understanding of causality instead Purpose, ethics, and my personal path\nShould data science really do that? 
(on KDNuggets) The long road to a lifestyle business My divestment from fossil fuels The rise of greedy robots Cover image: Why by Ksayer\n","wordCount":"232","inLanguage":"en","image":"https://yanirseroussi.com/why-brick-wall-large.jpg","datePublished":"2016-09-19T21:28:44Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" 
y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Ask Why! Finding motives, causes, and purpose in data science</h1><div class=post-meta><span title='2016-09-19 21:28:44 +0000 UTC'>September 19, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-09-19-ask-why-finding-motives-causes-and-purpose-in-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg alt width=1024 height=685></figure><div class=post-content><p>Some people equate predictive modelling with data science, thinking that mastering various machine learning 
techniques is the key that unlocks the mysteries of the field. However, there is much more to data science than the <em>What</em> and <em>How</em> of predictive modelling. I recently gave a talk where I argued the importance of asking <em>Why</em>, touching on three different topics: stakeholder motives, cause-and-effect relationships, and finding a sense of purpose. <a href="http://www.youtube.com/watch?v=2wqu-drqlpo" target=_blank rel=noopener>A video of the talk</a> is available below. Unfortunately, the videographer mostly focused on me pacing rather than on the screen, but you can <a href=https://yanirs.github.io/talks/ask-why/ target=_blank rel=noopener>check out the slides here</a> (note that you need to use both the left/right and up/down arrows to see all the slides).</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/2wqu-drqlpo style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>If you&rsquo;re interested in the topics covered in the talk, here are a few posts you should read.</p><p><strong>Stakeholders and their motives</strong></p><ul><li><a href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/>If you don&rsquo;t pay attention, data can drive you off a cliff</a></li><li><a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>The hardest parts of data science</a></li><li><a href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/>You don&rsquo;t need a data scientist (yet)</a></li></ul><p><strong>Causality and experimentation</strong></p><ul><li><a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>Making Bayesian A/B testing more accessible</a></li><li><a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>Diving deeper into 
causality: Pearl, Kleinberg, Hill, and untested assumptions</a></li><li><a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>Why you should stop worrying about deep learning and deepen your understanding of causality instead</a></li></ul><p><strong>Purpose, ethics, and my personal path</strong></p><ul><li><a href=http://www.kdnuggets.com/2015/05/should-data-science-do-that.html target=_blank rel=noopener>Should data science really do that? (on KDNuggets)</a></li><li><a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>The long road to a lifestyle business</a></li><li><a href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/>My divestment from fossil fuels</a></li><li><a href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/>The rise of greedy robots</a></li></ul><p><small>Cover image: <a href=https://flic.kr/p/9yaos5 target=_blank rel=noopener>Why by Ksayer</a></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/insights/>insights</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on x" href="https://x.com/intent/tweet/?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&amp;hashtags=causalinference%2cdatascience%2cinsights%2cpersonal"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&amp;title=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;summary=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;source=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 
38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&title=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 
13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on whatsapp" href="https://api.whatsapp.com/send?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science%20-%20https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 
2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on telegram" href="https://telegram.me/share/url?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on ycombinator" href="https://news.ycombinator.com/submitlink?t=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&u=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="causal inference,data science,insights,personal"><meta name=description content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Ask Why! 
Finding motives, causes, and purpose in data science"><meta property="og:description" content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"><meta property="og:image" content="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2016-09-19T21:28:44+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg"><meta name=twitter:title content="Ask Why! Finding motives, causes, and purpose in data science"><meta name=twitter:description content="Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what & how of predictive modelling."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Ask Why! Finding motives, causes, and purpose in data science","item":"https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Ask Why! Finding motives, causes, and purpose in data science","name":"Ask Why! 
Finding motives, causes, and purpose in data science","description":"Video and summary of a talk I gave at the Data Science Sydney meetup, about going beyond the what \u0026amp; how of predictive modelling.","keywords":["causal inference","data science","insights","personal"],"articleBody":"Some people equate predictive modelling with data science, thinking that mastering various machine learning techniques is the key that unlocks the mysteries of the field. However, there is much more to data science than the What and How of predictive modelling. I recently gave a talk where I argued the importance of asking Why, touching on three different topics: stakeholder motives, cause-and-effect relationships, and finding a sense of purpose. A video of the talk is available below. Unfortunately, the videographer mostly focused on me pacing rather than on the screen, but you can check out the slides here (note that you need to use both the left/right and up/down arrows to see all the slides).\nIf you’re interested in the topics covered in the talk, here are a few posts you should read.\nStakeholders and their motives\nIf you don’t pay attention, data can drive you off a cliff The hardest parts of data science You don’t need a data scientist (yet) Causality and experimentation\nMaking Bayesian A/B testing more accessible Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions Why you should stop worrying about deep learning and deepen your understanding of causality instead Purpose, ethics, and my personal path\nShould data science really do that? 
(on KDNuggets) The long road to a lifestyle business My divestment from fossil fuels The rise of greedy robots Cover image: Why by Ksayer\n","wordCount":"232","inLanguage":"en","image":"https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg","datePublished":"2016-09-19T21:28:44Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" 
x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Ask Why! Finding motives, causes, and purpose in data science</h1><div class=post-meta><span title='2016-09-19 21:28:44 +0000 UTC'>September 19, 2016</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2016-09-19-ask-why-finding-motives-causes-and-purpose-in-data-science/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large_hu3d03a01dcc18bc5be0e67db3d8d209a6_279085_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/why-brick-wall-large.jpg alt width=1024 height=685></figure><div class=post-content><p>Some people equate predictive modelling 
with data science, thinking that mastering various machine learning techniques is the key that unlocks the mysteries of the field. However, there is much more to data science than the <em>What</em> and <em>How</em> of predictive modelling. I recently gave a talk where I argued the importance of asking <em>Why</em>, touching on three different topics: stakeholder motives, cause-and-effect relationships, and finding a sense of purpose. <a href="http://www.youtube.com/watch?v=2wqu-drqlpo" target=_blank rel=noopener>A video of the talk</a> is available below. Unfortunately, the videographer mostly focused on me pacing rather than on the screen, but you can <a href=https://yanirs.github.io/talks/ask-why/ target=_blank rel=noopener>check out the slides here</a> (note that you need to use both the left/right and up/down arrows to see all the slides).</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/2wqu-drqlpo style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>If you&rsquo;re interested in the topics covered in the talk, here are a few posts you should read.</p><p><strong>Stakeholders and their motives</strong></p><ul><li><a href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/>If you don&rsquo;t pay attention, data can drive you off a cliff</a></li><li><a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>The hardest parts of data science</a></li><li><a href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/>You don&rsquo;t need a data scientist (yet)</a></li></ul><p><strong>Causality and experimentation</strong></p><ul><li><a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>Making Bayesian A/B testing more accessible</a></li><li><a 
href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions</a></li><li><a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>Why you should stop worrying about deep learning and deepen your understanding of causality instead</a></li></ul><p><strong>Purpose, ethics, and my personal path</strong></p><ul><li><a href=http://www.kdnuggets.com/2015/05/should-data-science-do-that.html target=_blank rel=noopener>Should data science really do that? (on KDNuggets)</a></li><li><a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>The long road to a lifestyle business</a></li><li><a href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/>My divestment from fossil fuels</a></li><li><a href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/>The rise of greedy robots</a></li></ul><p><small>Cover image: <a href=https://flic.kr/p/9yaos5 target=_blank rel=noopener>Why by Ksayer</a></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/insights/>insights</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on x" href="https://x.com/intent/tweet/?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&amp;hashtags=causalinference%2cdatascience%2cinsights%2cpersonal"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&amp;title=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;summary=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;source=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 
38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f&title=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 
13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on whatsapp" href="https://api.whatsapp.com/send?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science%20-%20https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 
2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! Finding motives, causes, and purpose in data science on telegram" href="https://telegram.me/share/url?text=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&amp;url=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Ask Why! 
Finding motives, causes, and purpose in data science on ycombinator" href="https://news.ycombinator.com/submitlink?t=Ask%20Why%21%20Finding%20motives%2c%20causes%2c%20and%20purpose%20in%20data%20science&u=https%3a%2f%2fyanirseroussi.com%2f2016%2f09%2f19%2fask-why-finding-motives-causes-and-purpose-in-data-science%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html b/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html
index 1a8b00519..2a6c906ac 100644
--- a/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html
+++ b/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Customer lifetime value and the proliferation of misinformation on the internet | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,business,data science,marketing,politics,predictive modelling,science communication,search engine optimisation,statistics"><meta name=description content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here&rsquo;s what I learned about doing it well."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Customer lifetime value and the proliferation of misinformation on the internet"><meta property="og:description" 
content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here&rsquo;s what I learned about doing it well."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"><meta property="og:image" content="https://yanirseroussi.com/propaganda-graffiti.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-01-08T20:02:30+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/propaganda-graffiti.jpg"><meta name=twitter:title content="Customer lifetime value and the proliferation of misinformation on the internet"><meta name=twitter:description content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here&rsquo;s what I learned about doing it well."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Customer lifetime value and the proliferation of misinformation on the internet","item":"https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Customer lifetime value and the proliferation of misinformation on the internet","name":"Customer lifetime value and the proliferation of misinformation on the internet","description":"There\u0026rsquo;s a lot of misleading content on the estimation of customer lifetime value. 
Here\u0026rsquo;s what I learned about doing it well.","keywords":["analytics","business","data science","marketing","politics","predictive modelling","science communication","search engine optimisation","statistics"],"articleBody":"Suppose you work for a business that has paying customers. You want to know how much money your customers are likely to spend to inform decisions on customer acquisition and retention budgets. You’ve done a bit of research, and discovered that the figure you want to calculate is commonly called the customer lifetime value. You google the term, and end up on a page with ten results (and probably some ads). How many of those results contain useful, non-misleading information? As of early 2017, fewer than half. Why is that? How can it be that after nearly 20 years of existence, Google still surfaces misleading information for common search terms? And how can you calculate your customer lifetime value correctly, avoiding the traps set up by clever search engine marketers? Read on to find out!\nBackground: Misleading search results and fake news While Google tries to filter obvious spam from its index, it still relies to a great extent on popularity to rank search results. Popularity is a function of inbound links (weighted by site credibility), and of user interaction with the presented results (e.g., time spent on a result page before moving on to the next result or search). There are two obvious problems with this approach. First, there are no guarantees that wrong, misleading, or inaccurate pages won’t be popular, and therefore earn high rankings. Second, given Google’s near-monopoly of the search market, if a page ranks highly for popular search terms, it is likely to become more popular and be seen as credible. Hence, when searching for the truth, it’d be wise to follow Abraham Lincoln’s famous warning not to trust everything you read on the internet.\nGoogle is not alone in helping spread misinformation. 
Following Donald Trump’s recent victory in the US presidential election, many people have blamed Facebook for allowing so-called fake news to be widely shared. Indeed, any popular media outlet or website may end up spreading misinformation, especially if – like Facebook and Google – it mainly aggregates and amplifies user-generated content. However, as noted by John Herrman, the problem is much deeper than clearly-fabricated news stories. It is hard to draw the lines between malicious spread of misinformation, slight inaccuracies, and plain ignorance. For example, how would one classify Trump’s claims that climate change is a hoax invented by the Chinese? Should Twitter block his account for knowingly spreading outright lies?\nWrong customer value calculation by example Fortunately, when it comes to customer lifetime value, I doubt that any of the top results returned by Google is intentionally misleading. This is a case where inaccuracies and misinformation result from ignorance rather than from malice. However, relying on such resources without digging further is just as risky as relying on pure fabrications. For example, see this infographic by Kissmetrics, which suggests three different formulas for calculating the average lifetime value of a Starbucks customer. Those three formulas yield very different values ($5,489, $11,535, and $25,272), which the authors then say should be averaged to yield the final lifetime value figure. All formulas are based on numbers that the authors call constants, despite the fact that numbers such as the average customer lifespan or retention rate are clearly not constant in this context (since they’re estimated from the data and used as projections into the future). 
Indeed, several people have commented on the flaws in Kissmetrics’ approach, which is reminiscent of the Dilbert strip where the pointy-haired boss asks Dilbert to average and multiply wrong data.\nMy main problem with the Kissmetrics infographic is that it helps feed an illusion of understanding that is prevalent among those with no statistical training. As the authors fail to acknowledge the fact that the predictions produced by the formulas are inaccurate, they may cause managers and marketers to believe that they know the lifetime value of their customers. However, it’s important to remember that all models are wrong (but some models are useful), and that the lifetime value of active customers is unknowable since it involves forecasting of uncertain quantities. Hence, it is reckless to encourage people to use the Kissmetrics formulas without trying to quantify how wrong they may be on the specific dataset they’re applied to.\nFader and Hardie: The voice of reason Notably, the work of Peter Fader and Bruce Hardie on customer lifetime value isn’t directly referenced on the first page of Google results. This is unfortunate, as they have gone through the effort of making their models accessible to people with no academic background, e.g., using Excel spreadsheets and YouTube videos. However, it is clear that they are not optimising for search engine rankings, as I found out about their work by adding search terms that the average marketer is unlikely to use (e.g., Python and Bayesian). While surveying Fader and Hardie’s large body of work is beyond the scope of this article, it is worth summarising their criticism of the lifetime value formula that is taught in introductory marketing courses.\nThe formula discussed by Fader and Hardie is CLV = sumt=0..T(m * rt / (1 + d)t), where m is the net cash flow per period, r is the retention rate, d is the discount rate, and T is the time horizon. 
The five issues that Fader and Hardie identify are as follows.\nThe true lifetime value is unknown while the customer is still active, so the formula is actually for the expected lifetime value, i.e., E(CLV). Since the summation is bounded, the formula isn’t really for the lifetime value – it is an estimate of value up to period T (which may still be useful). As the summation starts at t=0, it gives the expected value of a customer that hasn’t been acquired yet. According to Fader and Hardie, in some cases the formula starts at t=1, i.e., it applies only to existing customers. The distinction between the two cases isn’t always made clear. The formula assumes a constant retention rate. However, it is often the case that retention increases with tenure, i.e., customers who have been with the company for a long time are less likely to churn than recently-acquired customers. It isn’t always possible to calculate a retention rate, as the point at which a customer churns isn’t observed for many products. For example, Starbucks doesn’t know whether customers who haven’t made a purchase for a while have decided to never visit Starbucks again, or whether they’re just going through a period of inactivity. Further, given the ubiquity of Starbucks, it is probably safe to assume that all past customers have a non-zero probability of making another purchase (unless they’re physically dead). According to Fader and Hardie, “the bottom line is that there is no ‘one formula’ that can be used to compute customer lifetime value”. Therefore, teaching the above formula (or one of its variants) misleads people into thinking that they know how to calculate the lifetime value of customers. Hence, they advocate going back to the definition of lifetime value as “the present value of the future cashflows attributed to the customer relationship”, and using a probabilistic approach to generate estimates of the expected lifetime value for each customer. 
This conclusion also appears in a more accessible series of blog posts by Custora, where it is claimed that probabilistic modelling can yield significantly more accurate estimates than naive formulas.\nGetting serious with the lifetimes package As mentioned above, Fader and Hardie provide Excel implementations of some of their models, which produce individual-level lifetime value predictions. While this is definitely an improvement over using general formulas, better solutions are available if you can code (or have access to people who can do coding for you). For example, using a software package makes it easy to integrate the lifetime value calculation into a live product, enabling automated interventions to increase revenue and profit (among other benefits). According to Roberto Medri, this approach is followed by Etsy, where lifetime value predictions are used to retain customers and increase their value.\nAn example of a software package that I can vouch for is the Python lifetimes package, which implements several probabilistic models for lifetime value prediction in a non-contractual setting (i.e., where churn isn’t observed – as in the Starbucks example above). This package is maintained by Cameron Davidson-Pilon of Shopify, who may be known to some readers from his Bayesian Methods for Hackers book and other Python packages. I’ve successfully used the package on a real dataset and have contributed some small fixes and improvements. The documentation on GitHub is quite good, so I won’t repeat it here. However, it is worth reiterating that as with any predictive model, it is important to evaluate performance on your own dataset before deciding to rely on the package’s predictions. If you only take away one thing from this article, let it be the reminder that it is unwise to blindly accept any formula or model. 
The models implemented in the package (some of which were introduced by Fader and Hardie) are fairly simple and generally applicable, as they rely only on the past transaction log. These simple models are known to sometimes outperform more complex models that rely on richer data, but this isn’t guaranteed to happen on every dataset. My untested feeling is that in situations where clean and relevant training data is plentiful, models that use other features in addition to those extracted from the transaction log would outperform the models provided by the lifetimes package (if you have empirical evidence that supports or refutes this assumption, please let me know).\nConclusion: You’re better than that Accurate estimation of customer lifetime value is crucial to most businesses. It informs decisions on customer acquisition and retention, and getting it wrong can drive a business from profitability to insolvency. The rise of data science increases the availability of statistical and scientific tools to small and large businesses. Hence, there are few reasons why a revenue-generating business should rely on untested customer value formulas rather than on more realistic models. 
This extends beyond customer value to nearly every business endeavour: Relying on fabrications is not a sustainable growth strategy, there is no way around learning how to be intelligently driven by data, and no amount of cheap demagoguery and misinformation can alter the objective reality of our world.\n","wordCount":"1716","inLanguage":"en","image":"https://yanirseroussi.com/propaganda-graffiti.jpg","datePublished":"2017-01-08T20:02:30Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Customer lifetime value and the proliferation of misinformation on the internet</h1><div class=post-meta><span title='2017-01-08 20:02:30 +0000 UTC'>January 8, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-01-08-customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg alt width=1920 height=998></figure><div class=post-content><p>Suppose you work for a business that has paying customers. You want to know how much money your customers are likely to spend to inform decisions on customer acquisition and retention budgets. You&rsquo;ve done a bit of research, and discovered that the figure you want to calculate is commonly called the <em>customer lifetime value</em>. You google the term, and end up on a page with ten results (and probably some ads). How many of those results contain useful, non-misleading information? As of early 2017, fewer than half. Why is that? How can it be that after nearly 20 years of existence, Google still surfaces misleading information for common search terms? And how can you calculate your customer lifetime value correctly, avoiding the traps set up by clever search engine marketers? Read on to find out!</p><h2 id=background-misleading-search-results-and-fake-news>Background: Misleading search results and fake news<a hidden class=anchor aria-hidden=true href=#background-misleading-search-results-and-fake-news>#</a></h2><p>While Google tries to filter obvious spam from its index, it still relies to a great extent on popularity to rank search results. 
Popularity is a function of inbound links (weighted by site credibility), and of user interaction with the presented results (e.g., time spent on a result page before moving on to the next result or search). There are two obvious problems with this approach. First, there are no guarantees that wrong, misleading, or inaccurate pages won&rsquo;t be popular, and therefore earn high rankings. Second, given Google&rsquo;s near-monopoly of the search market, if a page ranks highly for popular search terms, it is likely to become more popular and be seen as credible. Hence, when searching for the truth, it&rsquo;d be wise to follow Abraham Lincoln&rsquo;s famous warning not to trust everything you read on the internet.</p><figure><a href=dont-believe-everything-you-read-on-the-internet-lincoln.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 576px,
+<meta name=keywords content="analytics,business,data science,marketing,politics,predictive modelling,science communication,search engine optimisation,statistics"><meta name=description content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here&rsquo;s what I learned about doing it well."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Customer lifetime value and the proliferation of misinformation on the internet"><meta property="og:description" 
content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here&rsquo;s what I learned about doing it well."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"><meta property="og:image" content="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-01-08T20:02:30+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg"><meta name=twitter:title content="Customer lifetime value and the proliferation of misinformation on the internet"><meta name=twitter:description content="There&rsquo;s a lot of misleading content on the estimation of customer lifetime value. 
Here&rsquo;s what I learned about doing it well."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Customer lifetime value and the proliferation of misinformation on the internet","item":"https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Customer lifetime value and the proliferation of misinformation on the internet","name":"Customer lifetime value and the proliferation of misinformation on the internet","description":"There\u0026rsquo;s a lot of misleading content on the estimation of customer lifetime value. Here\u0026rsquo;s what I learned about doing it well.","keywords":["analytics","business","data science","marketing","politics","predictive modelling","science communication","search engine optimisation","statistics"],"articleBody":"Suppose you work for a business that has paying customers. You want to know how much money your customers are likely to spend to inform decisions on customer acquisition and retention budgets. You’ve done a bit of research, and discovered that the figure you want to calculate is commonly called the customer lifetime value. You google the term, and end up on a page with ten results (and probably some ads). How many of those results contain useful, non-misleading information? As of early 2017, fewer than half. Why is that? How can it be that after nearly 20 years of existence, Google still surfaces misleading information for common search terms? And how can you calculate your customer lifetime value correctly, avoiding the traps set up by clever search engine marketers? 
Read on to find out!\nBackground: Misleading search results and fake news While Google tries to filter obvious spam from its index, it still relies to a great extent on popularity to rank search results. Popularity is a function of inbound links (weighted by site credibility), and of user interaction with the presented results (e.g., time spent on a result page before moving on to the next result or search). There are two obvious problems with this approach. First, there are no guarantees that wrong, misleading, or inaccurate pages won’t be popular, and therefore earn high rankings. Second, given Google’s near-monopoly of the search market, if a page ranks highly for popular search terms, it is likely to become more popular and be seen as credible. Hence, when searching for the truth, it’d be wise to follow Abraham Lincoln’s famous warning not to trust everything you read on the internet.\nGoogle is not alone in helping spread misinformation. Following Donald Trump’s recent victory in the US presidential election, many people have blamed Facebook for allowing so-called fake news to be widely shared. Indeed, any popular media outlet or website may end up spreading misinformation, especially if – like Facebook and Google – it mainly aggregates and amplifies user-generated content. However, as noted by John Herrman, the problem is much deeper than clearly-fabricated news stories. It is hard to draw the lines between malicious spread of misinformation, slight inaccuracies, and plain ignorance. For example, how would one classify Trump’s claims that climate change is a hoax invented by the Chinese? Should Twitter block his account for knowingly spreading outright lies?\nWrong customer value calculation by example Fortunately, when it comes to customer lifetime value, I doubt that any of the top results returned by Google is intentionally misleading. This is a case where inaccuracies and misinformation result from ignorance rather than from malice. 
However, relying on such resources without digging further is just as risky as relying on pure fabrications. For example, see this infographic by Kissmetrics, which suggests three different formulas for calculating the average lifetime value of a Starbucks customer. Those three formulas yield very different values ($5,489, $11,535, and $25,272), which the authors then say should be averaged to yield the final lifetime value figure. All formulas are based on numbers that the authors call constants, despite the fact that numbers such as the average customer lifespan or retention rate are clearly not constant in this context (since they’re estimated from the data and used as projections into the future). Indeed, several people have commented on the flaws in Kissmetrics’ approach, which is reminiscent of the Dilbert strip where the pointy-haired boss asks Dilbert to average and multiply wrong data.\nMy main problem with the Kissmetrics infographic is that it helps feed an illusion of understanding that is prevalent among those with no statistical training. As the authors fail to acknowledge the fact that the predictions produced by the formulas are inaccurate, they may cause managers and marketers to believe that they know the lifetime value of their customers. However, it’s important to remember that all models are wrong (but some models are useful), and that the lifetime value of active customers is unknowable since it involves forecasting of uncertain quantities. Hence, it is reckless to encourage people to use the Kissmetrics formulas without trying to quantify how wrong they may be on the specific dataset they’re applied to.\nFader and Hardie: The voice of reason Notably, the work of Peter Fader and Bruce Hardie on customer lifetime value isn’t directly referenced on the first page of Google results. 
This is unfortunate, as they have gone through the effort of making their models accessible to people with no academic background, e.g., using Excel spreadsheets and YouTube videos. However, it is clear that they are not optimising for search engine rankings, as I found out about their work by adding search terms that the average marketer is unlikely to use (e.g., Python and Bayesian). While surveying Fader and Hardie’s large body of work is beyond the scope of this article, it is worth summarising their criticism of the lifetime value formula that is taught in introductory marketing courses.\nThe formula discussed by Fader and Hardie is CLV = sum_{t=0..T}(m * r^t / (1 + d)^t), where m is the net cash flow per period, r is the retention rate, d is the discount rate, and T is the time horizon. The five issues that Fader and Hardie identify are as follows.\nThe true lifetime value is unknown while the customer is still active, so the formula is actually for the expected lifetime value, i.e., E(CLV). Since the summation is bounded, the formula isn’t really for the lifetime value – it is an estimate of value up to period T (which may still be useful). As the summation starts at t=0, it gives the expected value of a customer that hasn’t been acquired yet. According to Fader and Hardie, in some cases the formula starts at t=1, i.e., it applies only to existing customers. The distinction between the two cases isn’t always made clear. The formula assumes a constant retention rate. However, it is often the case that retention increases with tenure, i.e., customers who have been with the company for a long time are less likely to churn than recently-acquired customers. It isn’t always possible to calculate a retention rate, as the point at which a customer churns isn’t observed for many products. 
For example, Starbucks doesn’t know whether customers who haven’t made a purchase for a while have decided to never visit Starbucks again, or whether they’re just going through a period of inactivity. Further, given the ubiquity of Starbucks, it is probably safe to assume that all past customers have a non-zero probability of making another purchase (unless they’re physically dead). According to Fader and Hardie, “the bottom line is that there is no ‘one formula’ that can be used to compute customer lifetime value”. Therefore, teaching the above formula (or one of its variants) misleads people into thinking that they know how to calculate the lifetime value of customers. Hence, they advocate going back to the definition of lifetime value as “the present value of the future cashflows attributed to the customer relationship”, and using a probabilistic approach to generate estimates of the expected lifetime value for each customer. This conclusion also appears in a more accessible series of blog posts by Custora, where it is claimed that probabilistic modelling can yield significantly more accurate estimates than naive formulas.\nGetting serious with the lifetimes package As mentioned above, Fader and Hardie provide Excel implementations of some of their models, which produce individual-level lifetime value predictions. While this is definitely an improvement over using general formulas, better solutions are available if you can code (or have access to people who can do coding for you). For example, using a software package makes it easy to integrate the lifetime value calculation into a live product, enabling automated interventions to increase revenue and profit (among other benefits). 
According to Roberto Medri, this approach is followed by Etsy, where lifetime value predictions are used to retain customers and increase their value.\nAn example of a software package that I can vouch for is the Python lifetimes package, which implements several probabilistic models for lifetime value prediction in a non-contractual setting (i.e., where churn isn’t observed – as in the Starbucks example above). This package is maintained by Cameron Davidson-Pilon of Shopify, who may be known to some readers from his Bayesian Methods for Hackers book and other Python packages. I’ve successfully used the package on a real dataset and have contributed some small fixes and improvements. The documentation on GitHub is quite good, so I won’t repeat it here. However, it is worth reiterating that as with any predictive model, it is important to evaluate performance on your own dataset before deciding to rely on the package’s predictions. If you only take away one thing from this article, let it be the reminder that it is unwise to blindly accept any formula or model. The models implemented in the package (some of which were introduced by Fader and Hardie) are fairly simple and generally applicable, as they rely only on the past transaction log. These simple models are known to sometimes outperform more complex models that rely on richer data, but this isn’t guaranteed to happen on every dataset. My untested feeling is that in situations where clean and relevant training data is plentiful, models that use other features in addition to those extracted from the transaction log would outperform the models provided by the lifetimes package (if you have empirical evidence that supports or refutes this assumption, please let me know).\nConclusion: You’re better than that Accurate estimation of customer lifetime value is crucial to most businesses. 
It informs decisions on customer acquisition and retention, and getting it wrong can drive a business from profitability to insolvency. The rise of data science increases the availability of statistical and scientific tools to small and large businesses. Hence, there are few reasons why a revenue-generating business should rely on untested customer value formulas rather than on more realistic models. This extends beyond customer value to nearly every business endeavour: Relying on fabrications is not a sustainable growth strategy, there is no way around learning how to be intelligently driven by data, and no amount of cheap demagoguery and misinformation can alter the objective reality of our world.\n","wordCount":"1716","inLanguage":"en","image":"https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg","datePublished":"2017-01-08T20:02:30Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" 
width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Customer lifetime value and the proliferation of misinformation on the internet</h1><div class=post-meta><span title='2017-01-08 20:02:30 +0000 UTC'>January 8, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-01-08-customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti_hua38e844f1f7746a2c15b16de88399d3e_742392_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/propaganda-graffiti.jpg alt width=1920 height=998></figure><div class=post-content><p>Suppose you work for a business that has paying customers. You want to know how much money your customers are likely to spend to inform decisions on customer acquisition and retention budgets. You&rsquo;ve done a bit of research, and discovered that the figure you want to calculate is commonly called the <em>customer lifetime value</em>. You google the term, and end up on a page with ten results (and probably some ads). How many of those results contain useful, non-misleading information? As of early 2017, fewer than half. Why is that? How can it be that after nearly 20 years of existence, Google still surfaces misleading information for common search terms? 
And how can you calculate your customer lifetime value correctly, avoiding the traps set up by clever search engine marketers? Read on to find out!</p><h2 id=background-misleading-search-results-and-fake-news>Background: Misleading search results and fake news<a hidden class=anchor aria-hidden=true href=#background-misleading-search-results-and-fake-news>#</a></h2><p>While Google tries to filter obvious spam from its index, it still relies to a great extent on popularity to rank search results. Popularity is a function of inbound links (weighted by site credibility), and of user interaction with the presented results (e.g., time spent on a result page before moving on to the next result or search). There are two obvious problems with this approach. First, there are no guarantees that wrong, misleading, or inaccurate pages won&rsquo;t be popular, and therefore earn high rankings. Second, given Google&rsquo;s near-monopoly of the search market, if a page ranks highly for popular search terms, it is likely to become more popular and be seen as credible. Hence, when searching for the truth, it&rsquo;d be wise to follow Abraham Lincoln&rsquo;s famous warning not to trust everything you read on the internet.</p><figure><a href=dont-believe-everything-you-read-on-the-internet-lincoln.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 576px,
 100vw" srcset="https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/_hu443dfcd29851ba5e3a0593bd3d3f46da_33137_dbefe08049f120a85e6b8ffeb571b5f7.jpg 360w,
 https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/_hu443dfcd29851ba5e3a0593bd3d3f46da_33137_f4242f0d698b489a965ede37d3bf9fda.jpg 480w,
 https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/dont-believe-everything-you-read-on-the-internet-lincoln.jpg 576w," src=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/dont-believe-everything-you-read-on-the-internet-lincoln.jpg alt="Abraham Lincoln internet quote" loading=lazy></a></figure><p>Google is not alone in helping spread misinformation. Following Donald Trump&rsquo;s recent victory in the US presidential election, <a href=https://www.facebook.com/zuck/posts/10103269806149061 target=_blank rel=noopener>many people have blamed Facebook</a> for allowing so-called fake news to be widely shared. Indeed, any popular media outlet or website may end up spreading misinformation, especially if – like Facebook and Google – it mainly aggregates and amplifies user-generated content. However, <a href=http://www.nytimes.com/2016/11/19/business/media/exposing-fake-news-eroding-trust-in-real-reporting.html target=_blank rel=noopener>as noted by John Herrman</a>, the problem is much deeper than clearly-fabricated news stories. It is hard to draw the lines between malicious spread of misinformation, slight inaccuracies, and plain ignorance. For example, how would one classify <a href=http://www.politifact.com/truth-o-meter/statements/2016/jun/03/hillary-clinton/yes-donald-trump-did-call-climate-change-chinese-h/ target=_blank rel=noopener>Trump&rsquo;s claims that climate change is a hoax invented by the Chinese</a>? Should Twitter block his account for knowingly spreading outright lies?</p><h2 id=wrong-customer-value-calculation-by-example>Wrong customer value calculation by example<a hidden class=anchor aria-hidden=true href=#wrong-customer-value-calculation-by-example>#</a></h2><p>Fortunately, when it comes to customer lifetime value, I doubt that any of the top results returned by Google is intentionally misleading. 
This is a case where inaccuracies and misinformation result from ignorance rather than from malice. However, relying on such resources without digging further is just as risky as relying on pure fabrications. For example, see <a href=https://blog.kissmetrics.com/how-to-calculate-lifetime-value/ target=_blank rel="nofollow noopener">this infographic by Kissmetrics</a>, which suggests three different formulas for calculating the average lifetime value of a Starbucks customer. Those three formulas yield very different values ($5,489, $11,535, and $25,272), which the authors then say should be averaged to yield the final lifetime value figure. All formulas are based on numbers that the authors call <em>constants</em>, despite the fact that numbers such as the average customer lifespan or retention rate are clearly not constant in this context (since they&rsquo;re estimated from the data and used as projections into the future). Indeed, several people have commented on the flaws in Kissmetrics&rsquo; approach, which is reminiscent of <a href=http://dilbert.com/strip/2008-05-07 target=_blank rel=noopener>the Dilbert strip where the pointy-haired boss asks Dilbert to average and multiply wrong data</a>.</p><figure><a href=dilbert-average-multiply-data.gif target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
diff --git a/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html b/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html
index c6a4e43ba..072951ad9 100644
--- a/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html
+++ b/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Exploring and visualising Reef Life Survey data | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,environment,JavaScript,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Exploring and visualising Reef Life Survey data"><meta property="og:description" content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual 
census work."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"><meta property="og:image" content="https://yanirseroussi.com/reef-life-survey-frequency-explorer-screenshot.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-06-03T00:49:05+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/reef-life-survey-frequency-explorer-screenshot.png"><meta name=twitter:title content="Exploring and visualising Reef Life Survey data"><meta name=twitter:description content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Exploring and visualising Reef Life Survey data","item":"https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Exploring and visualising Reef Life Survey data","name":"Exploring and visualising Reef Life Survey data","description":"Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work.","keywords":["data science","environment","JavaScript","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"Last year, I wrote about the Reef Life Survey (RLS) project and my experience with offline data collection on the Great Barrier Reef. 
I found that using auto-generated flashcards with an increasing level of difficulty is a good way to memorise marine species. Since publishing that post, I have improved the flashcards and built a tool for exploring the aggregate survey data. Both tools are now publicly available on the RLS website. This post describes the tools and their implementation, and outlines possible directions for future work.\nThe tools Each tool is fairly simple and focused on helping users achieve a small set of tasks. The best way to get familiar with the tools is to play with them by following the links below. If you’re only interested in using the tools, you can stop reading after this section. The rest of this post describes the data behind the tools, and some technical implementation details.\nThe Frequency Explorer tool lets users select RLS sites and view the species that have been recorded there (RLS website | full-screen version). The Flashcards tool helps users memorise the names of marine species by showing random images of species from a chosen area (RLS website | full-screen version). The data The RLS database includes data collected by volunteer scuba divers on the diversity and abundance of marine life in sites around the world. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef’s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are analysed later to classify the type of substrate or growth (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record all the fish species and large swimming animals found in a 5 metre corridor from the line. The second swim (method 2) targets invertebrates and cryptic animals, and requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line. 
The RLS manual includes all the details on how surveys are performed. The data collected in the surveys is available for download from a Data Portal hosted by the Institute for Marine and Antarctic Studies at the University of Tasmania. As of early June 2017, the downloadable dataset consists of over half a million data points from almost ten thousand surveys.\nWhen I first started studying marine species, I had to find a source for photos. Initially, I used Scrapy to build simple scrapers that downloaded photos from sites such as The Australian Museum, Fishbase, and Fishes of Australia. Last year, RLS made a large number of high-quality photos taken by volunteers available on their site (via the Species Search function). In addition to their high quality, an advantage of the RLS photos over images from other sources is that they were all taken in situ, i.e., in each animal’s natural habitat. On the other hand, other sites also include photos of dissections and hand-drawn illustrations, which aren’t as useful for divers who want to see marine animals as they appear in the wild. Working exclusively with the RLS image dataset has significantly improved the appearance and usefulness of the tools I built.\nThe raw RLS survey data comes in the form of over 100MB of CSV files. For the purpose of building the tools, I summarised the data into two JSON files with an overall size of less than 3MB (less than 1MB when compressed). This made it possible to implement both tools as single-page apps that don’t require any requests to the server after the initial fetching of the data. The two summary JSONs are:\nspecies.json – a mapping from species ID to an array of five elements: scientific name, common name, species page URL, survey method (0: method 1, 1: method 2, or 2: both), and images (array of URLs). 
site-surveys.json – a mapping from site code to an array of seven elements: realm, ecoregion, site name, longitude, latitude, number of surveys, and species counts (mapping from each observed species ID to the number of surveys on which it was seen). Both files use mappings to arrays rather than nested objects to reduce the download size. I originally created the files myself by downloading the CSVs from the data portal and scraping the RLS website for images and common names. Static versions of those files from early June 2017 can be found on GitHub (species.json and site-surveys.json). As part of the integration with the RLS website, the RLS developers will implement live versions of the files, which will get updated automatically. I’ll add the links to the live versions when they become available. Please let me or the RLS team know if you find any issues with the data.\nThe approach I chose to produce the species counts in site-surveys.json doesn’t take abundance into account, i.e., each species is counted once per survey regardless of the number of times it was seen on the survey. Ignoring abundance means that for sites with few surveys, the species count may not be a good indicator of future likelihood of occurrence. For example, some fish are solitary and seen rarely, while others occur in schools and are likely to be seen on every survey. However, this is less of an issue for sites with many surveys. In addition, this simple counting approach is easier to explain than some approaches that do account for abundance.\nImplementation details The source code for the tools can be found in my GitHub Pages repository. Each tool is a simple single-page application, consisting of three files: index.jade, main.coffee, and style.less. In addition, the root source directory contains some common code in common.less and util.coffee, as well as configuration files for npm and Grunt. 
Grunt is used to compile the source files from Jade/Pug, CoffeeScript, and Less to HTML, JS, and CSS respectively. These files are then served statically by GitHub Pages.\nThe common CoffeeScript code loads the JSONs asynchronously, and processes them into nested mappings that are easier to work with than arrays. In addition, the common code contains a method to summarise counts from multiple sites, by aggregating them as simple sums. This means that sites that are surveyed more frequently get weighted more heavily. For example, if a certain fish X was seen once in site A, twice in site B, and never in site C, its count across A, B, and C is 1 + 2 + 0 = 3, but if A was surveyed once, B was surveyed twice, and C was surveyed seven times, X’s aggregate frequency is 3 / (1 + 2 + 7) = 30%. In the future, it may be worth normalising each site’s species counts by the number of times the site was surveyed (making X’s aggregate frequency (1 / 1 + 2 / 2 + 0 / 7) / 3 = 66.67%), but then rare species in rarely-surveyed sites may be overweighted.\nThe Frequency Explorer tool uses the Google Maps API to show a map with all the past survey sites. Users can select sites by drawing an area on the map, or by searching for site names in a Select2 box. The tool fails gracefully when Google Maps isn’t available, which makes it possible to run it offline (assuming you have local copies of the species images). This was very useful on my last trip to the Coral Sea, where I was away from mobile reception for weeks. When sites are selected, the code generates a summary table of the species frequencies, which can be exported to a dynamically-generated CSV. In addition, users can choose to display images of all the species in the table. As this can trigger the download of thousands of images, I used vanilla-lazyload to only load images when they enter the viewport. 
Finally, Frequency Explorer can also be used as a site selector for the Flashcards tool, as it contains a link to launch Flashcards with the set of selected sites (which is passed in the Flashcards query string).\nThe Flashcards tool relies on the excellent reveal.js library to dynamically generate a presentation with a random subset of images of species that were recorded at the selected sites. The presentation consists of pairs of image and name slides – each image slide is followed by a slide where the name of the previously-shown animal is revealed. As I found that trying to memorise all the species at once is too hard, I added the ability to adjust the difficulty level of the flashcards by setting a frequency threshold (e.g., show only species that were recorded on 25% of surveys), or by focusing on observations from a single survey method (e.g., method 2 surveys in the tropics tend to be much less diverse than method 1 surveys). To avoid reloading the entire page when the settings change, the slides are regenerated dynamically. Reveal isn’t really built to account for dynamic regeneration of slides, so I had to add a call to Reveal.toggleOverview(false) to get the cards to refresh correctly, but other than that it worked perfectly.\nFuture work There are several possible extensions to the work done so far.\nFirst, the integration of the tools into the RLS website is incomplete. They are still served in iframes from my GitHub Pages account, and the JSON data isn’t updated automatically. Completing the integration is dependent on the RLS developers, who also have other priorities. Other RLS-dependent items include better optimisation of images (they’re currently scaled down on the client side), and general performance improvements to the site.\nSecond, the tools themselves could be improved. 
For example, reliance on third-party libraries should be reduced (e.g., Frequency Explorer uses Bootstrap due to my limited design skills), and it’d be nice if site selections were stored and read from the URL of Frequency Explorer (this is already done for Flashcards). In addition, as the tools are used to train new RLS divers, it’d be useful to extend the Flashcards tool to run in test mode, where users would type in the names of the animals rather than just passively scroll through the presentation. This would make it possible to assess diver readiness to perform surveys based on their test scores.\nFinally, many other interesting things can be done with the RLS data (in addition to producing scientific papers and reports, which is the main focus of the researchers behind the project). Examples include using the images to automate species identification (as discussed more thoroughly in my previous post on the topic), and building models to predict survey output and detect anomalies (e.g., due to climate change or other unusual factors). 
If you have other ideas, or end up playing with the data and coming with interesting results, please share your findings in the comments section.\n","wordCount":"1825","inLanguage":"en","image":"https://yanirseroussi.com/reef-life-survey-frequency-explorer-screenshot.png","datePublished":"2017-06-03T00:49:05Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" 
y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Exploring and visualising Reef Life Survey data</h1><div class=post-meta><span title='2017-06-03 00:49:05 +0000 UTC'>June 3, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-06-03-exploring-and-visualising-reef-life-survey-data/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_1500x0_resize_box_3.png 1500w 
,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png 3035w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png alt width=3035 height=1442></figure><div class=post-content><p>Last year, I wrote about the <a href=http://reeflifesurvey.com target=_blank rel=noopener>Reef Life Survey</a> (RLS) project and <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>my experience with offline data collection on the Great Barrier Reef</a>. I found that using auto-generated flashcards with an increasing level of difficulty is a good way to memorise marine species. Since publishing that post, I have improved the flashcards and built a tool for exploring the aggregate survey data. Both tools are now publicly available on the RLS website. This post describes the tools and their implementation, and outlines possible directions for future work.</p><h2 id=the-tools>The tools<a hidden class=anchor aria-hidden=true href=#the-tools>#</a></h2><p>Each tool is fairly simple and focused on helping users achieve a small set of tasks. The best way to get familiar with the tools is to play with them by following the links below. If you&rsquo;re only interested in using the tools, you can stop reading after this section. The rest of this post describes the data behind the tools, and some technical implementation details.</p><figure><a href=reef-life-survey-frequency-explorer-screenshot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="data science,environment,JavaScript,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Exploring and visualising Reef Life Survey data"><meta property="og:description" content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual 
census work."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"><meta property="og:image" content="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-06-03T00:49:05+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png"><meta name=twitter:title content="Exploring and visualising Reef Life Survey data"><meta name=twitter:description content="Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Exploring and visualising Reef Life Survey data","item":"https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Exploring and visualising Reef Life Survey data","name":"Exploring and visualising Reef Life Survey data","description":"Web tools I built to visualise Reef Life Survey data and assist citizen scientists in underwater visual census work.","keywords":["data science","environment","JavaScript","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"Last year, I wrote about the Reef Life Survey (RLS) project and my experience with offline data 
collection on the Great Barrier Reef. I found that using auto-generated flashcards with an increasing level of difficulty is a good way to memorise marine species. Since publishing that post, I have improved the flashcards and built a tool for exploring the aggregate survey data. Both tools are now publicly available on the RLS website. This post describes the tools and their implementation, and outlines possible directions for future work.\nThe tools Each tool is fairly simple and focused on helping users achieve a small set of tasks. The best way to get familiar with the tools is to play with them by following the links below. If you’re only interested in using the tools, you can stop reading after this section. The rest of this post describes the data behind the tools, and some technical implementation details.\nThe Frequency Explorer tool lets users select RLS sites and view the species that have been recorded there (RLS website | full-screen version). The Flashcards tool helps users memorise the names of marine species by showing random images of species from a chosen area (RLS website | full-screen version). The data The RLS database includes data collected by volunteer scuba divers on the diversity and abundance of marine life in sites around the world. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef’s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are analysed later to classify the type of substrate or growth (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record all the fish species and large swimming animals found in a 5 metre corridor from the line. The second swim (method 2) targets invertebrates and cryptic animals, and requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line. 
The RLS manual includes all the details on how surveys are performed. The data collected in the surveys is available for download from a Data Portal hosted by the Institute for Marine and Antarctic Studies at the University of Tasmania. As of early June 2017, the downloadable dataset consists of over half a million data points from almost ten thousand surveys.\nWhen I first started studying marine species, I had to find a source for photos. Initially, I used Scrapy to build simple scrapers that downloaded photos from sites such as The Australian Museum, Fishbase, and Fishes of Australia. Last year, RLS made a large number of high-quality photos taken by volunteers available on their site (via the Species Search function). In addition to their high quality, an advantage of the RLS photos over images from other sources is that they were all taken in situ, i.e., in each animal’s natural habitat. On the other hand, other sites also include photos of dissections and hand-drawn illustrations, which aren’t as useful for divers who want to see marine animals as they appear in the wild. Working exclusively with the RLS image dataset has significantly improved the appearance and usefulness of the tools I built.\nThe raw RLS survey data comes in the form of over 100MB of CSV files. For the purpose of building the tools, I summarised the data into two JSON files with an overall size of less than 3MB (less than 1MB when compressed). This made it possible to implement both tools as single-page apps that don’t require any requests to the server after the initial fetching of the data. The two summary JSONs are:\nspecies.json – a mapping from species ID to an array of five elements: scientific name, common name, species page URL, survey method (0: method 1, 1: method 2, or 2: both), and images (array of URLs). 
site-surveys.json – a mapping from site code to an array of seven elements: realm, ecoregion, site name, longitude, latitude, number of surveys, and species counts (mapping from each observed species ID to the number of surveys on which it was seen). Both files use mappings to arrays rather than nested objects to reduce the download size. I originally created the files myself by downloading the CSVs from the data portal and scraping the RLS website for images and common names. Static versions of those files from early June 2017 can be found on GitHub (species.json and site-surveys.json). As part of the integration with the RLS website, the RLS developers will implement live versions of the files, which will get updated automatically. I’ll add the links to the live versions when they become available. Please let me or the RLS team know if you find any issues with the data.\nThe approach I chose to produce the species counts in site-surveys.json doesn’t take abundance into account, i.e., each species is counted once per survey regardless of the number of times it was seen on the survey. Ignoring abundance means that for sites with few surveys, the species count may not be a good indicator of future likelihood of occurrence. For example, some fish are solitary and seen rarely, while others occur in schools and are likely to be seen on every survey. However, this is less of an issue for sites with many surveys. In addition, this simple counting approach is easier to explain than some approaches that do account for abundance.\nImplementation details The source code for the tools can be found in my GitHub Pages repository. Each tool is a simple single-page application, consisting of three files: index.jade, main.coffee, and style.less. In addition, the root source directory contains some common code in common.less and util.coffee, as well as configuration files for npm and Grunt. 
Grunt is used to compile the source files from Jade/Pug, CoffeeScript, and Less to HTML, JS, and CSS respectively. These files are then served statically by GitHub Pages.\nThe common CoffeeScript code loads the JSONs asynchronously, and processes them into nested mappings that are easier to work with than arrays. In addition, the common code contains a method to summarise counts from multiple sites, by aggregating them as simple sums. This means that sites that are surveyed more frequently get weighted more heavily. For example, if a certain fish X was seen once in site A, twice in site B, and never in site C, its count across A, B, and C is 1 + 2 + 0 = 3, but if A was surveyed once, B was surveyed twice, and C was surveyed seven times, X’s aggregate frequency is 3 / (1 + 2 + 7) = 30%. In the future, it may be worth normalising each site’s species counts by the number of times the site was surveyed (making X’s aggregate frequency (1 / 1 + 2 / 2 + 0 / 7) / 3 = 66.67%), but then rare species in rarely-surveyed sites may be overweighted.\nThe Frequency Explorer tool uses the Google Maps API to show a map with all the past survey sites. Users can select sites by drawing an area on the map, or by searching for site names in a Select2 box. The tool fails gracefully when Google Maps isn’t available, which makes it possible to run it offline (assuming you have local copies of the species images). This was very useful on my last trip to the Coral Sea, where I was away from mobile reception for weeks. When sites are selected, the code generates a summary table of the species frequencies, which can be exported to a dynamically-generated CSV. In addition, users can choose to display images of all the species in the table. As this can trigger the download of thousands of images, I used vanilla-lazyload to only load images when they enter the viewport. 
Finally, Frequency Explorer can also be used as a site selector for the Flashcards tool, as it contains a link to launch Flashcards with the set of selected sites (which is passed in the Flashcards query string).\nThe Flashcards tool relies on the excellent reveal.js library to dynamically generate a presentation with a random subset of images of species that were recorded at the selected sites. The presentation consists of pairs of image and name slides – each image slide is followed by a slide where the name of the previously-shown animal is revealed. As I found that trying to memorise all the species at once is too hard, I added the ability to adjust the difficulty level of the flashcards by setting a frequency threshold (e.g., show only species that were recorded on 25% of surveys), or by focusing on observations from a single survey method (e.g., method 2 surveys in the tropics tend to be much less diverse than method 1 surveys). To avoid reloading the entire page when the settings change, the slides are regenerated dynamically. Reveal isn’t really built to account for dynamic regeneration of slides, so I had to add a call to Reveal.toggleOverview(false) to get the cards to refresh correctly, but other than that it worked perfectly.\nFuture work There are several possible extensions to the work done so far.\nFirst, the integration of the tools into the RLS website is incomplete. They are still served in iframes from my GitHub Pages account, and the JSON data isn’t updated automatically. Completing the integration is dependent on the RLS developers, who also have other priorities. Other RLS-dependent items include better optimisation of images (they’re currently scaled down on the client side), and general performance improvements to the site.\nSecond, the tools themselves could be improved. 
For example, reliance on third-party libraries should be reduced (e.g., Frequency Explorer uses Bootstrap due to my limited design skills), and it’d be nice if site selections were stored and read from the URL of Frequency Explorer (this is already done for Flashcards). In addition, as the tools are used to train new RLS divers, it’d be useful to extend the Flashcards tool to run in test mode, where users would type in the names of the animals rather than just passively scroll through the presentation. This would make it possible to assess diver readiness to perform surveys based on their test scores.\nFinally, many other interesting things can be done with the RLS data (in addition to producing scientific papers and reports, which is the main focus of the researchers behind the project). Examples include using the images to automate species identification (as discussed more thoroughly in my previous post on the topic), and building models to predict survey output and detect anomalies (e.g., due to climate change or other unusual factors). 
If you have other ideas, or end up playing with the data and coming with interesting results, please share your findings in the comments section.\n","wordCount":"1825","inLanguage":"en","image":"https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png","datePublished":"2017-06-03T00:49:05Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line 
x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Exploring and visualising Reef Life Survey data</h1><div class=post-meta><span title='2017-06-03 00:49:05 +0000 UTC'>June 3, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-06-03-exploring-and-visualising-reef-life-survey-data/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_1080x0_resize_box_3.png 1080w 
,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png 3035w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot.png alt width=3035 height=1442></figure><div class=post-content><p>Last year, I wrote about the <a href=http://reeflifesurvey.com target=_blank rel=noopener>Reef Life Survey</a> (RLS) project and <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>my experience with offline data collection on the Great Barrier Reef</a>. I found that using auto-generated flashcards with an increasing level of difficulty is a good way to memorise marine species. Since publishing that post, I have improved the flashcards and built a tool for exploring the aggregate survey data. Both tools are now publicly available on the RLS website. This post describes the tools and their implementation, and outlines possible directions for future work.</p><h2 id=the-tools>The tools<a hidden class=anchor aria-hidden=true href=#the-tools>#</a></h2><p>Each tool is fairly simple and focused on helping users achieve a small set of tasks. The best way to get familiar with the tools is to play with them by following the links below. If you&rsquo;re only interested in using the tools, you can stop reading after this section. The rest of this post describes the data behind the tools, and some technical implementation details.</p><figure><a href=reef-life-survey-frequency-explorer-screenshot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/reef-life-survey-frequency-explorer-screenshot_hu373457bc2952799d7bbd8496305551d0_1306623_720x0_resize_box_3.png 720w,
diff --git a/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html b/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html
index 7a1acddbf..8289aa928 100644
--- a/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html
+++ b/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>My 10-step path to becoming a remote data scientist with Automattic | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Automattic,career,data science,Elasticsearch,personal,WordPress"><meta name=description content="I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. I got it eventually."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My 10-step path to becoming a remote data scientist with Automattic"><meta property="og:description" content="I wanted a well-paid data science-y remote job with an 
established company that offers a good life balance and makes products I care about. I got it eventually."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"><meta property="og:image" content="https://yanirseroussi.com/long-remote-road.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-07-29T05:39:26+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/long-remote-road.jpg"><meta name=twitter:title content="My 10-step path to becoming a remote data scientist with Automattic"><meta name=twitter:description content="I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. I got it eventually."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My 10-step path to becoming a remote data scientist with Automattic","item":"https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My 10-step path to becoming a remote data scientist with Automattic","name":"My 10-step path to becoming a remote data scientist with Automattic","description":"I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. 
I got it eventually.","keywords":["Automattic","career","data science","Elasticsearch","personal","WordPress"],"articleBody":"About two years ago, I read the book The Year without Pants, which describes the author’s experience leading a team at Automattic (the company behind WordPress.com, among other products). Automattic is a fully-distributed company, which means that all of its employees work remotely (hence pants are optional). While the book discusses some of the challenges of working remotely, the author’s general experience was very positive. A few months after reading the book, I decided to look for a full-time position after a period of independent work. Ideally, I wanted a well-paid data science-y remote job with an established distributed tech company that offers a good life balance and makes products I care about. Automattic seemed to tick all my boxes, so I decided to apply for a job with them. This post describes my application steps, which ultimately led to me becoming a data scientist with Automattic.\nBefore jumping in, it’s worth noting that this post describes my personal experience. If you apply for a job with Automattic, your experience is likely to be different, as the process varies across teams, and evolves over time.\n📧 Step 1: Do background research and apply I decided to apply for a data wrangler position with Automattic in October 2015. While data wrangler may sound less sexy than data scientist, reading the job ad led me to believe that the position may involve interesting data science work. This impression was strengthened by some LinkedIn stalking, which included finding current data wranglers and reading through their profiles and websites. I later found out that all the people on the data division start out as data wranglers, and then they may pick their own title. Some data wranglers do data science work, while others are more focused on data engineering, and there are some projects that require a broad range of skills. 
As the usefulness of the term data scientist is questionable, I’m not too fussed about fancy job titles. It’s more important to do interesting work in a supportive environment.\nApplying for the job was fairly straightforward. I simply followed the instructions from the ad:\nDoes this sound interesting? If yes, please send a short email to jobs @ this domain telling us about yourself and attach a resumé. Let us know what you can contribute to the team. Include the title of the position you’re applying for and your name in the subject. Proofread! Make sure you spell and capitalize WordPress and Automattic correctly. We are lucky to receive hundreds of applications for every position, so try to make your application stand out. If you apply for multiple positions or send multiple emails there will be one reply.\nHaving been on the receiving side of job applications, I find it surprising that many people don’t bother writing a cover letter, addressing the selection criteria in the ad, or even applying for a job they’re qualified to do. Hence, my cover letter was fairly short, comprising of several bullet points that highlight the similarities between the job requirements and my experience. It was nothing fancy, but simple cover letters have worked well for me in the past.\n⏳ Step 2: Wait patiently The initial application was followed by a long wait. From my research, this is the typical scenario. This is unsurprising, as Automattic is a fairly small company with a large footprint, which is both distributed and known as a great place to work (e.g., its Glassdoor rating is 4.9). Therefore, it attracts many applicants from all over the world, which take a while to process. In addition, Matt Mullenweg (Automattic’s CEO) reviews job applications before passing them on to the team leads.\nAs I didn’t know that Matt reviewed job applications, I decided to try to shorten the wait by getting introduced to someone in the data division. 
My first attempt was via a second-degree LinkedIn connection who works for Automattic. He responded quickly when I reached out to him, saying that his experience working with the company is in line with the Glassdoor reviews – it’s the best job he’s had in his 15-year-long career. However, he couldn’t help me with an intro, because there is no simple way around Automattic’s internal processes. Nonetheless, he reassured me that it is worth waiting patiently, as the strict process means that you end up working with great people.\nI wasn’t in a huge rush to find a job, but in December 2015 I decided to accept an offer to become the head of data science at Car Next Door. This was a good decision at the time, as I believe in the company’s original vision of reducing the number of cars on the road through car sharing, and it seemed like there would be many interesting projects for me to work on. The position wasn’t completely remote, but as the company was already spread across several cities, I was able to work from home for a day or two every week. In addition, it was a pleasant commute by bike from my Sydney home to the office, so putting the fully-remote job search on hold didn’t seem like a major sacrifice. As I haven’t heard anything from Automattic at that stage, it seemed unwise to reject a good offer, so I started working full-time with Car Next Door in January 2016.\nI successfully attracted Automattic’s attention with a post I published on the misuse of the word insights by many tech companies, which included an example from WordPress.com. Greg Ichneumon Brown, one of the data wranglers, commented on the post, and invited me to apply to join Automattic and help them address the issues I raised. 
This happened after I accepted the offer from Car Next Door, and hasn’t resulted in any speed up of the process, so I just gave up on Automattic and carried on with my life.\n💬 Step 3: Chat with the data lead I finally heard back from Automattic in February 2016 (four months after my initial application and a month into my employment with Car Next Door). Martin Remy, who leads the data division, emailed me to enquire if I’m still interested in the position. I informed him that I was no longer looking for a job, but we agreed to have an informal chat, as I’ve been waiting for such a long time.\nAs is often the case with Automattic interviews, the chat with Martin was completely text-based. Working with a distributed team means that voice and video calls can be hard to schedule. Hence, Automattic relies heavily on textual channels, and text-based interviews allow the company to test the written communication skills of candidates. The chat revolved around my past work experience, and Martin also took the time to answer my questions about the company and the data division. At the conclusion of the chat, Martin suggested I contact him directly if I was ever interested in continuing the application process. While I was happy with my position at the time, the chat strengthened my positive impression of Automattic, and I decided that I would reapply if I were to look for a full-time position again.\nMy next job search started earlier than I had anticipated. In October 2016, I decided to leave Car Next Door due to disagreements with the founders over the general direction of the company. In addition, I had more flexibility in choosing where to live, as my personal circumstances had changed. As I’ve always been curious about life outside the capital cities of Australia, I wanted to move away from Sydney. While I could have probably continued working remotely with Car Next Door, I felt that it would be better to find a job with a fully-distributed team. 
Therefore, I messaged Martin and we scheduled another chat.\nThe second chat with Martin took place in early November. Similarly to the first chat, it was conducted via Skype text messages, and revolved around my work in the time that has passed since the first chat. This time, as I was keen on continuing with the process, I asked more specific questions about what kind of work I’m likely to end up doing and what the next steps would be. The answers were that I’d be joining the data science team, and that the next steps are a pre-trial test, a paid trial, and a final interview with Matt. While this sounds straightforward, it took another six months until I finally became an Automattic employee (but I wasn’t in a rush).\n☑️ Step 4: Pass the pre-trial test The pre-trial test consisted of a data analysis task, where I was given a dataset and a set of questions to answer by Carly Stambaugh, the data science lead. The goal of the test is to evaluate the candidate’s approach to a problem, and assess organisational and communication skills. As such, the focus isn’t on obtaining a specific result, so candidates are given a choice of several potential avenues to explore. The open-ended nature of the task is reminiscent of many real-world data science projects, where you don’t always have a clear idea of what you’re going to discover. While some people may find this kind of uncertainty daunting, I find it interesting, as it is one of the things that makes data science a science.\nI spent a few days analysing the data and preparing a report, which was submitted as a Jupyter Notebook. After submitting my initial report, there were a few follow-up questions, which I answered by email. The report was reviewed by Carly and Martin, and as they were satisfied with my work, I was invited to proceed to the next stage: A paid trial project.\n👨‍💻 Step 5: Do the trial project The main part of the application process with Automattic is the paid trial project. 
The rationale behind doing paid trials was explained a few years ago by Matt in Hire by Auditions, Not Resumes:\nBefore we hire anyone, they go through a trial process first, on contract. They can do the work at night or over the weekend, so they don’t have to leave their current job in the meantime. We pay a standard rate of $25 per hour, regardless of whether you’re applying to be an engineer or the chief financial officer.\nDuring the trials, we give the applicants actual work. If you’re applying to work in customer support, you’ll answer tickets. If you’re an engineer, you’ll work on engineering problems. If you’re a designer, you’ll design.\nThere’s nothing like being in the trenches with someone, working with them day by day. It tells you something you can’t learn from resumes, interviews, or reference checks. At the end of the trial, everyone involved has a great sense of whether they want to work together going forward. And, yes, that means everyone — it’s a mutual tryout. Some people decide we’re not the right fit for them.\nThe goal of my trial project was to improve the Elasticsearch language detection algorithm. This took about a month, and ultimately resulted in a pull request that got merged into the language detection plugin. I find this aspect of the process pretty exciting: While the plugin is used to classify millions of documents internally by Automattic, its impact extends beyond the company, as Elasticsearch is used by many other organisations and projects. This stands in contrast to many other technical job interviews, which consist of unpaid work on toy problems under stressful conditions, where the work performed is ultimately thrown away. While the monetary compensation for the trial work is lower than the market rate for data science consulting, I valued the opportunity to work on a real open source project, even if this hadn’t led to me getting hired.\nThere was much more to the trial project than what’s shown in the final pull request. 
Most of the discussions were held on an internal project thread, primarly under the guidance of Carly (the data science lead), and Greg (the data wrangler who replied to my post a year earlier). The project was kicked off with a general problem statement: There was some evidence that the Elasticsearch language detection plugin doesn’t perform well on short texts, and my mission was to improve it. As the plugin didn’t include any tests for short texts, one of the main contributions of my work was the creation of datasets and tests to measure its accuracy on texts of different lengths. This was followed by some tweaks that improved the plugin’s performance, as summarised in the pull request. Internally, this work consisted of several iterations where I came up with ideas, asked questions, implemented the ideas, shared the results, and discussed further steps. There are still many possible improvements to the work done in the trial. However, as trials generally last around a month, we decided to end it after a few iterations.\nI enjoyed the trial process, but it is definitely not for everyone. Most notably, there is a strong emphasis on asynchronous text-based communication, which is the main mode by which projects are coordinated at Automattic. People who don’t enjoy written communication may find this aspect challenging, but I have always found that writing helps me organise my thoughts, and that I retain information better when reading than when listening to people speak. That being said, Automatticians do meet in person several times a year, and some teams have video chats for some discussions. While doing the trial, I had a video chat with Carly, which was the first (and last) time in the process that I got to see and hear a live human. However, this was not an essential part of the trial project, as our chat was mostly on the data scientist role and my job expectations.\n⏳ Step 6: Wait patiently I finished working on the trial project just before Christmas. 
The feedback I received throughout the trial was positive, but Martin, Carly, and Greg had to go through the work and discuss it among themselves before making a final decision. This took about a month, due to the holiday period, various personal circumstances, and the data science team meetup that was scheduled for January 2017. Eventually, Martin got back to me with positive news: They were satisfied with my trial work, which meant there was only one stage left – the final interview with Matt Mullenweg, Automattic’s CEO.\n👉 Step 7: Ping Matt Like other parts of the process, the interview with Matt is text-based. The way it works is fairly simple: I was instructed to message Matt on Slack and wait for a response, which may take days or weeks. I sent Matt a message on January 25, and was surprised to hear back from him the following morning. However, that day was Australia Day, which is a public holiday here. Therefore, I only got back to him two hours after he messaged me that morning, and by that time he was probably already busy with other things. This was the start of a pretty long wait.\n⏳ Step 8: Wait patiently I left Car Next Door at the end of January, as I figured that I would be able to line up some other work even if things didn’t work out with Automattic. My plan was to take some time off, and then move up to the Northern Rivers area of New South Wales. I had two Reef Life Survey trips planned, so I wasn’t going to start working again before mid-April. I assumed that I would hear back from Matt before then, which would have allowed me to make an informed decision whether to look for another job or not.\nAfter two weeks of waiting, the time for my dive trips was nearing. As I was going to be without mobile reception for a while, I thought it’d be worth letting Matt know my schedule. After discussing the matter with Martin, I messaged Matt. 
He responded, saying that we might as well do the interview at the beginning of April, as I won’t be starting work before that time anyway. I would have preferred to be done with the interview earlier, but was happy to have some certainty and not worry about missing more chat messages before April.\nIn early April, I returned from my second dive trip (which included a close encounter with Cyclone Debbie), and was hoping to sort out my remote work situation while completing the move up north. Unfortunately, while the move was successful, I was ready to give up on Automattic because I hadn’t heard back from Matt at all in April. However, Martin remained optimistic and encouraged me to wait patiently, which I did as I was pretty busy with the move and with some casual freelancing projects.\n💬 Step 9: Chat with Matt and accept the job offer The chat with Matt finally happened on May 2. As is often the case, it took a few hours and covered my background, the trial process, and some other general questions. I asked him about my long wait for the final chat, and he apologised for me being an outlier, as most chats happen within two weeks of a candidate being passed over to him. As the chat was about to conclude, we got to the topic of salary negotiation (which went well), and then the process was finally over! Within a few hours of the chat I was sent an offer letter and an employment contract. As Automattic has an entity in Australia (called Ausomattic), it’s a fairly standard contract. I signed the contract and started work the following week – over a year and a half after my initial application. Even before I started working, I booked tickets to meet the data division in Montréal – a fairly swift transition from the long wait for the final interview.\n🎉 Step 10: Start working and choose a job title As noted above, Automatticians get to choose their own job titles, so to become a data scientist with Automattic, I had to set my job title to Data Scientist. 
This is generally how many people become data scientists these days, even outside Automattic. However, job titles don’t matter as much as job satisfaction. And after 2.5 months with Automattic, I’m very satisfied with my decision to join the company. My first three weeks were spent doing customer support, like all new Automattic employees. Since then, I’ve been involved in projects to make engagement measurement more consistent (harder than it sounds, as counting things is hard), and to improve the data science codebase (e.g., moving away from Legacy Python). Besides that, I also went to Montréal for the data division meetup, and have started getting into chatbot work. I’m looking forward to doing more work and sharing my experience here and on data.blog.\n","wordCount":"3143","inLanguage":"en","image":"https://yanirseroussi.com/long-remote-road.jpg","datePublished":"2017-07-29T05:39:26Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My 10-step path to becoming a remote data scientist with Automattic</h1><div class=post-meta><span title='2017-07-29 05:39:26 +0000 UTC'>July 29, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-07-29-my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg 2000w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg alt width=2000 height=1125></figure><div class=post-content><p>About two years ago, I read the book <a href=http://scottberkun.com/yearwithoutpants/ target=_blank rel=noopener>The Year without Pants</a>, which describes the author&rsquo;s experience leading a team at <a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> (the company behind WordPress.com, among other products). Automattic is a fully-distributed company, which means that all of its employees work remotely (hence pants are optional). While the book discusses some of the challenges of working remotely, the author&rsquo;s general experience was very positive. A few months after reading the book, I decided to look for a full-time position after <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>a period of independent work</a>. Ideally, I wanted a well-paid data science-y remote job with an established distributed tech company that offers a good life balance and makes products I care about. Automattic seemed to tick all my boxes, so I decided to apply for a job with them. 
This post describes my application steps, which ultimately led to me becoming a data scientist with Automattic.</p><p>Before jumping in, it&rsquo;s worth noting that this post describes <em>my</em> personal experience. If you apply for a job with Automattic, your experience is likely to be different, as the process varies across teams, and evolves over time.</p><h2 id=-step-1-do-background-research-and-apply>📧 Step 1: Do background research and apply<a hidden class=anchor aria-hidden=true href=#-step-1-do-background-research-and-apply>#</a></h2><p>I decided to apply for a data wrangler position with Automattic in October 2015. While data <em>wrangler</em> may sound less sexy than data <em>scientist</em>, reading the <a href=http://web.archive.org/web/20150908140923/https://automattic.com/work-with-us/data-wrangler/ target=_blank rel=noopener>job ad</a> led me to believe that the position may involve interesting data science work. This impression was strengthened by some LinkedIn stalking, which included finding current data wranglers and reading through their profiles and websites. I later found out that all the people on the data division start out as data wranglers, and then they may pick their own title. Some data wranglers do data science work, while others are more focused on data engineering, and there are some projects that require a broad range of skills. As <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>the usefulness of the term <em>data scientist</em> is questionable</a>, I&rsquo;m not too fussed about fancy job titles. It&rsquo;s more important to do interesting work in a supportive environment.</p><p>Applying for the job was fairly straightforward. I simply followed the instructions from the ad:</p><blockquote><p>Does this sound interesting? If yes, please send a short email to jobs @ this domain telling us about yourself and attach a resumé. Let us know what you can contribute to the team. 
Include the title of the position you&rsquo;re applying for and your name in the subject. Proofread! Make sure you spell and capitalize WordPress and Automattic correctly. We are lucky to receive hundreds of applications for every position, so try to make your application stand out. If you apply for multiple positions or send multiple emails there will be one reply.</p></blockquote><p>Having been on the receiving side of job applications, I find it surprising that many people don&rsquo;t bother writing a cover letter, addressing the selection criteria in the ad, or even applying for a job they&rsquo;re qualified to do. Hence, my cover letter was fairly short, comprising several bullet points that highlight the similarities between the job requirements and my experience. It was nothing fancy, but simple cover letters have worked well for me in the past.</p><h2 id=-step-2-wait-patiently>⏳ Step 2: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-2-wait-patiently>#</a></h2><p>The initial application was followed by a long wait. From my research, this is the typical scenario. This is unsurprising, as <a href=https://automattic.com/about/ target=_blank rel=noopener>Automattic is a fairly small company with a large footprint</a>, which is both distributed and known as a great place to work (e.g., its <a href=https://www.glassdoor.com.au/Reviews/Automattic-Reviews-E751107.htm target=_blank rel=noopener>Glassdoor rating is 4.9</a>). Therefore, it attracts many applicants from all over the world, which take a while to process. In addition, <a href=http://davemart.in/remote-hiring/ target=_blank rel=noopener>Matt Mullenweg (Automattic&rsquo;s CEO) reviews job applications before passing them on to the team leads</a>.</p><p>As I didn&rsquo;t know that Matt reviewed job applications, I decided to try to shorten the wait by getting introduced to someone in the data division. 
My first attempt was via a second-degree LinkedIn connection who works for Automattic. He responded quickly when I reached out to him, saying that his experience working with the company is in line with the Glassdoor reviews – it&rsquo;s the best job he&rsquo;s had in his 15-year-long career. However, he couldn&rsquo;t help me with an intro, because there is no simple way around Automattic&rsquo;s internal processes. Nonetheless, he reassured me that it is worth waiting patiently, as the strict process means that you end up working with great people.</p><p>I wasn&rsquo;t in a huge rush to find a job, but in December 2015 I decided to accept an offer to become the head of data science at <a href=https://www.carnextdoor.com.au/ target=_blank rel=noopener>Car Next Door</a>. This was a good decision at the time, as I believe in the company&rsquo;s original vision of reducing the number of cars on the road through car sharing, and it seemed like there would be many interesting projects for me to work on. The position wasn&rsquo;t completely remote, but as the company was already spread across several cities, I was able to work from home for a day or two every week. In addition, it was a pleasant commute by bike from my Sydney home to the office, so putting the fully-remote job search on hold didn&rsquo;t seem like a major sacrifice. As I hadn&rsquo;t heard anything from Automattic at that stage, it seemed unwise to reject a good offer, so I started working full-time with Car Next Door in January 2016.</p><p>I successfully attracted Automattic&rsquo;s attention with <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/>a post I published on the misuse of the word <em>insights</em> by many tech companies</a>, which included an example from WordPress.com. 
Greg Ichneumon Brown, one of the data wranglers, <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/#comment-957>commented on the post</a>, and invited me to apply to join Automattic and help them address the issues I raised. This happened after I accepted the offer from Car Next Door, and hasn&rsquo;t resulted in any speed up of the process, so I just gave up on Automattic and carried on with my life.</p><h2 id=-step-3-chat-with-the-data-lead>💬 Step 3: Chat with the data lead<a hidden class=anchor aria-hidden=true href=#-step-3-chat-with-the-data-lead>#</a></h2><p>I finally heard back from Automattic in February 2016 (four months after my initial application and a month into my employment with Car Next Door). Martin Remy, who leads the data division, emailed me to enquire if I&rsquo;m still interested in the position. I informed him that I was no longer looking for a job, but we agreed to have an informal chat, as I&rsquo;ve been waiting for such a long time.</p><p>As is often the case with Automattic interviews, the chat with Martin was completely text-based. Working with a distributed team means that voice and video calls can be hard to schedule. Hence, Automattic relies heavily on textual channels, and text-based interviews allow the company to test the written communication skills of candidates. The chat revolved around my past work experience, and Martin also took the time to answer my questions about the company and the data division. At the conclusion of the chat, Martin suggested I contact him directly if I was ever interested in continuing the application process. While I was happy with my position at the time, the chat strengthened my positive impression of Automattic, and I decided that I would reapply if I were to look for a full-time position again.</p><p>My next job search started earlier than I had anticipated. 
In October 2016, I decided to leave Car Next Door due to disagreements with the founders over the general direction of the company. In addition, I had more flexibility in choosing where to live, as my personal circumstances had changed. As I&rsquo;ve always been curious about life outside the capital cities of Australia, I wanted to move away from Sydney. While I could have probably continued working remotely with Car Next Door, I felt that it would be better to find a job with a fully-distributed team. Therefore, I messaged Martin and we scheduled another chat.</p><p>The second chat with Martin took place in early November. Similarly to the first chat, it was conducted via Skype text messages, and revolved around my work in the time that has passed since the first chat. This time, as I was keen on continuing with the process, I asked more specific questions about what kind of work I&rsquo;m likely to end up doing and what the next steps would be. The answers were that I&rsquo;d be joining the data science team, and that the next steps are a pre-trial test, a paid trial, and a final interview with Matt. While this sounds straightforward, it took another six months until I finally became an Automattic employee (but I wasn&rsquo;t in a rush).</p><h2 id=-step-4-pass-the-pre-trial-test>☑️ Step 4: Pass the pre-trial test<a hidden class=anchor aria-hidden=true href=#-step-4-pass-the-pre-trial-test>#</a></h2><p>The pre-trial test consisted of a data analysis task, where I was given a dataset and a set of questions to answer by Carly Stambaugh, the data science lead. The goal of the test is to evaluate the candidate&rsquo;s approach to a problem, and assess organisational and communication skills. As such, the focus isn&rsquo;t on obtaining a specific result, so candidates are given a choice of several potential avenues to explore. 
The open-ended nature of the task is reminiscent of many real-world data science projects, where you don&rsquo;t always have a clear idea of what you&rsquo;re going to discover. While some people may find this kind of uncertainty daunting, I find it interesting, as it is one of the things that makes data science a <em>science</em>.</p><p>I spent a few days analysing the data and preparing a report, which was submitted as a <a href=http://jupyter.org/ target=_blank rel=noopener>Jupyter Notebook</a>. After submitting my initial report, there were a few follow-up questions, which I answered by email. The report was reviewed by Carly and Martin, and as they were satisfied with my work, I was invited to proceed to the next stage: A paid trial project.</p><h2 id=-step-5-do-the-trial-project>👨‍💻 Step 5: Do the trial project<a hidden class=anchor aria-hidden=true href=#-step-5-do-the-trial-project>#</a></h2><p>The main part of the application process with Automattic is the paid trial project. The rationale behind doing paid trials was explained a few years ago by Matt in <a href=https://hbr.org/2014/01/hire-by-auditions-not-resumes target=_blank rel=noopener>Hire by Auditions, Not Resumes</a>:</p><blockquote><p>Before we hire anyone, they go through a trial process first, on contract. They can do the work at night or over the weekend, so they don&rsquo;t have to leave their current job in the meantime. We pay a standard rate of $25 per hour, regardless of whether you&rsquo;re applying to be an engineer or the chief financial officer.</p><p>During the trials, we give the applicants actual work. If you&rsquo;re applying to work in customer support, you&rsquo;ll answer tickets. If you&rsquo;re an engineer, you&rsquo;ll work on engineering problems. If you&rsquo;re a designer, you&rsquo;ll design.</p><p>There&rsquo;s nothing like being in the trenches with someone, working with them day by day. 
It tells you something you can&rsquo;t learn from resumes, interviews, or reference checks. At the end of the trial, everyone involved has a great sense of whether they want to work together going forward. And, yes, that means everyone — it&rsquo;s a mutual tryout. Some people decide we&rsquo;re not the right fit for them.</p></blockquote><p>The goal of my trial project was to improve the <a href=https://www.elastic.co/products/elasticsearch target=_blank rel=noopener>Elasticsearch</a> language detection algorithm. This took about a month, and ultimately resulted in <a href=https://github.com/jprante/elasticsearch-langdetect/pull/69 target=_blank rel=noopener>a pull request that got merged into the language detection plugin</a>. I find this aspect of the process pretty exciting: While the plugin is used to classify millions of documents internally by Automattic, its impact extends beyond the company, as Elasticsearch is used by many other organisations and projects. This stands in contrast to many other technical job interviews, which consist of unpaid work on toy problems under stressful conditions, where the work performed is ultimately thrown away. While the monetary compensation for the trial work is lower than the market rate for data science consulting, I valued the opportunity to work on a real open source project, even if this hadn&rsquo;t led to me getting hired.</p><p>There was much more to the trial project than what&rsquo;s shown in the final pull request. Most of the discussions were held on an internal project thread, primarily under the guidance of Carly (the data science lead), and Greg (the data wrangler who replied to my post a year earlier). The project was kicked off with a general problem statement: There was some evidence that the Elasticsearch language detection plugin doesn&rsquo;t perform well on short texts, and my mission was to improve it. 
As the plugin didn&rsquo;t include any tests for short texts, one of the main contributions of my work was the creation of datasets and tests to measure its accuracy on texts of different lengths. This was followed by some tweaks that improved the plugin&rsquo;s performance, as <a href=https://github.com/jprante/elasticsearch-langdetect/pull/69 target=_blank rel=noopener>summarised in the pull request</a>. Internally, this work consisted of several iterations where I came up with ideas, asked questions, implemented the ideas, shared the results, and discussed further steps. There are still many possible improvements to the work done in the trial. However, as trials generally last around a month, we decided to end it after a few iterations.</p><p>I enjoyed the trial process, but it is definitely not for everyone. Most notably, there is a strong emphasis on asynchronous text-based communication, which is the main mode by which projects are coordinated at Automattic. People who don&rsquo;t enjoy written communication may find this aspect challenging, but I have always found that writing helps me organise my thoughts, and that I retain information better when reading than when listening to people speak. That being said, Automatticians do meet in person several times a year, and some teams have video chats for some discussions. While doing the trial, I had a video chat with Carly, which was the first (and last) time in the process that I got to see and hear a live human. However, this was not an essential part of the trial project, as our chat was mostly on the data scientist role and my job expectations.</p><h2 id=-step-6-wait-patiently>⏳ Step 6: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-6-wait-patiently>#</a></h2><p>I finished working on the trial project just before Christmas. 
The feedback I received throughout the trial was positive, but Martin, Carly, and Greg had to go through the work and discuss it among themselves before making a final decision. This took about a month, due to the holiday period, various personal circumstances, and the data science team meetup that was scheduled for January 2017. Eventually, Martin got back to me with positive news: They were satisfied with my trial work, which meant there was only one stage left – the final interview with Matt Mullenweg, Automattic&rsquo;s CEO.</p><h2 id=-step-7-ping-matt>👉 Step 7: Ping Matt<a hidden class=anchor aria-hidden=true href=#-step-7-ping-matt>#</a></h2><p>Like other parts of the process, the interview with Matt is text-based. The way it works is fairly simple: I was instructed to message Matt on Slack and wait for a response, which may take days or weeks. I sent Matt a message on January 25, and was surprised to hear back from him the following morning. However, that day was Australia Day, which is a public holiday here. Therefore, I only got back to him two hours after he messaged me that morning, and by that time he was probably already busy with other things. This was the start of a pretty long wait.</p><h2 id=-step-8-wait-patiently>⏳ Step 8: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-8-wait-patiently>#</a></h2><p>I left Car Next Door at the end of January, as I figured that I would be able to line up some other work even if things didn&rsquo;t work out with Automattic. My plan was to take some time off, and then move up to the Northern Rivers area of New South Wales. I had two <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>Reef Life Survey trips</a> planned, so I wasn&rsquo;t going to start working again before mid-April. 
I assumed that I would hear back from Matt before then, which would have allowed me to make an informed decision whether to look for another job or not.</p><p>After two weeks of waiting, the time for my dive trips was nearing. As I was going to be without mobile reception for a while, I thought it&rsquo;d be worth letting Matt know my schedule. After discussing the matter with Martin, I messaged Matt. He responded, saying that we might as well do the interview at the beginning of April, as I won&rsquo;t be starting work before that time anyway. I would have preferred to be done with the interview earlier, but was happy to have some certainty and not worry about missing more chat messages before April.</p><p>In early April, I returned from my second dive trip (which included <a href=https://www.whitsundaytimes.com.au/news/boat-caught-in-eye-of-cyclone-cruises-home/3164170/ target=_blank rel=noopener>a close encounter with Cyclone Debbie</a>), and was hoping to sort out my remote work situation while completing the move up north. Unfortunately, while the move was successful, I was ready to give up on Automattic because I hadn&rsquo;t heard back from Matt at all in April. However, Martin remained optimistic and encouraged me to wait patiently, which I did as I was pretty busy with the move and with some casual freelancing projects.</p><h2 id=-step-9-chat-with-matt-and-accept-the-job-offer>💬 Step 9: Chat with Matt and accept the job offer<a hidden class=anchor aria-hidden=true href=#-step-9-chat-with-matt-and-accept-the-job-offer>#</a></h2><p>The chat with Matt finally happened on May 2. As is often the case, it took a few hours and covered my background, the trial process, and some other general questions. I asked him about my long wait for the final chat, and he apologised for me being an outlier, as most chats happen within two weeks of a candidate being passed over to him. 
As the chat was about to conclude, we got to the topic of salary negotiation (which went well), and then the process was finally over! Within a few hours of the chat I was sent an offer letter and an employment contract. As Automattic has an entity in Australia (called Ausomattic), it&rsquo;s a fairly standard contract. I signed the contract and started work the following week – over a year and a half after my initial application. Even before I started working, I booked tickets to <a href=https://data.blog/2017/06/29/data-coalesce-automattic-data-division-meets-in-montreal/ target=_blank rel=noopener>meet the data division in Montréal</a> – a fairly swift transition from the long wait for the final interview.</p><h2 id=-step-10-start-working-and-choose-a-job-title>🎉 Step 10: Start working and choose a job title<a hidden class=anchor aria-hidden=true href=#-step-10-start-working-and-choose-a-job-title>#</a></h2><p>As noted above, Automatticians get to choose their own job titles, so to become a data scientist with Automattic, I had to set my job title to Data Scientist. This is generally how many people become data scientists these days, even outside Automattic. However, job titles don&rsquo;t matter as much as job satisfaction. And after 2.5 months with Automattic, I&rsquo;m very satisfied with my decision to join the company. My first three weeks were spent doing customer support, like all new Automattic employees. Since then, I&rsquo;ve been involved in projects to make engagement measurement more consistent (harder than it sounds, as <a href=http://daynebatten.com/2016/06/counting-hard-data-science/ target=_blank rel=noopener>counting things is hard</a>), and to improve the data science codebase (e.g., moving away from <a href=http://powerfulpython.com/blog/magic-word-legacy-python/ target=_blank rel=noopener>Legacy Python</a>). 
Besides that, I also went to Montréal for the data division meetup, and have started getting into <a href=https://data.blog/2017/05/24/may-the-bot-be-with-you-how-algorithms-are-supporting-happiness-at-wordpress-com/ target=_blank rel=noopener>chatbot work</a>. I&rsquo;m looking forward to doing more work and sharing my experience here and on <a href=https://data.blog/ target=_blank rel=noopener>data.blog</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/elasticsearch/>Elasticsearch</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li><li><a href=https://yanirseroussi.com/tags/wordpress/>WordPress</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on x" href="https://x.com/intent/tweet/?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cElasticsearch%2cpersonal%2cWordPress"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on 
linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&amp;title=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;summary=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;source=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&title=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 
265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 
39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on whatsapp" href="https://api.whatsapp.com/send?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic%20-%20https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 
22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on telegram" href="https://telegram.me/share/url?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&u=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 
62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Automattic,career,data science,Elasticsearch,personal,WordPress"><meta name=description content="I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. I got it eventually."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My 10-step path to becoming a remote data scientist with Automattic"><meta property="og:description" content="I wanted a well-paid data science-y remote job with an 
established company that offers a good life balance and makes products I care about. I got it eventually."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"><meta property="og:image" content="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-07-29T05:39:26+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg"><meta name=twitter:title content="My 10-step path to becoming a remote data scientist with Automattic"><meta name=twitter:description content="I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. 
I got it eventually."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My 10-step path to becoming a remote data scientist with Automattic","item":"https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My 10-step path to becoming a remote data scientist with Automattic","name":"My 10-step path to becoming a remote data scientist with Automattic","description":"I wanted a well-paid data science-y remote job with an established company that offers a good life balance and makes products I care about. I got it eventually.","keywords":["Automattic","career","data science","Elasticsearch","personal","WordPress"],"articleBody":"About two years ago, I read the book The Year without Pants, which describes the author’s experience leading a team at Automattic (the company behind WordPress.com, among other products). Automattic is a fully-distributed company, which means that all of its employees work remotely (hence pants are optional). While the book discusses some of the challenges of working remotely, the author’s general experience was very positive. A few months after reading the book, I decided to look for a full-time position after a period of independent work. Ideally, I wanted a well-paid data science-y remote job with an established distributed tech company that offers a good life balance and makes products I care about. Automattic seemed to tick all my boxes, so I decided to apply for a job with them. This post describes my application steps, which ultimately led to me becoming a data scientist with Automattic.\nBefore jumping in, it’s worth noting that this post describes my personal experience. 
If you apply for a job with Automattic, your experience is likely to be different, as the process varies across teams, and evolves over time.\n📧 Step 1: Do background research and apply I decided to apply for a data wrangler position with Automattic in October 2015. While data wrangler may sound less sexy than data scientist, reading the job ad led me to believe that the position may involve interesting data science work. This impression was strengthened by some LinkedIn stalking, which included finding current data wranglers and reading through their profiles and websites. I later found out that all the people on the data division start out as data wranglers, and then they may pick their own title. Some data wranglers do data science work, while others are more focused on data engineering, and there are some projects that require a broad range of skills. As the usefulness of the term data scientist is questionable, I’m not too fussed about fancy job titles. It’s more important to do interesting work in a supportive environment.\nApplying for the job was fairly straightforward. I simply followed the instructions from the ad:\nDoes this sound interesting? If yes, please send a short email to jobs @ this domain telling us about yourself and attach a resumé. Let us know what you can contribute to the team. Include the title of the position you’re applying for and your name in the subject. Proofread! Make sure you spell and capitalize WordPress and Automattic correctly. We are lucky to receive hundreds of applications for every position, so try to make your application stand out. If you apply for multiple positions or send multiple emails there will be one reply.\nHaving been on the receiving side of job applications, I find it surprising that many people don’t bother writing a cover letter, addressing the selection criteria in the ad, or even applying for a job they’re qualified to do. 
Hence, my cover letter was fairly short, comprising several bullet points that highlight the similarities between the job requirements and my experience. It was nothing fancy, but simple cover letters have worked well for me in the past.\n⏳ Step 2: Wait patiently The initial application was followed by a long wait. From my research, this is the typical scenario. This is unsurprising, as Automattic is a fairly small company with a large footprint, which is both distributed and known as a great place to work (e.g., its Glassdoor rating is 4.9). Therefore, it attracts many applicants from all over the world, which take a while to process. In addition, Matt Mullenweg (Automattic’s CEO) reviews job applications before passing them on to the team leads.\nAs I didn’t know that Matt reviewed job applications, I decided to try to shorten the wait by getting introduced to someone in the data division. My first attempt was via a second-degree LinkedIn connection who works for Automattic. He responded quickly when I reached out to him, saying that his experience working with the company is in line with the Glassdoor reviews – it’s the best job he’s had in his 15-year-long career. However, he couldn’t help me with an intro, because there is no simple way around Automattic’s internal processes. Nonetheless, he reassured me that it is worth waiting patiently, as the strict process means that you end up working with great people.\nI wasn’t in a huge rush to find a job, but in December 2015 I decided to accept an offer to become the head of data science at Car Next Door. This was a good decision at the time, as I believe in the company’s original vision of reducing the number of cars on the road through car sharing, and it seemed like there would be many interesting projects for me to work on. The position wasn’t completely remote, but as the company was already spread across several cities, I was able to work from home for a day or two every week. 
In addition, it was a pleasant commute by bike from my Sydney home to the office, so putting the fully-remote job search on hold didn’t seem like a major sacrifice. As I haven’t heard anything from Automattic at that stage, it seemed unwise to reject a good offer, so I started working full-time with Car Next Door in January 2016.\nI successfully attracted Automattic’s attention with a post I published on the misuse of the word insights by many tech companies, which included an example from WordPress.com. Greg Ichneumon Brown, one of the data wranglers, commented on the post, and invited me to apply to join Automattic and help them address the issues I raised. This happened after I accepted the offer from Car Next Door, and hasn’t resulted in any speed up of the process, so I just gave up on Automattic and carried on with my life.\n💬 Step 3: Chat with the data lead I finally heard back from Automattic in February 2016 (four months after my initial application and a month into my employment with Car Next Door). Martin Remy, who leads the data division, emailed me to enquire if I’m still interested in the position. I informed him that I was no longer looking for a job, but we agreed to have an informal chat, as I’ve been waiting for such a long time.\nAs is often the case with Automattic interviews, the chat with Martin was completely text-based. Working with a distributed team means that voice and video calls can be hard to schedule. Hence, Automattic relies heavily on textual channels, and text-based interviews allow the company to test the written communication skills of candidates. The chat revolved around my past work experience, and Martin also took the time to answer my questions about the company and the data division. At the conclusion of the chat, Martin suggested I contact him directly if I was ever interested in continuing the application process. 
While I was happy with my position at the time, the chat strengthened my positive impression of Automattic, and I decided that I would reapply if I were to look for a full-time position again.\nMy next job search started earlier than I had anticipated. In October 2016, I decided to leave Car Next Door due to disagreements with the founders over the general direction of the company. In addition, I had more flexibility in choosing where to live, as my personal circumstances had changed. As I’ve always been curious about life outside the capital cities of Australia, I wanted to move away from Sydney. While I could have probably continued working remotely with Car Next Door, I felt that it would be better to find a job with a fully-distributed team. Therefore, I messaged Martin and we scheduled another chat.\nThe second chat with Martin took place in early November. Similarly to the first chat, it was conducted via Skype text messages, and revolved around my work in the time that has passed since the first chat. This time, as I was keen on continuing with the process, I asked more specific questions about what kind of work I’m likely to end up doing and what the next steps would be. The answers were that I’d be joining the data science team, and that the next steps are a pre-trial test, a paid trial, and a final interview with Matt. While this sounds straightforward, it took another six months until I finally became an Automattic employee (but I wasn’t in a rush).\n☑️ Step 4: Pass the pre-trial test The pre-trial test consisted of a data analysis task, where I was given a dataset and a set of questions to answer by Carly Stambaugh, the data science lead. The goal of the test is to evaluate the candidate’s approach to a problem, and assess organisational and communication skills. As such, the focus isn’t on obtaining a specific result, so candidates are given a choice of several potential avenues to explore. 
The open-ended nature of the task is reminiscent of many real-world data science projects, where you don’t always have a clear idea of what you’re going to discover. While some people may find this kind of uncertainty daunting, I find it interesting, as it is one of the things that makes data science a science.\nI spent a few days analysing the data and preparing a report, which was submitted as a Jupyter Notebook. After submitting my initial report, there were a few follow-up questions, which I answered by email. The report was reviewed by Carly and Martin, and as they were satisfied with my work, I was invited to proceed to the next stage: A paid trial project.\n👨‍💻 Step 5: Do the trial project The main part of the application process with Automattic is the paid trial project. The rationale behind doing paid trials was explained a few years ago by Matt in Hire by Auditions, Not Resumes:\nBefore we hire anyone, they go through a trial process first, on contract. They can do the work at night or over the weekend, so they don’t have to leave their current job in the meantime. We pay a standard rate of $25 per hour, regardless of whether you’re applying to be an engineer or the chief financial officer.\nDuring the trials, we give the applicants actual work. If you’re applying to work in customer support, you’ll answer tickets. If you’re an engineer, you’ll work on engineering problems. If you’re a designer, you’ll design.\nThere’s nothing like being in the trenches with someone, working with them day by day. It tells you something you can’t learn from resumes, interviews, or reference checks. At the end of the trial, everyone involved has a great sense of whether they want to work together going forward. And, yes, that means everyone — it’s a mutual tryout. Some people decide we’re not the right fit for them.\nThe goal of my trial project was to improve the Elasticsearch language detection algorithm. 
This took about a month, and ultimately resulted in a pull request that got merged into the language detection plugin. I find this aspect of the process pretty exciting: While the plugin is used to classify millions of documents internally by Automattic, its impact extends beyond the company, as Elasticsearch is used by many other organisations and projects. This stands in contrast to many other technical job interviews, which consist of unpaid work on toy problems under stressful conditions, where the work performed is ultimately thrown away. While the monetary compensation for the trial work is lower than the market rate for data science consulting, I valued the opportunity to work on a real open source project, even if this hadn’t led to me getting hired.\nThere was much more to the trial project than what’s shown in the final pull request. Most of the discussions were held on an internal project thread, primarily under the guidance of Carly (the data science lead), and Greg (the data wrangler who replied to my post a year earlier). The project was kicked off with a general problem statement: There was some evidence that the Elasticsearch language detection plugin doesn’t perform well on short texts, and my mission was to improve it. As the plugin didn’t include any tests for short texts, one of the main contributions of my work was the creation of datasets and tests to measure its accuracy on texts of different lengths. This was followed by some tweaks that improved the plugin’s performance, as summarised in the pull request. Internally, this work consisted of several iterations where I came up with ideas, asked questions, implemented the ideas, shared the results, and discussed further steps. There are still many possible improvements to the work done in the trial. However, as trials generally last around a month, we decided to end it after a few iterations.\nI enjoyed the trial process, but it is definitely not for everyone. 
Most notably, there is a strong emphasis on asynchronous text-based communication, which is the main mode by which projects are coordinated at Automattic. People who don’t enjoy written communication may find this aspect challenging, but I have always found that writing helps me organise my thoughts, and that I retain information better when reading than when listening to people speak. That being said, Automatticians do meet in person several times a year, and some teams have video chats for some discussions. While doing the trial, I had a video chat with Carly, which was the first (and last) time in the process that I got to see and hear a live human. However, this was not an essential part of the trial project, as our chat was mostly on the data scientist role and my job expectations.\n⏳ Step 6: Wait patiently I finished working on the trial project just before Christmas. The feedback I received throughout the trial was positive, but Martin, Carly, and Greg had to go through the work and discuss it among themselves before making a final decision. This took about a month, due to the holiday period, various personal circumstances, and the data science team meetup that was scheduled for January 2017. Eventually, Martin got back to me with positive news: They were satisfied with my trial work, which meant there was only one stage left – the final interview with Matt Mullenweg, Automattic’s CEO.\n👉 Step 7: Ping Matt Like other parts of the process, the interview with Matt is text-based. The way it works is fairly simple: I was instructed to message Matt on Slack and wait for a response, which may take days or weeks. I sent Matt a message on January 25, and was surprised to hear back from him the following morning. However, that day was Australia Day, which is a public holiday here. Therefore, I only got back to him two hours after he messaged me that morning, and by that time he was probably already busy with other things. 
This was the start of a pretty long wait.\n⏳ Step 8: Wait patiently I left Car Next Door at the end of January, as I figured that I would be able to line up some other work even if things didn’t work out with Automattic. My plan was to take some time off, and then move up to the Northern Rivers area of New South Wales. I had two Reef Life Survey trips planned, so I wasn’t going to start working again before mid-April. I assumed that I would hear back from Matt before then, which would have allowed me to make an informed decision whether to look for another job or not.\nAfter two weeks of waiting, the time for my dive trips was nearing. As I was going to be without mobile reception for a while, I thought it’d be worth letting Matt know my schedule. After discussing the matter with Martin, I messaged Matt. He responded, saying that we might as well do the interview at the beginning of April, as I won’t be starting work before that time anyway. I would have preferred to be done with the interview earlier, but was happy to have some certainty and not worry about missing more chat messages before April.\nIn early April, I returned from my second dive trip (which included a close encounter with Cyclone Debbie), and was hoping to sort out my remote work situation while completing the move up north. Unfortunately, while the move was successful, I was ready to give up on Automattic because I haven’t heard back from Matt at all in April. However, Martin remained optimistic and encouraged me to wait patiently, which I did as I was pretty busy with the move and with some casual freelancing projects.\n💬 Step 9: Chat with Matt and accept the job offer The chat with Matt finally happened on May 2. As is often the case, it took a few hours and covered my background, the trial process, and some other general questions. 
I asked him about my long wait for the final chat, and he apologised for me being an outlier, as most chats happen within two weeks of a candidate being passed over to him. As the chat was about to conclude, we got to the topic of salary negotiation (which went well), and then the process was finally over! Within a few hours of the chat I was sent an offer letter and an employment contract. As Automattic has an entity in Australia (called Ausomattic), it’s a fairly standard contract. I signed the contract and started work the following week – over a year and a half after my initial application. Even before I started working, I booked tickets to meet the data division in Montréal – a fairly swift transition from the long wait for the final interview.\n🎉 Step 10: Start working and choose a job title As noted above, Automatticians get to choose their own job titles, so to become a data scientist with Automattic, I had to set my job title to Data Scientist. This is generally how many people become data scientists these days, even outside Automattic. However, job titles don’t matter as much as job satisfaction. And after 2.5 months with Automattic, I’m very satisfied with my decision to join the company. My first three weeks were spent doing customer support, like all new Automattic employees. Since then, I’ve been involved in projects to make engagement measurement more consistent (harder than it sounds, as counting things is hard), and to improve the data science codebase (e.g., moving away from Legacy Python). Besides that, I also went to Montréal for the data division meetup, and have started getting into chatbot work. 
I’m looking forward to doing more work and sharing my experience here and on data.blog.\n","wordCount":"3143","inLanguage":"en","image":"https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg","datePublished":"2017-07-29T05:39:26Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" 
y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My 10-step path to becoming a remote data scientist with Automattic</h1><div class=post-meta><span title='2017-07-29 05:39:26 +0000 UTC'>July 29, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-07-29-my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road_hu690f2353847db52b435aef42e177b9ac_842409_1500x0_resize_q75_box.jpg 1500w 
,https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg 2000w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/long-remote-road.jpg alt width=2000 height=1125></figure><div class=post-content><p>About two years ago, I read the book <a href=http://scottberkun.com/yearwithoutpants/ target=_blank rel=noopener>The Year without Pants</a>, which describes the author&rsquo;s experience leading a team at <a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> (the company behind WordPress.com, among other products). Automattic is a fully-distributed company, which means that all of its employees work remotely (hence pants are optional). While the book discusses some of the challenges of working remotely, the author&rsquo;s general experience was very positive. A few months after reading the book, I decided to look for a full-time position after <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>a period of independent work</a>. Ideally, I wanted a well-paid data science-y remote job with an established distributed tech company that offers a good life balance and makes products I care about. Automattic seemed to tick all my boxes, so I decided to apply for a job with them. This post describes my application steps, which ultimately led to me becoming a data scientist with Automattic.</p><p>Before jumping in, it&rsquo;s worth noting that this post describes <em>my</em> personal experience. 
If you apply for a job with Automattic, your experience is likely to be different, as the process varies across teams, and evolves over time.</p><h2 id=-step-1-do-background-research-and-apply>📧 Step 1: Do background research and apply<a hidden class=anchor aria-hidden=true href=#-step-1-do-background-research-and-apply>#</a></h2><p>I decided to apply for a data wrangler position with Automattic in October 2015. While data <em>wrangler</em> may sound less sexy than data <em>scientist</em>, reading the <a href=http://web.archive.org/web/20150908140923/https://automattic.com/work-with-us/data-wrangler/ target=_blank rel=noopener>job ad</a> led me to believe that the position may involve interesting data science work. This impression was strengthened by some LinkedIn stalking, which included finding current data wranglers and reading through their profiles and websites. I later found out that all the people on the data division start out as data wranglers, and then they may pick their own title. Some data wranglers do data science work, while others are more focused on data engineering, and there are some projects that require a broad range of skills. As <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>the usefulness of the term <em>data scientist</em> is questionable</a>, I&rsquo;m not too fussed about fancy job titles. It&rsquo;s more important to do interesting work in a supportive environment.</p><p>Applying for the job was fairly straightforward. I simply followed the instructions from the ad:</p><blockquote><p>Does this sound interesting? If yes, please send a short email to jobs @ this domain telling us about yourself and attach a resumé. Let us know what you can contribute to the team. Include the title of the position you&rsquo;re applying for and your name in the subject. Proofread! Make sure you spell and capitalize WordPress and Automattic correctly. 
We are lucky to receive hundreds of applications for every position, so try to make your application stand out. If you apply for multiple positions or send multiple emails there will be one reply.</p></blockquote><p>Having been on the receiving side of job applications, I find it surprising that many people don&rsquo;t bother writing a cover letter, addressing the selection criteria in the ad, or even applying for a job they&rsquo;re qualified to do. Hence, my cover letter was fairly short, comprising several bullet points that highlight the similarities between the job requirements and my experience. It was nothing fancy, but simple cover letters have worked well for me in the past.</p><h2 id=-step-2-wait-patiently>⏳ Step 2: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-2-wait-patiently>#</a></h2><p>The initial application was followed by a long wait. From my research, this is the typical scenario. This is unsurprising, as <a href=https://automattic.com/about/ target=_blank rel=noopener>Automattic is a fairly small company with a large footprint</a>, which is both distributed and known as a great place to work (e.g., its <a href=https://www.glassdoor.com.au/Reviews/Automattic-Reviews-E751107.htm target=_blank rel=noopener>Glassdoor rating is 4.9</a>). Therefore, it attracts many applicants from all over the world, which take a while to process. In addition, <a href=http://davemart.in/remote-hiring/ target=_blank rel=noopener>Matt Mullenweg (Automattic&rsquo;s CEO) reviews job applications before passing them on to the team leads</a>.</p><p>As I didn&rsquo;t know that Matt reviewed job applications, I decided to try to shorten the wait by getting introduced to someone in the data division. My first attempt was via a second-degree LinkedIn connection who works for Automattic. 
He responded quickly when I reached out to him, saying that his experience working with the company is in line with the Glassdoor reviews – it&rsquo;s the best job he&rsquo;s had in his 15-year-long career. However, he couldn&rsquo;t help me with an intro, because there is no simple way around Automattic&rsquo;s internal processes. Nonetheless, he reassured me that it is worth waiting patiently, as the strict process means that you end up working with great people.</p><p>I wasn&rsquo;t in a huge rush to find a job, but in December 2015 I decided to accept an offer to become the head of data science at <a href=https://www.carnextdoor.com.au/ target=_blank rel=noopener>Car Next Door</a>. This was a good decision at the time, as I believe in the company&rsquo;s original vision of reducing the number of cars on the road through car sharing, and it seemed like there would be many interesting projects for me to work on. The position wasn&rsquo;t completely remote, but as the company was already spread across several cities, I was able to work from home for a day or two every week. In addition, it was a pleasant commute by bike from my Sydney home to the office, so putting the fully-remote job search on hold didn&rsquo;t seem like a major sacrifice. As I haven&rsquo;t heard anything from Automattic at that stage, it seemed unwise to reject a good offer, so I started working full-time with Car Next Door in January 2016.</p><p>I successfully attracted Automattic&rsquo;s attention with <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/>a post I published on the misuse of the word <em>insights</em> by many tech companies</a>, which included an example from WordPress.com. Greg Ichneumon Brown, one of the data wranglers, <a href=https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/#comment-957>commented on the post</a>, and invited me to apply to join Automattic and help them address the issues I raised. 
This happened after I accepted the offer from Car Next Door, and hasn&rsquo;t resulted in any speed up of the process, so I just gave up on Automattic and carried on with my life.</p><h2 id=-step-3-chat-with-the-data-lead>💬 Step 3: Chat with the data lead<a hidden class=anchor aria-hidden=true href=#-step-3-chat-with-the-data-lead>#</a></h2><p>I finally heard back from Automattic in February 2016 (four months after my initial application and a month into my employment with Car Next Door). Martin Remy, who leads the data division, emailed me to enquire if I&rsquo;m still interested in the position. I informed him that I was no longer looking for a job, but we agreed to have an informal chat, as I&rsquo;ve been waiting for such a long time.</p><p>As is often the case with Automattic interviews, the chat with Martin was completely text-based. Working with a distributed team means that voice and video calls can be hard to schedule. Hence, Automattic relies heavily on textual channels, and text-based interviews allow the company to test the written communication skills of candidates. The chat revolved around my past work experience, and Martin also took the time to answer my questions about the company and the data division. At the conclusion of the chat, Martin suggested I contact him directly if I was ever interested in continuing the application process. While I was happy with my position at the time, the chat strengthened my positive impression of Automattic, and I decided that I would reapply if I were to look for a full-time position again.</p><p>My next job search started earlier than I had anticipated. In October 2016, I decided to leave Car Next Door due to disagreements with the founders over the general direction of the company. In addition, I had more flexibility in choosing where to live, as my personal circumstances had changed. As I&rsquo;ve always been curious about life outside the capital cities of Australia, I wanted to move away from Sydney. 
While I could have probably continued working remotely with Car Next Door, I felt that it would be better to find a job with a fully-distributed team. Therefore, I messaged Martin and we scheduled another chat.</p><p>The second chat with Martin took place in early November. Similarly to the first chat, it was conducted via Skype text messages, and revolved around my work in the time that has passed since the first chat. This time, as I was keen on continuing with the process, I asked more specific questions about what kind of work I&rsquo;m likely to end up doing and what the next steps would be. The answers were that I&rsquo;d be joining the data science team, and that the next steps are a pre-trial test, a paid trial, and a final interview with Matt. While this sounds straightforward, it took another six months until I finally became an Automattic employee (but I wasn&rsquo;t in a rush).</p><h2 id=-step-4-pass-the-pre-trial-test>☑️ Step 4: Pass the pre-trial test<a hidden class=anchor aria-hidden=true href=#-step-4-pass-the-pre-trial-test>#</a></h2><p>The pre-trial test consisted of a data analysis task, where I was given a dataset and a set of questions to answer by Carly Stambaugh, the data science lead. The goal of the test is to evaluate the candidate&rsquo;s approach to a problem, and assess organisational and communication skills. As such, the focus isn&rsquo;t on obtaining a specific result, so candidates are given a choice of several potential avenues to explore. The open-ended nature of the task is reminiscent of many real-world data science projects, where you don&rsquo;t always have a clear idea of what you&rsquo;re going to discover. 
While some people may find this kind of uncertainty daunting, I find it interesting, as it is one of the things that makes data science a <em>science</em>.</p><p>I spent a few days analysing the data and preparing a report, which was submitted as a <a href=http://jupyter.org/ target=_blank rel=noopener>Jupyter Notebook</a>. After submitting my initial report, there were a few follow-up questions, which I answered by email. The report was reviewed by Carly and Martin, and as they were satisfied with my work, I was invited to proceed to the next stage: A paid trial project.</p><h2 id=-step-5-do-the-trial-project>👨‍💻 Step 5: Do the trial project<a hidden class=anchor aria-hidden=true href=#-step-5-do-the-trial-project>#</a></h2><p>The main part of the application process with Automattic is the paid trial project. The rationale behind doing paid trials was explained a few years ago by Matt in <a href=https://hbr.org/2014/01/hire-by-auditions-not-resumes target=_blank rel=noopener>Hire by Auditions, Not Resumes</a>:</p><blockquote><p>Before we hire anyone, they go through a trial process first, on contract. They can do the work at night or over the weekend, so they don&rsquo;t have to leave their current job in the meantime. We pay a standard rate of $25 per hour, regardless of whether you&rsquo;re applying to be an engineer or the chief financial officer.</p><p>During the trials, we give the applicants actual work. If you&rsquo;re applying to work in customer support, you&rsquo;ll answer tickets. If you&rsquo;re an engineer, you&rsquo;ll work on engineering problems. If you&rsquo;re a designer, you&rsquo;ll design.</p><p>There&rsquo;s nothing like being in the trenches with someone, working with them day by day. It tells you something you can&rsquo;t learn from resumes, interviews, or reference checks. At the end of the trial, everyone involved has a great sense of whether they want to work together going forward. 
And, yes, that means everyone — it&rsquo;s a mutual tryout. Some people decide we&rsquo;re not the right fit for them.</p></blockquote><p>The goal of my trial project was to improve the <a href=https://www.elastic.co/products/elasticsearch target=_blank rel=noopener>Elasticsearch</a> language detection algorithm. This took about a month, and ultimately resulted in <a href=https://github.com/jprante/elasticsearch-langdetect/pull/69 target=_blank rel=noopener>a pull request that got merged into the language detection plugin</a>. I find this aspect of the process pretty exciting: While the plugin is used to classify millions of documents internally by Automattic, its impact extends beyond the company, as Elasticsearch is used by many other organisations and projects. This stands in contrast to many other technical job interviews, which consist of unpaid work on toy problems under stressful conditions, where the work performed is ultimately thrown away. While the monetary compensation for the trial work is lower than the market rate for data science consulting, I valued the opportunity to work on a real open source project, even if this hadn&rsquo;t led to me getting hired.</p><p>There was much more to the trial project than what&rsquo;s shown in the final pull request. Most of the discussions were held on an internal project thread, primarily under the guidance of Carly (the data science lead), and Greg (the data wrangler who replied to my post a year earlier). The project was kicked off with a general problem statement: There was some evidence that the Elasticsearch language detection plugin doesn&rsquo;t perform well on short texts, and my mission was to improve it. As the plugin didn&rsquo;t include any tests for short texts, one of the main contributions of my work was the creation of datasets and tests to measure its accuracy on texts of different lengths. 
This was followed by some tweaks that improved the plugin&rsquo;s performance, as <a href=https://github.com/jprante/elasticsearch-langdetect/pull/69 target=_blank rel=noopener>summarised in the pull request</a>. Internally, this work consisted of several iterations where I came up with ideas, asked questions, implemented the ideas, shared the results, and discussed further steps. There are still many possible improvements to the work done in the trial. However, as trials generally last around a month, we decided to end it after a few iterations.</p><p>I enjoyed the trial process, but it is definitely not for everyone. Most notably, there is a strong emphasis on asynchronous text-based communication, which is the main mode by which projects are coordinated at Automattic. People who don&rsquo;t enjoy written communication may find this aspect challenging, but I have always found that writing helps me organise my thoughts, and that I retain information better when reading than when listening to people speak. That being said, Automatticians do meet in person several times a year, and some teams have video chats for some discussions. While doing the trial, I had a video chat with Carly, which was the first (and last) time in the process that I got to see and hear a live human. However, this was not an essential part of the trial project, as our chat was mostly on the data scientist role and my job expectations.</p><h2 id=-step-6-wait-patiently>⏳ Step 6: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-6-wait-patiently>#</a></h2><p>I finished working on the trial project just before Christmas. The feedback I received throughout the trial was positive, but Martin, Carly, and Greg had to go through the work and discuss it among themselves before making a final decision. This took about a month, due to the holiday period, various personal circumstances, and the data science team meetup that was scheduled for January 2017. 
Eventually, Martin got back to me with positive news: They were satisfied with my trial work, which meant there was only one stage left – the final interview with Matt Mullenweg, Automattic&rsquo;s CEO.</p><h2 id=-step-7-ping-matt>👉 Step 7: Ping Matt<a hidden class=anchor aria-hidden=true href=#-step-7-ping-matt>#</a></h2><p>Like other parts of the process, the interview with Matt is text-based. The way it works is fairly simple: I was instructed to message Matt on Slack and wait for a response, which may take days or weeks. I sent Matt a message on January 25, and was surprised to hear back from him the following morning. However, that day was Australia Day, which is a public holiday here. Therefore, I only got back to him two hours after he messaged me that morning, and by that time he was probably already busy with other things. This was the start of a pretty long wait.</p><h2 id=-step-8-wait-patiently>⏳ Step 8: Wait patiently<a hidden class=anchor aria-hidden=true href=#-step-8-wait-patiently>#</a></h2><p>I left Car Next Door at the end of January, as I figured that I would be able to line up some other work even if things didn&rsquo;t work out with Automattic. My plan was to take some time off, and then move up to the Northern Rivers area of New South Wales. I had two <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>Reef Life Survey trips</a> planned, so I wasn&rsquo;t going to start working again before mid-April. I assumed that I would hear back from Matt before then, which would have allowed me to make an informed decision whether to look for another job or not.</p><p>After two weeks of waiting, the time for my dive trips was nearing. As I was going to be without mobile reception for a while, I thought it&rsquo;d be worth letting Matt know my schedule. After discussing the matter with Martin, I messaged Matt. 
He responded, saying that we might as well do the interview at the beginning of April, as I won&rsquo;t be starting work before that time anyway. I would have preferred to be done with the interview earlier, but was happy to have some certainty and not worry about missing more chat messages before April.</p><p>In early April, I returned from my second dive trip (which included <a href=https://www.whitsundaytimes.com.au/news/boat-caught-in-eye-of-cyclone-cruises-home/3164170/ target=_blank rel=noopener>a close encounter with Cyclone Debbie</a>), and was hoping to sort out my remote work situation while completing the move up north. Unfortunately, while the move was successful, I was ready to give up on Automattic because I haven&rsquo;t heard back from Matt at all in April. However, Martin remained optimistic and encouraged me to wait patiently, which I did as I was pretty busy with the move and with some casual freelancing projects.</p><h2 id=-step-9-chat-with-matt-and-accept-the-job-offer>💬 Step 9: Chat with Matt and accept the job offer<a hidden class=anchor aria-hidden=true href=#-step-9-chat-with-matt-and-accept-the-job-offer>#</a></h2><p>The chat with Matt finally happened on May 2. As is often the case, it took a few hours and covered my background, the trial process, and some other general questions. I asked him about my long wait for the final chat, and he apologised for me being an outlier, as most chats happen within two weeks of a candidate being passed over to him. As the chat was about to conclude, we got to the topic of salary negotiation (which went well), and then the process was finally over! Within a few hours of the chat I was sent an offer letter and an employment contract. As Automattic has an entity in Australia (called Ausomattic), it&rsquo;s a fairly standard contract. I signed the contract and started work the following week – over a year and a half after my initial application. 
Even before I started working, I booked tickets to <a href=https://data.blog/2017/06/29/data-coalesce-automattic-data-division-meets-in-montreal/ target=_blank rel=noopener>meet the data division in Montréal</a> – a fairly swift transition from the long wait for the final interview.</p><h2 id=-step-10-start-working-and-choose-a-job-title>🎉 Step 10: Start working and choose a job title<a hidden class=anchor aria-hidden=true href=#-step-10-start-working-and-choose-a-job-title>#</a></h2><p>As noted above, Automatticians get to choose their own job titles, so to become a data scientist with Automattic, I had to set my job title to Data Scientist. This is generally how many people become data scientists these days, even outside Automattic. However, job titles don&rsquo;t matter as much as job satisfaction. And after 2.5 months with Automattic, I&rsquo;m very satisfied with my decision to join the company. My first three weeks were spent doing customer support, like all new Automattic employees. Since then, I&rsquo;ve been involved in projects to make engagement measurement more consistent (harder than it sounds, as <a href=http://daynebatten.com/2016/06/counting-hard-data-science/ target=_blank rel=noopener>counting things is hard</a>), and to improve the data science codebase (e.g., moving away from <a href=http://powerfulpython.com/blog/magic-word-legacy-python/ target=_blank rel=noopener>Legacy Python</a>). Besides that, I also went to Montréal for the data division meetup, and have started getting into <a href=https://data.blog/2017/05/24/may-the-bot-be-with-you-how-algorithms-are-supporting-happiness-at-wordpress-com/ target=_blank rel=noopener>chatbot work</a>. 
I&rsquo;m looking forward to doing more work and sharing my experience here and on <a href=https://data.blog/ target=_blank rel=noopener>data.blog</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/elasticsearch/>Elasticsearch</a></li><li><a href=https://yanirseroussi.com/tags/personal/>personal</a></li><li><a href=https://yanirseroussi.com/tags/wordpress/>WordPress</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on x" href="https://x.com/intent/tweet/?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cElasticsearch%2cpersonal%2cWordPress"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&amp;title=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;summary=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;source=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f&title=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 
265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 
39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on whatsapp" href="https://api.whatsapp.com/send?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic%20-%20https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 
22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on telegram" href="https://telegram.me/share/url?text=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My 10-step path to becoming a remote data scientist with Automattic on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%2010-step%20path%20to%20becoming%20a%20remote%20data%20scientist%20with%20Automattic&u=https%3a%2f%2fyanirseroussi.com%2f2017%2f07%2f29%2fmy-10-step-path-to-becoming-a-remote-data-scientist-with-automattic%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 
62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-1698><div class=comment-header><a href=#comment-1698><img class=comment-avatar src="https://www.gravatar.com/avatar/42214b1b56b1978983775143bd01a434?s=50"><p class=comment-info><strong>Jonathan Wood</strong><br><small>2017-07-29 18:26:09</small></p></a></div><div class="comment-body post-content">Very enlightening post! It was very awesome to see that the insights you saw to Elasticsearch went to a PR. I bet that was worth the whole thing!</div></div><div class=comment-level-0 id=comment-1700><div class=comment-header><a href=#comment-1700><img class=comment-avatar src="https://www.gravatar.com/avatar/e76f462de89e67d4874d555b8cceb936?s=50"><p class=comment-info><strong>Mostafa</strong><br><small>2017-07-30 09:33:44</small></p></a></div><div class="comment-body post-content">That&rsquo;s very exciting, I wanted to ask are you a self learner or do you have a degree,can you please share your background.
 Thank you</div></div><div class=comment-level-1 id=comment-1705><div class=comment-header><a href=#comment-1705><img class=comment-avatar src="https://www.gravatar.com/avatar/dda019c47a6183120608a6aeac2db6c5?s=50"><p class=comment-info><strong>Yanir Seroussi</strong><br><small>2017-07-31 01:37:04</small></p></a></div><div class="comment-body post-content">Thanks Mostafa. Yes, I have a BSc in computer science, and a PhD in what you would now call data science. See: <a href=https://www.linkedin.com/in/yanirseroussi/ target=_blank rel=noopener>https://www.linkedin.com/in/yanirseroussi/</a></div></div><div class=comment-level-0 id=comment-1965><div class=comment-header><a href=#comment-1965><img class=comment-avatar src="https://www.gravatar.com/avatar/5f1a3858a1d36ac1a2d19f194c6308ae?s=50"><p class=comment-info><strong>Pravin Singh</strong><br><small>2017-11-23 08:51:50</small></p></a></div><div class="comment-body post-content"><p>This was an amazing post, Yanir! Loved the breakdown and the patience you had for the whole process, very well played and you really deserved it! :)</p><p>P.S: Really can connect as I&rsquo;ve been working independently for a while now and would definitely be open to looking for long-term contracts or remote jobs like this.</p></div></div><div class=comment-level-0 id=comment-3121><div class=comment-header><a href=#comment-3121><img class=comment-avatar src="https://www.gravatar.com/avatar/0bf0a218fbad4f6450ecc66c2d91a714?s=50"><p class=comment-info><strong>Baker</strong><br><small>2018-12-23 21:57:33</small></p></a></div><div class="comment-body post-content">Your post is really a therapy to most people who apply for jobs and loose hope of waiting. I believe patience is a key to everything. 
Thqnks</div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
diff --git a/2017/09/02/state-of-bandcamp-recommender/index.html b/2017/09/02/state-of-bandcamp-recommender/index.html
index 0dbb5b7ae..23243dc1a 100644
--- a/2017/09/02/state-of-bandcamp-recommender/index.html
+++ b/2017/09/02/state-of-bandcamp-recommender/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>State of Bandcamp Recommender, Late 2017 | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Bandcamp,BCRecommender"><meta name=description content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="State of Bandcamp Recommender, Late 2017"><meta property="og:description" content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"><meta property="og:image" content="https://yanirseroussi.com/bcrecommender-homepage.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-09-02T10:19:02+00:00"><meta property="article:modified_time" content="2023-07-07T17:36:55+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bcrecommender-homepage.jpg"><meta name=twitter:title content="State of Bandcamp Recommender, Late 2017"><meta name=twitter:description content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"State of Bandcamp Recommender, Late 2017","item":"https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"State of Bandcamp Recommender, Late 2017","name":"State of Bandcamp Recommender, Late 2017","description":"Call for BCRecommender maintainers followed by a decision to shut it down, as I don\u0026rsquo;t have enough time and Bandcamp now offers recommendations.","keywords":["Bandcamp","BCRecommender"],"articleBody":"November 2017: Update and goodbye I’ve decided to shut down Bandcamp Recommender (BCRecommender), despite hearing back from a few volunteers. The main reasons are:\nBandcamp now shows album recommendations at the bottom of album pages. While this isn’t quite the same as BCRecommender, I hope that it will evolve to a more comprehensive recommender system. 
I tried to contact Bandcamp to get their support for the continued running of BCRecommender. I have not heard back from them. It would have been nice to receive some acknowledgement that they find BCRecommender useful. As discussed below, I don’t have much time to spend on the project, and handing it off to other maintainers would have been time-consuming. Given reasons 1 and 2, I don’t feel like it’s worth the effort. Thanks to everyone who’s contacted me – you’re awesome! September 2017: Original announcement I released the first version of Bandcamp Recommender (BCRecommender) about three years ago, with the main goal of surfacing music recommendations from Bandcamp. A secondary goal was learning more about building and marketing a standalone web app. As such, I shared a few posts about BCRecommender over the years:\nInitial posts on the motivation behind building BCRecommender, original system layout, and the recommendation engine. Marketing-oriented posts on applying the Traction Book’s framework to BCRecommender, followed by an update on traction successes and failures, and another post on finding some SEO success. Later architectural changes, including moving away from Parse.com and migrating from MongoDB to Elasticsearch. The last of the above posts was published in November 2015 – almost two years ago. Most of the work on BCRecommender was done up to that point, when my main focus was on part-time contracting while working on my own projects. However, since January 2016 I’ve mostly been working full-time, so I haven’t had the time to give enough attention to the project. Therefore, it looks like it’s time for me to say goodbye to BCRecommender.\nDespite the lack of attention, about 5,000 people still visit BCRecommender every month (down from a peak of around 9,000). I know that people find it useful, even though it hasn’t been functionally updated in a long time (though the recommendations have been refreshed a few times). 
In an ideal world, BCRecommender would be replaced by algorithmic recommendations from Bandcamp. But unfortunately, Bandcamp still doesn’t offer personalised recommendations. This is a shame, because such recommendations could be of great benefit to both artists and fans. Millions of tracks and albums have been published on Bandcamp, meaning that serving personalised recommendations that cover their full catalogue can only be achieved using algorithms. However, it seems like they’re not interested in building this kind of functionality.\nRather than simply pulling the plug on BCRecommender, I thought I’d put a call out to see if anyone is interested in maintaining it. I’m happy to open source the code and hand the project over to someone else if it means it would be in good hands. With a little bit of work, BCRecommender can be turned into a full Bandcamp-based personalised radio station. If you think you’d be a good fit for maintaining the project, drop me a line and we can discuss further. If you just love BCRecommender, you can also let Bandcamp know that you want them to implement algorithmic recommendations (e.g., on Twitter or by emailing support@bandcamp.com). I’ll keep BCRecommender alive for about two more months and see if I get any responses. 
Either way, I’ll be saying goodbye to maintaining it before the end of the year.\n","wordCount":"590","inLanguage":"en","image":"https://yanirseroussi.com/bcrecommender-homepage.jpg","datePublished":"2017-09-02T10:19:02Z","dateModified":"2023-07-07T17:36:55+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">State of Bandcamp Recommender, Late 2017</h1><div class=post-meta><span title='2017-09-02 10:19:02 +0000 UTC'>September 2, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-09-02-state-of-bandcamp-recommender/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg 3738w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg alt width=3738 height=1127></figure><div class=post-content><h2 
id=november-2017-update-and-goodbye>November 2017: Update and goodbye<a hidden class=anchor aria-hidden=true href=#november-2017-update-and-goodbye>#</a></h2><p>I&rsquo;ve decided to shut down Bandcamp Recommender (BCRecommender), despite hearing back from a few volunteers. The main reasons are:</p><ol><li>Bandcamp now shows album recommendations at the bottom of album pages. While this isn&rsquo;t quite the same as BCRecommender, I hope that it will evolve to a more comprehensive recommender system.</li><li>I tried to contact Bandcamp to get their support for the continued running of BCRecommender. I have not heard back from them. It would have been nice to receive some acknowledgement that they find BCRecommender useful.</li><li>As discussed below, I don&rsquo;t have much time to spend on the project, and handing it off to other maintainers would have been time-consuming. Given reasons 1 and 2, I don&rsquo;t feel like it&rsquo;s worth the effort. Thanks to everyone who&rsquo;s contacted me – you&rsquo;re awesome!</li></ol><h2 id=september-2017-original-announcement>September 2017: Original announcement<a hidden class=anchor aria-hidden=true href=#september-2017-original-announcement>#</a></h2><p>I <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>released the first version of Bandcamp Recommender (BCRecommender) about three years ago</a>, with the main goal of surfacing music recommendations from <a href=https://bandcamp.com/ target=_blank rel=noopener>Bandcamp</a>. A secondary goal was learning more about building and marketing a standalone web app. 
As such, I shared a few posts about BCRecommender over the years:</p><ul><li>Initial posts on <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the motivation behind building BCRecommender</a>, <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>original system layout</a>, and <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>the recommendation engine</a>.</li><li>Marketing-oriented posts on <a href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/>applying the Traction Book&rsquo;s framework to BCRecommender</a>, followed by <a href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/>an update on traction successes and failures</a>, and another post on <a href=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/>finding some SEO success</a>.</li><li>Later architectural changes, including <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>moving away from Parse.com</a> and <a href=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/>migrating from MongoDB to Elasticsearch</a>.</li></ul><p>The last of the above posts was published in November 2015 – almost two years ago. Most of the work on BCRecommender was done up to that point, when my main focus was on part-time contracting while working on my own projects. However, since January 2016 I&rsquo;ve mostly been working full-time, so I haven&rsquo;t had the time to give enough attention to the project. Therefore, it looks like it&rsquo;s time for me to say goodbye to BCRecommender.</p><p>Despite the lack of attention, about 5,000 people still visit BCRecommender every month (down from a peak of around 9,000). 
I know that people find it useful, even though it hasn&rsquo;t been functionally updated in a long time (though the recommendations have been refreshed a few times). In an ideal world, BCRecommender would be replaced by algorithmic recommendations from Bandcamp. But unfortunately, Bandcamp still doesn&rsquo;t offer personalised recommendations. This is a shame, because such recommendations could be of great benefit to both artists and fans. Millions of tracks and albums have been published on Bandcamp, meaning that serving personalised recommendations that cover their full catalogue can only be achieved using algorithms. However, it seems like they&rsquo;re not interested in building this kind of functionality.</p><p>Rather than simply pulling the plug on BCRecommender, I thought I&rsquo;d put a call out to see if anyone is interested in maintaining it. I&rsquo;m happy to open source the code and hand the project over to someone else if it means it would be in good hands. With a little bit of work, BCRecommender can be turned into a full Bandcamp-based personalised radio station. If you think you&rsquo;d be a good fit for maintaining the project, <a href=https://yanirseroussi.com/about/>drop me a line</a> and we can discuss further. If you just love BCRecommender, you can also let Bandcamp know that you want them to implement algorithmic recommendations (e.g., on <a href=https://twitter.com/bandcamp target=_blank rel=noopener>Twitter</a> or by emailing <a href=mailto:support@bandcamp.com>support@bandcamp.com</a>). I&rsquo;ll keep BCRecommender alive for about two more months and see if I get any responses. 
Either way, I&rsquo;ll be saying goodbye to maintaining it before the end of the year.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on x" href="https://x.com/intent/tweet/?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&amp;hashtags=Bandcamp%2cBCRecommender"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&amp;title=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;summary=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;source=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 
37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&title=State%20of%20Bandcamp%20Recommender%2c%20Late%202017"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 
10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on whatsapp" href="https://api.whatsapp.com/send?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017%20-%20https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on telegram" href="https://telegram.me/share/url?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 
8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on ycombinator" href="https://news.ycombinator.com/submitlink?t=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&u=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Bandcamp,BCRecommender"><meta name=description content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="State of Bandcamp Recommender, Late 2017"><meta property="og:description" content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"><meta property="og:image" content="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-09-02T10:19:02+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg"><meta name=twitter:title content="State of Bandcamp Recommender, Late 2017"><meta name=twitter:description content="Call for BCRecommender maintainers followed by a decision to shut it down, as I don&rsquo;t have enough time and Bandcamp now offers recommendations."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"State of Bandcamp Recommender, Late 2017","item":"https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"State of Bandcamp Recommender, Late 2017","name":"State of Bandcamp Recommender, Late 2017","description":"Call for BCRecommender maintainers followed by a decision to shut it down, as I don\u0026rsquo;t have enough time and Bandcamp now offers recommendations.","keywords":["Bandcamp","BCRecommender"],"articleBody":"November 2017: Update and goodbye I’ve decided to shut down Bandcamp Recommender (BCRecommender), despite hearing back from a few volunteers. The main reasons are:\nBandcamp now shows album recommendations at the bottom of album pages. 
While this isn’t quite the same as BCRecommender, I hope that it will evolve to a more comprehensive recommender system. I tried to contact Bandcamp to get their support for the continued running of BCRecommender. I have not heard back from them. It would have been nice to receive some acknowledgement that they find BCRecommender useful. As discussed below, I don’t have much time to spend on the project, and handing it off to other maintainers would have been time-consuming. Given reasons 1 and 2, I don’t feel like it’s worth the effort. Thanks to everyone who’s contacted me – you’re awesome! September 2017: Original announcement I released the first version of Bandcamp Recommender (BCRecommender) about three years ago, with the main goal of surfacing music recommendations from Bandcamp. A secondary goal was learning more about building and marketing a standalone web app. As such, I shared a few posts about BCRecommender over the years:\nInitial posts on the motivation behind building BCRecommender, original system layout, and the recommendation engine. Marketing-oriented posts on applying the Traction Book’s framework to BCRecommender, followed by an update on traction successes and failures, and another post on finding some SEO success. Later architectural changes, including moving away from Parse.com and migrating from MongoDB to Elasticsearch. The last of the above posts was published in November 2015 – almost two years ago. Most of the work on BCRecommender was done up to that point, when my main focus was on part-time contracting while working on my own projects. However, since January 2016 I’ve mostly been working full-time, so I haven’t had the time to give enough attention to the project. Therefore, it looks like it’s time for me to say goodbye to BCRecommender.\nDespite the lack of attention, about 5,000 people still visit BCRecommender every month (down from a peak of around 9,000). 
I know that people find it useful, even though it hasn’t been functionally updated in a long time (though the recommendations have been refreshed a few times). In an ideal world, BCRecommender would be replaced by algorithmic recommendations from Bandcamp. But unfortunately, Bandcamp still doesn’t offer personalised recommendations. This is a shame, because such recommendations could be of great benefit to both artists and fans. Millions of tracks and albums have been published on Bandcamp, meaning that serving personalised recommendations that cover their full catalogue can only be achieved using algorithms. However, it seems like they’re not interested in building this kind of functionality.\nRather than simply pulling the plug on BCRecommender, I thought I’d put a call out to see if anyone is interested in maintaining it. I’m happy to open source the code and hand the project over to someone else if it means it would be in good hands. With a little bit of work, BCRecommender can be turned into a full Bandcamp-based personalised radio station. If you think you’d be a good fit for maintaining the project, drop me a line and we can discuss further. If you just love BCRecommender, you can also let Bandcamp know that you want them to implement algorithmic recommendations (e.g., on Twitter or by emailing support@bandcamp.com). I’ll keep BCRecommender alive for about two more months and see if I get any responses. 
Either way, I’ll be saying goodbye to maintaining it before the end of the year.\n","wordCount":"590","inLanguage":"en","image":"https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg","datePublished":"2017-09-02T10:19:02Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line 
x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">State of Bandcamp Recommender, Late 2017</h1><div class=post-meta><span title='2017-09-02 10:19:02 +0000 UTC'>September 2, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-09-02-state-of-bandcamp-recommender/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage_hu4bb4a83eb29302b814ecd8f57b6ac5b4_276563_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg 3738w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/bcrecommender-homepage.jpg alt width=3738 height=1127></figure><div class=post-content><h2 
id=november-2017-update-and-goodbye>November 2017: Update and goodbye<a hidden class=anchor aria-hidden=true href=#november-2017-update-and-goodbye>#</a></h2><p>I&rsquo;ve decided to shut down Bandcamp Recommender (BCRecommender), despite hearing back from a few volunteers. The main reasons are:</p><ol><li>Bandcamp now shows album recommendations at the bottom of album pages. While this isn&rsquo;t quite the same as BCRecommender, I hope that it will evolve to a more comprehensive recommender system.</li><li>I tried to contact Bandcamp to get their support for the continued running of BCRecommender. I have not heard back from them. It would have been nice to receive some acknowledgement that they find BCRecommender useful.</li><li>As discussed below, I don&rsquo;t have much time to spend on the project, and handing it off to other maintainers would have been time-consuming. Given reasons 1 and 2, I don&rsquo;t feel like it&rsquo;s worth the effort. Thanks to everyone who&rsquo;s contacted me – you&rsquo;re awesome!</li></ol><h2 id=september-2017-original-announcement>September 2017: Original announcement<a hidden class=anchor aria-hidden=true href=#september-2017-original-announcement>#</a></h2><p>I <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>released the first version of Bandcamp Recommender (BCRecommender) about three years ago</a>, with the main goal of surfacing music recommendations from <a href=https://bandcamp.com/ target=_blank rel=noopener>Bandcamp</a>. A secondary goal was learning more about building and marketing a standalone web app. 
As such, I shared a few posts about BCRecommender over the years:</p><ul><li>Initial posts on <a href=https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/>the motivation behind building BCRecommender</a>, <a href=https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/>original system layout</a>, and <a href=https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/>the recommendation engine</a>.</li><li>Marketing-oriented posts on <a href=https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/>applying the Traction Book&rsquo;s framework to BCRecommender</a>, followed by <a href=https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/>an update on traction successes and failures</a>, and another post on <a href=https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/>finding some SEO success</a>.</li><li>Later architectural changes, including <a href=https://yanirseroussi.com/2015/07/31/goodbye-parse-com/>moving away from Parse.com</a> and <a href=https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/>migrating from MongoDB to Elasticsearch</a>.</li></ul><p>The last of the above posts was published in November 2015 – almost two years ago. Most of the work on BCRecommender was done up to that point, when my main focus was on part-time contracting while working on my own projects. However, since January 2016 I&rsquo;ve mostly been working full-time, so I haven&rsquo;t had the time to give enough attention to the project. Therefore, it looks like it&rsquo;s time for me to say goodbye to BCRecommender.</p><p>Despite the lack of attention, about 5,000 people still visit BCRecommender every month (down from a peak of around 9,000). 
I know that people find it useful, even though it hasn&rsquo;t been functionally updated in a long time (though the recommendations have been refreshed a few times). In an ideal world, BCRecommender would be replaced by algorithmic recommendations from Bandcamp. But unfortunately, Bandcamp still doesn&rsquo;t offer personalised recommendations. This is a shame, because such recommendations could be of great benefit to both artists and fans. Millions of tracks and albums have been published on Bandcamp, meaning that serving personalised recommendations that cover their full catalogue can only be achieved using algorithms. However, it seems like they&rsquo;re not interested in building this kind of functionality.</p><p>Rather than simply pulling the plug on BCRecommender, I thought I&rsquo;d put a call out to see if anyone is interested in maintaining it. I&rsquo;m happy to open source the code and hand the project over to someone else if it means it would be in good hands. With a little bit of work, BCRecommender can be turned into a full Bandcamp-based personalised radio station. If you think you&rsquo;d be a good fit for maintaining the project, <a href=https://yanirseroussi.com/about/>drop me a line</a> and we can discuss further. If you just love BCRecommender, you can also let Bandcamp know that you want them to implement algorithmic recommendations (e.g., on <a href=https://twitter.com/bandcamp target=_blank rel=noopener>Twitter</a> or by emailing <a href=mailto:support@bandcamp.com>support@bandcamp.com</a>). I&rsquo;ll keep BCRecommender alive for about two more months and see if I get any responses. 
Either way, I&rsquo;ll be saying goodbye to maintaining it before the end of the year.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/bandcamp/>Bandcamp</a></li><li><a href=https://yanirseroussi.com/tags/bcrecommender/>BCRecommender</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on x" href="https://x.com/intent/tweet/?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&amp;hashtags=Bandcamp%2cBCRecommender"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&amp;title=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;summary=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;source=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 
37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f&title=State%20of%20Bandcamp%20Recommender%2c%20Late%202017"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 
10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on whatsapp" href="https://api.whatsapp.com/send?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017%20-%20https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 
403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on telegram" href="https://telegram.me/share/url?text=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 
8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share State of Bandcamp Recommender, Late 2017 on ycombinator" href="https://news.ycombinator.com/submitlink?t=State%20of%20Bandcamp%20Recommender%2c%20Late%202017&u=https%3a%2f%2fyanirseroussi.com%2f2017%2f09%2f02%2fstate-of-bandcamp-recommender%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-1880><div class=comment-header><a href=#comment-1880><img class=comment-avatar src="https://www.gravatar.com/avatar/5a363ddb605d33dd6c30d9aca7fdde59?s=50"><p class=comment-info><strong>Shanky</strong><br><small>2017-10-15 20:08:20</small></p></a></div><div class="comment-body post-content">Cool&mldr;not sure why and when i subscribed to your mailing list. and now quite surprised to hear that Bandcamp Recommender was your project.
 i am bandcamp freak&mldr; Bandcamp has recently strarted recommendations at the bottom ;-) seems primitive though. example <a href=https://ogreyouasshole.bandcamp.com/album/crossword-lost-sigh-days-james-mcnew-remixes target=_blank rel=noopener>https://ogreyouasshole.bandcamp.com/album/crossword-lost-sigh-days-james-mcnew-remixes</a>
 would love to hear about the basic logic you used behind the &ldquo;recommendations&rdquo; . I have no technical knowledge at all but a few years ago thought of a basic recommendation model ..but couldnt take it forward though&mldr;i thought &lsquo;contextualizing&rsquo; artists would be a cool way to connect bands.
diff --git a/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html b/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html
index 26e0f7f42..2029b15d2 100644
--- a/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html
+++ b/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Advice for aspiring data scientists and other FAQs | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="career,data business,data science,frequently asked questions"><meta name=description content="Frequently asked questions by visitors to this site, especially around entering the data science field."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Advice for aspiring data scientists and other FAQs"><meta property="og:description" content="Frequently asked questions by visitors to this site, especially around entering the data science field."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"><meta property="og:image" content="https://yanirseroussi.com/gold-coast-surfers.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-10-15T09:15:25+00:00"><meta property="article:modified_time" content="2023-07-10T16:35:18+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/gold-coast-surfers.jpg"><meta name=twitter:title content="Advice for aspiring data scientists and other FAQs"><meta name=twitter:description content="Frequently asked questions by visitors to this site, especially around entering the data science field."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Advice for aspiring data scientists and other FAQs","item":"https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Advice for aspiring data scientists and other FAQs","name":"Advice for aspiring data scientists and other FAQs","description":"Frequently asked questions by visitors to this site, especially around entering the data science field.","keywords":["career","data business","data science","frequently asked questions"],"articleBody":"Aspiring data scientists and other visitors to this site often repeat the same questions. This post is the definitive collection of my answers to such questions (which may evolve over time).\nHow do I become a data scientist?\nIt depends on your situation. Before we get into it, have you thought about why you want to become a data scientist? Hmm… Not really. 
Why should I become a data scientist?\nI can't answer this for you, but it's great to see you asking why. Do you know what data science is? Do you understand what data scientists do? Sort of. Just so we’re on the same page, what is data science?\nNo one knows for sure. Here are my thoughts from 2014 on defining data science as the intersection of software engineering and statistics, and a more recent post on defining data science in 2018. What are the hardest parts of data science?\nThe hardest parts of data science are problem definition and solution measurement, not model fitting and data cleaning, because counting things is hard. Thanks, that’s helpful. But what do data scientists actually do?\nIt varies a lot. This variability makes the job title somewhat useless. You should try to get an idea what areas of data science interest you. For many people, excitement over the technical aspects wanes with time. And even if you still find the technical aspects exciting, most jobs have boring parts. When considering career changes, think of the non-technical aspects that would keep you engaged. To answer the question, here are some posts on things I've done: Joined Automattic by improving the Elasticsearch language detection plugin, calculated customer lifetime value, analysed A/B test results, built recommender systems (including one for Bandcamp music), competed on Kaggle, and completed a PhD. I've also dabbled in deep learning, marine surveys, causality, and other things that I haven't had the chance to write about. Cool! Can you provide a general overview of how to become a data scientist?\nYes! Check out Alec Smith's excellent articles. I’m pretty happy with my current job, but still thinking of becoming a data scientist. What should I do?\nFind ways of doing data science within your current role, working overtime if needed. 
Working on a real problem in a familiar domain is much more valuable than working on toy problems from online courses and platforms like Kaggle (though they're also useful). If you're a data analyst, learn how to program to automate and simplify your analyses. If you're a software engineer, become comfortable with analysing and modelling data. Machine learning doesn't have to be a part of what you choose to do. I’m pretty busy. What online course should I take to learn about the area?\nCalling Bullshit: Data Reasoning for the Digital Age is a good place to start. Deep learning should be pretty low on your list if you don't have much background in the area. Should I learn Python or R? Keras or Tensorflow? What about ?\nIt doesn't matter. Focus on principles and you'll be fine. The following quote still applies today (to people of all genders). As to methods, there may be a million and then some, but principles are few. The man who grasps principles can successfully select his own methods. The man who tries methods, ignoring principles, is sure to have trouble.\nHarrington Emerson (1911) I want to become a data science freelancer. Can you provide some advice?\nAs with any freelancing job, expect to spend much of your time on sales and networking. I've only explored the freelancing path briefly, but Radim Řehůřek has published great slides on the topic. If you're thinking of freelancing as a way of gaining financial independence, also consider spending less, earning more, and investing wisely. Can you recommend an academic data science degree?\nSorry, but I don't know much about those degrees. Boris Gorelik has some interesting thoughts on studying data science. Will you be my mentor?\nProbably not, unless you're hard-working, independent, and doing something I find interesting. Feel free to contact me if you believe we'd both find the relationship beneficial. Can you help with my project?\nPossibly. If you think I'd find your project exciting, please do contact me. 
What about ethics?\nWhat about them? There isn't a single definition of right and wrong, as morality is multi-dimensional. I believe it's important to question your own choices, and avoid applying data science blindly. For me, this means divesting from harmful industries like fossil fuels and striving to go beyond the creation of greedy robots (among other things). I’m a manager. When should I hire a data scientist and start using machine learning?\nThere's a good chance you don't need a data scientist yet, but you should be aware of common pitfalls when trying to be data-driven. It's also worth reading Paras Chopra's post on what you need to know before you board the machine learning train. Do you want to buy my products or services?\nNo. If I did, I'd contact you. I have a question that isn’t answered here or anywhere on the internet, and I think you can help. Can I contact you?\nSure, use the form on this page. ","wordCount":"870","inLanguage":"en","image":"https://yanirseroussi.com/gold-coast-surfers.jpg","datePublished":"2017-10-15T09:15:25Z","dateModified":"2023-07-10T16:35:18+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button 
id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Advice for aspiring data scientists and other FAQs</h1><div class=post-meta><span title='2017-10-15 09:15:25 +0000 UTC'>October 15, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-10-15-advice-for-aspiring-data-scientists-and-other-faqs/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg 4000w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg alt width=4000 height=1620></figure><div class=post-content><p>Aspiring data scientists and other visitors to this site often repeat the same questions. This post is the definitive collection of my answers to such questions (which may evolve over time).</p><p><b id=how-do-i-become-a-data-scientist>How do I become a data scientist?</b></p><p class=indent-1>It depends on your situation. Before we get into it, have you thought about why you want to become a data scientist?</p><p><b id=why-should-i-become-a-data-scientist>Hmm&mldr; Not really. Why should I become a data scientist?</b></p><p class=indent-1>I can't answer this for you, but it's great to see you <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>asking why</a>. Do you know what data science is? Do you understand what data scientists do?</p><p><b id=what-is-data-science>Sort of. 
Just so we&rsquo;re on the same page, what is data science?</b></p><p class=indent-1>No one knows for sure. Here are <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>my thoughts from 2014 on defining data science as the intersection of software engineering and statistics</a>, and a more recent post on <a href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/>defining data science in 2018</a>.</p><p><b id=hardest-parts-of-data-science>What are the hardest parts of data science?</b></p><p class=indent-1>The hardest parts of data science are <a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>problem definition and solution measurement, not model fitting and data cleaning</a>, because <a href=http://daynebatten.com/2016/06/counting-hard-data-science/>counting things is hard</a>.</p><p><b id=what-do-data-scientists-do>Thanks, that&rsquo;s helpful. But what do data scientists actually do?</b></p><p class=indent-1>It varies a lot. This variability makes the job title <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>somewhat useless</a>. You should try to get an idea what areas of data science interest you. For many people, excitement over the technical aspects wanes with time. And even if you still find the technical aspects exciting, most jobs have boring parts. 
When considering career changes, think of the non-technical aspects that would keep you engaged.</p><p class=indent-1>To answer the question, here are some posts on things I've done: <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>Joined Automattic by improving the Elasticsearch language detection plugin</a>, <a href=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/>calculated customer lifetime value</a>, <a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>analysed A/B test results</a>, <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>built recommender systems</a> (including <a href=https://yanirseroussi.com/state-of-bandcamp-recommender-september-2017/>one for Bandcamp music</a>), <a href=https://yanirseroussi.com/2014/04/05/kaggle-competition-summaries/>competed on Kaggle</a>, and <a href=https://yanirseroussi.wordpress.com/phd-work/>completed a PhD</a>. I've also dabbled in <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>deep learning</a>, <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>marine surveys</a>, <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>causality</a>, and other things that I haven't had the chance to write about.</p><p><b id=become-a-data-scientist-overview>Cool! Can you provide a general overview of how to become a data scientist?</b></p><p class=indent-1>Yes! Check out <a href=https://www.experfy.com/blog/how-to-become-a-data-scientist-part-1-3>Alec Smith's excellent articles</a>.</p><p><b id=pivot-into-data-science>I&rsquo;m pretty happy with my current job, but still thinking of becoming a data scientist. 
What should I do?</b></p><p class=indent-1>Find ways of doing data science within your current role, working overtime if needed. Working on a real problem in a familiar domain is much more valuable than working on toy problems from online courses and platforms like Kaggle (though they're also useful). If you're a data analyst, learn how to program to automate and simplify your analyses. If you're a software engineer, become comfortable with analysing and modelling data. <a href=https://brohrer.github.io/imposter_syndrome.html>Machine learning doesn't have to be a part of what you choose to do</a>.</p><p><b id=online-course-recommendation>I&rsquo;m pretty busy. What online course should I take to learn about the area?</b></p><p class=indent-1><a href=http://callingbullshit.org/>Calling Bullshit: Data Reasoning for the Digital Age</a> is a good place to start. <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>Deep learning should be pretty low on your list</a> if you don't have much background in the area.</p><p><b id=tool-recommendation>Should I learn Python or R? Keras or Tensorflow? What about <code>&lt;insert name here></code>?</b></p><p class=indent-1>It doesn't matter. Focus on principles and you'll be fine. The following quote still applies today (to people of all genders).</p><blockquote><p>As to methods, there may be a million and then some, but principles are few. The man who grasps principles can successfully select his own methods. The man who tries methods, ignoring principles, is sure to have trouble.</p><footer><strong></strong>
+<meta name=keywords content="career,data business,data science,frequently asked questions"><meta name=description content="Frequently asked questions by visitors to this site, especially around entering the data science field."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Advice for aspiring data scientists and other FAQs"><meta property="og:description" content="Frequently asked questions by visitors to this site, especially around entering the data science field."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"><meta property="og:image" content="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2017-10-15T09:15:25+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg"><meta name=twitter:title content="Advice for aspiring data scientists and other FAQs"><meta name=twitter:description content="Frequently asked questions by visitors to this site, especially around entering the data science field."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Advice for aspiring data scientists and other FAQs","item":"https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Advice for aspiring data scientists and other FAQs","name":"Advice for aspiring data scientists and other FAQs","description":"Frequently asked questions by visitors to this site, especially around entering the data science field.","keywords":["career","data business","data science","frequently asked questions"],"articleBody":"Aspiring data scientists and other visitors to this site often repeat the same questions. This post is the definitive collection of my answers to such questions (which may evolve over time).\nHow do I become a data scientist?\nIt depends on your situation. 
Before we get into it, have you thought about why you want to become a data scientist? Hmm… Not really. Why should I become a data scientist?\nI can't answer this for you, but it's great to see you asking why. Do you know what data science is? Do you understand what data scientists do? Sort of. Just so we’re on the same page, what is data science?\nNo one knows for sure. Here are my thoughts from 2014 on defining data science as the intersection of software engineering and statistics, and a more recent post on defining data science in 2018. What are the hardest parts of data science?\nThe hardest parts of data science are problem definition and solution measurement, not model fitting and data cleaning, because counting things is hard. Thanks, that’s helpful. But what do data scientists actually do?\nIt varies a lot. This variability makes the job title somewhat useless. You should try to get an idea what areas of data science interest you. For many people, excitement over the technical aspects wanes with time. And even if you still find the technical aspects exciting, most jobs have boring parts. When considering career changes, think of the non-technical aspects that would keep you engaged. To answer the question, here are some posts on things I've done: Joined Automattic by improving the Elasticsearch language detection plugin, calculated customer lifetime value, analysed A/B test results, built recommender systems (including one for Bandcamp music), competed on Kaggle, and completed a PhD. I've also dabbled in deep learning, marine surveys, causality, and other things that I haven't had the chance to write about. Cool! Can you provide a general overview of how to become a data scientist?\nYes! Check out Alec Smith's excellent articles. I’m pretty happy with my current job, but still thinking of becoming a data scientist. What should I do?\nFind ways of doing data science within your current role, working overtime if needed. 
Working on a real problem in a familiar domain is much more valuable than working on toy problems from online courses and platforms like Kaggle (though they're also useful). If you're a data analyst, learn how to program to automate and simplify your analyses. If you're a software engineer, become comfortable with analysing and modelling data. Machine learning doesn't have to be a part of what you choose to do. I’m pretty busy. What online course should I take to learn about the area?\nCalling Bullshit: Data Reasoning for the Digital Age is a good place to start. Deep learning should be pretty low on your list if you don't have much background in the area. Should I learn Python or R? Keras or Tensorflow? What about ?\nIt doesn't matter. Focus on principles and you'll be fine. The following quote still applies today (to people of all genders). As to methods, there may be a million and then some, but principles are few. The man who grasps principles can successfully select his own methods. The man who tries methods, ignoring principles, is sure to have trouble.\nHarrington Emerson (1911) I want to become a data science freelancer. Can you provide some advice?\nAs with any freelancing job, expect to spend much of your time on sales and networking. I've only explored the freelancing path briefly, but Radim Řehůřek has published great slides on the topic. If you're thinking of freelancing as a way of gaining financial independence, also consider spending less, earning more, and investing wisely. Can you recommend an academic data science degree?\nSorry, but I don't know much about those degrees. Boris Gorelik has some interesting thoughts on studying data science. Will you be my mentor?\nProbably not, unless you're hard-working, independent, and doing something I find interesting. Feel free to contact me if you believe we'd both find the relationship beneficial. Can you help with my project?\nPossibly. If you think I'd find your project exciting, please do contact me. 
What about ethics?\nWhat about them? There isn't a single definition of right and wrong, as morality is multi-dimensional. I believe it's important to question your own choices, and avoid applying data science blindly. For me, this means divesting from harmful industries like fossil fuels and striving to go beyond the creation of greedy robots (among other things). I’m a manager. When should I hire a data scientist and start using machine learning?\nThere's a good chance you don't need a data scientist yet, but you should be aware of common pitfalls when trying to be data-driven. It's also worth reading Paras Chopra's post on what you need to know before you board the machine learning train. Do you want to buy my products or services?\nNo. If I did, I'd contact you. I have a question that isn’t answered here or anywhere on the internet, and I think you can help. Can I contact you?\nSure, use the form on this page. ","wordCount":"870","inLanguage":"en","image":"https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg","datePublished":"2017-10-15T09:15:25Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | 
Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Advice for aspiring data scientists and other FAQs</h1><div class=post-meta><span title='2017-10-15 09:15:25 +0000 UTC'>October 15, 2017</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2017-10-15-advice-for-aspiring-data-scientists-and-other-faqs/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers_hueb610a201bee2910ae39d7006395df9e_608324_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg 4000w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/gold-coast-surfers.jpg alt width=4000 height=1620></figure><div class=post-content><p>Aspiring data scientists and other visitors to this site often repeat the same questions. This post is the definitive collection of my answers to such questions (which may evolve over time).</p><p><b id=how-do-i-become-a-data-scientist>How do I become a data scientist?</b></p><p class=indent-1>It depends on your situation. Before we get into it, have you thought about why you want to become a data scientist?</p><p><b id=why-should-i-become-a-data-scientist>Hmm&mldr; Not really. Why should I become a data scientist?</b></p><p class=indent-1>I can't answer this for you, but it's great to see you <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>asking why</a>. Do you know what data science is? Do you understand what data scientists do?</p><p><b id=what-is-data-science>Sort of. 
Just so we&rsquo;re on the same page, what is data science?</b></p><p class=indent-1>No one knows for sure. Here are <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>my thoughts from 2014 on defining data science as the intersection of software engineering and statistics</a>, and a more recent post on <a href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/>defining data science in 2018</a>.</p><p><b id=hardest-parts-of-data-science>What are the hardest parts of data science?</b></p><p class=indent-1>The hardest parts of data science are <a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>problem definition and solution measurement, not model fitting and data cleaning</a>, because <a href=http://daynebatten.com/2016/06/counting-hard-data-science/>counting things is hard</a>.</p><p><b id=what-do-data-scientists-do>Thanks, that&rsquo;s helpful. But what do data scientists actually do?</b></p><p class=indent-1>It varies a lot. This variability makes the job title <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>somewhat useless</a>. You should try to get an idea what areas of data science interest you. For many people, excitement over the technical aspects wanes with time. And even if you still find the technical aspects exciting, most jobs have boring parts. 
When considering career changes, think of the non-technical aspects that would keep you engaged.</p><p class=indent-1>To answer the question, here are some posts on things I've done: <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>Joined Automattic by improving the Elasticsearch language detection plugin</a>, <a href=https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/>calculated customer lifetime value</a>, <a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>analysed A/B test results</a>, <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>built recommender systems</a> (including <a href=https://yanirseroussi.com/state-of-bandcamp-recommender-september-2017/>one for Bandcamp music</a>), <a href=https://yanirseroussi.com/2014/04/05/kaggle-competition-summaries/>competed on Kaggle</a>, and <a href=https://yanirseroussi.wordpress.com/phd-work/>completed a PhD</a>. I've also dabbled in <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>deep learning</a>, <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>marine surveys</a>, <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>causality</a>, and other things that I haven't had the chance to write about.</p><p><b id=become-a-data-scientist-overview>Cool! Can you provide a general overview of how to become a data scientist?</b></p><p class=indent-1>Yes! Check out <a href=https://www.experfy.com/blog/how-to-become-a-data-scientist-part-1-3>Alec Smith's excellent articles</a>.</p><p><b id=pivot-into-data-science>I&rsquo;m pretty happy with my current job, but still thinking of becoming a data scientist. 
What should I do?</b></p><p class=indent-1>Find ways of doing data science within your current role, working overtime if needed. Working on a real problem in a familiar domain is much more valuable than working on toy problems from online courses and platforms like Kaggle (though they're also useful). If you're a data analyst, learn how to program to automate and simplify your analyses. If you're a software engineer, become comfortable with analysing and modelling data. <a href=https://brohrer.github.io/imposter_syndrome.html>Machine learning doesn't have to be a part of what you choose to do</a>.</p><p><b id=online-course-recommendation>I&rsquo;m pretty busy. What online course should I take to learn about the area?</b></p><p class=indent-1><a href=http://callingbullshit.org/>Calling Bullshit: Data Reasoning for the Digital Age</a> is a good place to start. <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>Deep learning should be pretty low on your list</a> if you don't have much background in the area.</p><p><b id=tool-recommendation>Should I learn Python or R? Keras or Tensorflow? What about <code>&lt;insert name here></code>?</b></p><p class=indent-1>It doesn't matter. Focus on principles and you'll be fine. The following quote still applies today (to people of all genders).</p><blockquote><p>As to methods, there may be a million and then some, but principles are few. The man who grasps principles can successfully select his own methods. The man who tries methods, ignoring principles, is sure to have trouble.</p><footer><strong></strong>
 <cite><a href=https://quoteinvestigator.com/2015/07/17/methods/ title=https://quoteinvestigator.com/2015/07/17/methods/ target=_blank rel=noopener>Harrington Emerson (1911)</a></cite></footer></blockquote><p><b id=become-a-data-science-freelancer>I want to become a data science freelancer. Can you provide some advice?</b></p><p class=indent-1>As with any freelancing job, expect to spend much of your time on sales and networking. I've only <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>explored the freelancing path briefly</a>, but <a href=https://berlinbuzzwords.de/sites/berlinbuzzwords.de/files/media/documents/radim_rehurek-so_you_want_to_be_a_data_science_consultant.pdf>Radim Řehůřek has published great slides on the topic</a>. If you're thinking of freelancing as a way of gaining financial independence, also consider <a href=https://minafi.com/interactive-guide-early-retirement-financial-independence/>spending less, earning more, and investing wisely</a>.</p><p><b id=data-science-degree>Can you recommend an academic data science degree?</b></p><p class=indent-1>Sorry, but I don't know much about those degrees. <a href=https://gorelik.net/2017/05/29/dont-study-data-science/>Boris Gorelik has some interesting thoughts on studying data science</a>.</p><p><b id=be-my-mentor>Will you be my mentor?</b></p><p class=indent-1>Probably not, unless you're hard-working, independent, and doing something I find interesting. Feel free to <a href=https://yanirseroussi.com/about/>contact me</a> if you believe we'd both find the relationship beneficial.</p><p><b id=help-with-my-project>Can you help with my project?</b></p><p class=indent-1>Possibly. If you think I'd find your project exciting, please do <a href=https://yanirseroussi.com/about/>contact me</a>.</p><hr><p><b id=ethics>What about ethics?</b></p><p class=indent-1>What about them? 
There isn't a single definition of right and wrong, as <a href=https://en.wikipedia.org/wiki/The_Righteous_Mind>morality is multi-dimensional</a>. I believe it's important to <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>question your own choices</a>, and <a href=https://www.kdnuggets.com/2015/05/should-data-science-do-that.html>avoid applying data science blindly</a>. For me, this means <a href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/>divesting from harmful industries like fossil fuels</a> and striving to go beyond the creation of <a href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/>greedy robots</a> (among other things).</p><p><b id=data-driven-manager>I&rsquo;m a manager. When should I hire a data scientist and start using machine learning?</b></p><p class=indent-1>There's a good chance <a href=https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/>you don't need a data scientist yet</a>, but you should be aware of <a href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/>common pitfalls when trying to be data-driven</a>. It's also worth reading Paras Chopra's post on <a href=https://growth.wingify.com/what-you-need-to-know-before-you-board-the-machine-learning-train-a81c513098fe>what you need to know before you board the machine learning train</a>.</p><p><b id=spam>Do you want to buy my products or services?</b></p><p class=indent-1>No. If I did, I'd contact you.</p><p><b id=other-questions>I have a question that isn&rsquo;t answered here or anywhere on the internet, and I think you can help. 
Can I contact you?</b></p><p class=indent-1>Sure, <a href=https://yanirseroussi.com/about/>use the form on this page</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-business/>data business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/frequently-asked-questions/>frequently asked questions</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on x" href="https://x.com/intent/tweet/?text=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f&amp;hashtags=career%2cdatabusiness%2cdatascience%2cfrequentlyaskedquestions"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f&amp;title=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs&amp;summary=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs&amp;source=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f"><svg viewBox="0 0 512 512" height="30" 
width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f&title=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 
166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on whatsapp" href="https://api.whatsapp.com/send?text=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs%20-%20https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on telegram" href="https://telegram.me/share/url?text=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs&amp;url=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 
01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Advice for aspiring data scientists and other FAQs on ycombinator" href="https://news.ycombinator.com/submitlink?t=Advice%20for%20aspiring%20data%20scientists%20and%20other%20FAQs&u=https%3a%2f%2fyanirseroussi.com%2f2017%2f10%2f15%2fadvice-for-aspiring-data-scientists-and-other-faqs%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-1872><div class=comment-header><a href=#comment-1872><img class=comment-avatar src="https://www.gravatar.com/avatar/95cb4c667a279f91dc647c3a78dccf96?s=50"><p class=comment-info><strong>Simon</strong><br><small>2017-10-15 13:26:01</small></p></a></div><div class="comment-body post-content">Thanks so much for sharing this Yanir!</div></div><div class=comment-level-0 id=comment-1877><div class=comment-header><a href=#comment-1877><img class=comment-avatar src="https://www.gravatar.com/avatar/3e83196ec5d22b66453107ead83adc58?s=50"><p class=comment-info><strong>Eric Colson</strong><br><small>2017-10-15 16:38:31</small></p></a></div><div class="comment-body post-content"><p>Indeed, such questions seem to be very recurring. Thanks for providing answers to help guide folks. I might add a few things:</p><p>when ready for the job search&mldr; Advice to Data Scientists on Where to Work
 <a href=http://multithreaded.stitchfix.com/blog/2015/03/31/advice-for-data-scientists/ target=_blank rel=noopener>http://multithreaded.stitchfix.com/blog/2015/03/31/advice-for-data-scientists/</a></p><p>if you are going to get into data science, do it for the right reasons. Let your passion drive!
diff --git a/2018/07/22/defining-data-science-in-2018/index.html b/2018/07/22/defining-data-science-in-2018/index.html
index b7dacd83b..7f9c35aaa 100644
--- a/2018/07/22/defining-data-science-in-2018/index.html
+++ b/2018/07/22/defining-data-science-in-2018/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Defining data science in 2018 | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,artificial intelligence,business,data science,machine learning,statistics"><meta name=description content="Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Defining data science in 2018"><meta property="og:description" content="Updating my definition of data science to match changes in the field. 
It is now broader than before, but its ultimate goal is still to support decisions."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"><meta property="og:image" content="https://yanirseroussi.com/what-would-you-say-you-do-here.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-07-22T08:27:43+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/what-would-you-say-you-do-here.jpg"><meta name=twitter:title content="Defining data science in 2018"><meta name=twitter:description content="Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Defining data science in 2018","item":"https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Defining data science in 2018","name":"Defining data science in 2018","description":"Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions.","keywords":["analytics","artificial intelligence","business","data science","machine learning","statistics"],"articleBody":"I got my first data science job in 2012, the year Harvard Business Review announced data scientist to be the sexiest job of the 21st century. 
Two years later, I published a post on my then-favourite definition of data science, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of making data scientist useless as a job title. However, I still call myself a data scientist. Even better – I still get paid for being a data scientist. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.\nIt’s not all about machine learning As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn’t clear to me how to market myself – my LinkedIn title at the time was “software engineer with a research background”, which is a bit of a mouthful. Around that time I heard about Kaggle and decided to try competing. This went pretty well, and exposed me to the data science community globally and in Melbourne, where I was living at the time. That’s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.\nMy understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. 
I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.\nThings have changed considerably since 2012. It is now much easier to deploy machine learning models, even without a deep understanding of how they work. Many more people call themselves data scientists, including some who are more focused on data analysis than on building data products. Even Kaggle – which is now owned by Google – has broadened its scope beyond modelling competitions to support other types of analysis. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition may make data scientist somewhat useless as a job title. However, I believe that data science tasks remain useful, as shown by the following definitions.\nRecent definitions by Hernán, Hawkins, and Dubossarsky In a recent article, Hernán et al. classify data science tasks into three types: description, prediction, and causal inference. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference “right”. Further, they emphasise the importance of domain expert knowledge, which is essential in causal inference. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data analysts rather than scientists. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. 
that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it’s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. For example, it wasn’t so long ago that doctors recommended smoking.\nDespite the importance of domain knowledge, one can argue that scientists that specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled why all scientists are not data scientists:\nThose scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question “given data with these properties, what conclusions can we draw?”\nFinally, according to Eugene Dubossarsky, “there’s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. That should be something no one can argue with.” This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. 
I’m not going to argue with him, but to put it all together, we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions.\nWhat about AI? Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite what some people may try to tell you, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren’t made by humans, as data scientists are involved in building systems that make decisions autonomously.\nThe resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor’s degree in computer science. However, by the time I graduated from my PhD, saying that I’m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I’d be doing exactly the same work). Titles are somewhat silly – I’m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. 
There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.\n","wordCount":"1264","inLanguage":"en","image":"https://yanirseroussi.com/what-would-you-say-you-do-here.jpg","datePublished":"2018-07-22T08:27:43Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" 
x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Defining data science in 2018</h1><div class=post-meta><span title='2018-07-22 08:27:43 +0000 UTC'>July 22, 2018</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-07-22-defining-data-science-in-2018/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg 1278w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg alt width=1278 height=686></figure><div class=post-content><p>I got my first data science job in 2012, the year <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank 
rel=noopener>Harvard Business Review announced data scientist to be the sexiest job of the 21st century</a>. Two years later, I published <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>a post on my then-favourite definition of data science</a>, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>making data scientist useless as a job title</a>. However, I still call myself a data scientist. Even better – <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>I still get paid for being a data scientist</a>. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.</p><h2 id=its-not-all-about-machine-learning>It&rsquo;s not all about machine learning<a hidden class=anchor aria-hidden=true href=#its-not-all-about-machine-learning>#</a></h2><p>As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn&rsquo;t clear to me how to market myself – my LinkedIn title at the time was <em>&ldquo;software engineer with a research background&rdquo;</em>, which is a bit of a mouthful. Around that time I heard about <a href=https://www.kaggle.com/ target=_blank rel=noopener>Kaggle</a> and decided to try competing. <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>This went pretty well</a>, and exposed me to the data science community globally and in Melbourne, where I was living at the time. 
That&rsquo;s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.</p><p>My understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.</p><p>Things have changed considerably since 2012. It is now much easier to deploy machine learning models, <a href="https://www.youtube.com/watch?v=YOIo09qjVl4" target=_blank rel=noopener>even without a deep understanding of how they work</a>. Many more people call themselves data scientists, <a href=https://eng.lyft.com/whats-in-a-name-ce42f419d16c target=_blank rel=noopener>including some who are more focused on data analysis than on building data products</a>. Even Kaggle – which is now owned by Google – <a href="https://www.youtube.com/watch?v=AoRSIdLpFqU" target=_blank rel=noopener>has broadened its scope beyond modelling competitions to support other types of analysis</a>. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>may make data scientist somewhat useless as a job title</a>. 
However, I believe that data science tasks remain useful, as shown by the following definitions.</p><h2 id=recent-definitions-by-hernán-hawkins-and-dubossarsky>Recent definitions by Hernán, Hawkins, and Dubossarsky<a hidden class=anchor aria-hidden=true href=#recent-definitions-by-hernán-hawkins-and-dubossarsky>#</a></h2><p>In a <a href=https://arxiv.org/pdf/1804.10846.pdf target=_blank rel=noopener>recent article</a>, Hernán et al. classify data science tasks into three types: <em>description</em>, <em>prediction</em>, and <em>causal inference</em>. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference &ldquo;right&rdquo;. Further, they emphasise the importance of domain expert knowledge, <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>which is essential in causal inference</a>. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data <em>analysts</em> rather than <em>scientists</em>. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it&rsquo;s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. 
For example, it wasn&rsquo;t so long ago that <a href=https://www.healio.com/hematology-oncology/news/print/hemonc-today/%7B241d62a7-fe6e-4c5b-9fed-a33cc6e4bd7c%7D/cigarettes-were-once-physician-tested-approved target=_blank rel=noopener>doctors recommended smoking</a>.</p><p>Despite the importance of domain knowledge, one can argue that scientists that specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled <em><a href=https://www.linkedin.com/pulse/why-all-scientists-data-john-hawkins target=_blank rel=noopener>why all scientists are not data scientists</a></em>:</p><blockquote><p>Those scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question &ldquo;given data with these properties, what conclusions can we draw?&rdquo;</p></blockquote><p>Finally, <a href=https://www.superdatascience.com/podcast-one-purpose-data-science-truth-analytics/ target=_blank rel=noopener>according to Eugene Dubossarsky</a>, <em>&ldquo;there&rsquo;s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. 
That should be something no one can argue with.&rdquo;</em> This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. I&rsquo;m not going to argue with him, but to put it all together, <strong>we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions</strong>.</p><h2 id=what-about-ai>What about AI?<a hidden class=anchor aria-hidden=true href=#what-about-ai>#</a></h2><p>Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite <a href=https://www.forbes.com/sites/valleyvoices/2017/01/31/the-rise-of-ai-will-force-a-new-breed-of-data-scientist/ target=_blank rel=noopener>what some people may try to tell you</a>, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren&rsquo;t made by humans, as data scientists are involved in building systems that make decisions autonomously.</p><p>The resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor&rsquo;s degree in computer science. However, by the time I graduated from my PhD, saying that I&rsquo;m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I&rsquo;d be doing exactly the same work). 
Titles are somewhat silly – I&rsquo;m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/analytics/>analytics</a></li><li><a href=https://yanirseroussi.com/tags/artificial-intelligence/>artificial intelligence</a></li><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/statistics/>statistics</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on x" href="https://x.com/intent/tweet/?text=Defining%20data%20science%20in%202018&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&amp;hashtags=analytics%2cartificialintelligence%2cbusiness%2cdatascience%2cmachinelearning%2cstatistics"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&amp;title=Defining%20data%20science%20in%202018&amp;summary=Defining%20data%20science%20in%202018&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&title=Defining%20data%20science%20in%202018"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 
1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on whatsapp" 
href="https://api.whatsapp.com/send?text=Defining%20data%20science%20in%202018%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on telegram" 
href="https://telegram.me/share/url?text=Defining%20data%20science%20in%202018&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on ycombinator" href="https://news.ycombinator.com/submitlink?t=Defining%20data%20science%20in%202018&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="analytics,artificial intelligence,business,data science,machine learning,statistics"><meta name=description content="Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Defining data science in 2018"><meta property="og:description" content="Updating my definition of data science to match changes in the field. 
It is now broader than before, but its ultimate goal is still to support decisions."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"><meta property="og:image" content="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-07-22T08:27:43+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg"><meta name=twitter:title content="Defining data science in 2018"><meta name=twitter:description content="Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Defining data science in 2018","item":"https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Defining data science in 2018","name":"Defining data science in 2018","description":"Updating my definition of data science to match changes in the field. It is now broader than before, but its ultimate goal is still to support decisions.","keywords":["analytics","artificial intelligence","business","data science","machine learning","statistics"],"articleBody":"I got my first data science job in 2012, the year Harvard Business Review announced data scientist to be the sexiest job of the 21st century. 
Two years later, I published a post on my then-favourite definition of data science, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of making data scientist useless as a job title. However, I still call myself a data scientist. Even better – I still get paid for being a data scientist. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.\nIt’s not all about machine learning As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn’t clear to me how to market myself – my LinkedIn title at the time was “software engineer with a research background”, which is a bit of a mouthful. Around that time I heard about Kaggle and decided to try competing. This went pretty well, and exposed me to the data science community globally and in Melbourne, where I was living at the time. That’s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.\nMy understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. 
I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.\nThings have changed considerably since 2012. It is now much easier to deploy machine learning models, even without a deep understanding of how they work. Many more people call themselves data scientists, including some who are more focused on data analysis than on building data products. Even Kaggle – which is now owned by Google – has broadened its scope beyond modelling competitions to support other types of analysis. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition may make data scientist somewhat useless as a job title. However, I believe that data science tasks remain useful, as shown by the following definitions.\nRecent definitions by Hernán, Hawkins, and Dubossarsky In a recent article, Hernán et al. classify data science tasks into three types: description, prediction, and causal inference. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference “right”. Further, they emphasise the importance of domain expert knowledge, which is essential in causal inference. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data analysts rather than scientists. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. 
that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it’s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. For example, it wasn’t so long ago that doctors recommended smoking.\nDespite the importance of domain knowledge, one can argue that scientists that specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled why all scientists are not data scientists:\nThose scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question “given data with these properties, what conclusions can we draw?”\nFinally, according to Eugene Dubossarsky, “there’s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. That should be something no one can argue with.” This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. 
I’m not going to argue with him, but to put it all together, we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions.\nWhat about AI? Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite what some people may try to tell you, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren’t made by humans, as data scientists are involved in building systems that make decisions autonomously.\nThe resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor’s degree in computer science. However, by the time I graduated from my PhD, saying that I’m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I’d be doing exactly the same work). Titles are somewhat silly – I’m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. 
There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.\n","wordCount":"1264","inLanguage":"en","image":"https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg","datePublished":"2018-07-22T08:27:43Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Defining data science in 2018</h1><div class=post-meta><span title='2018-07-22 08:27:43 +0000 UTC'>July 22, 2018</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-07-22-defining-data-science-in-2018/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here_hu2e849c7220f0ea4a04e1f6ecb54005af_335268_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg 1278w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/what-would-you-say-you-do-here.jpg alt width=1278 height=686></figure><div class=post-content><p>I got my first data science job in 2012, the year <a 
href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>Harvard Business Review announced data scientist to be the sexiest job of the 21st century</a>. Two years later, I published <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>a post on my then-favourite definition of data science</a>, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>making data scientist useless as a job title</a>. However, I still call myself a data scientist. Even better – <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>I still get paid for being a data scientist</a>. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.</p><h2 id=its-not-all-about-machine-learning>It&rsquo;s not all about machine learning<a hidden class=anchor aria-hidden=true href=#its-not-all-about-machine-learning>#</a></h2><p>As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn&rsquo;t clear to me how to market myself – my LinkedIn title at the time was <em>&ldquo;software engineer with a research background&rdquo;</em>, which is a bit of a mouthful. Around that time I heard about <a href=https://www.kaggle.com/ target=_blank rel=noopener>Kaggle</a> and decided to try competing. <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>This went pretty well</a>, and exposed me to the data science community globally and in Melbourne, where I was living at the time. 
That&rsquo;s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.</p><p>My understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.</p><p>Things have changed considerably since 2012. It is now much easier to deploy machine learning models, <a href="https://www.youtube.com/watch?v=YOIo09qjVl4" target=_blank rel=noopener>even without a deep understanding of how they work</a>. Many more people call themselves data scientists, <a href=https://eng.lyft.com/whats-in-a-name-ce42f419d16c target=_blank rel=noopener>including some who are more focused on data analysis than on building data products</a>. Even Kaggle – which is now owned by Google – <a href="https://www.youtube.com/watch?v=AoRSIdLpFqU" target=_blank rel=noopener>has broadened its scope beyond modelling competitions to support other types of analysis</a>. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>may make data scientist somewhat useless as a job title</a>. 
However, I believe that data science tasks remain useful, as shown by the following definitions.</p><h2 id=recent-definitions-by-hernán-hawkins-and-dubossarsky>Recent definitions by Hernán, Hawkins, and Dubossarsky<a hidden class=anchor aria-hidden=true href=#recent-definitions-by-hernán-hawkins-and-dubossarsky>#</a></h2><p>In a <a href=https://arxiv.org/pdf/1804.10846.pdf target=_blank rel=noopener>recent article</a>, Hernán et al. classify data science tasks into three types: <em>description</em>, <em>prediction</em>, and <em>causal inference</em>. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference &ldquo;right&rdquo;. Further, they emphasise the importance of domain expert knowledge, <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>which is essential in causal inference</a>. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data <em>analysts</em> rather than <em>scientists</em>. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it&rsquo;s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. 
For example, it wasn&rsquo;t so long ago that <a href=https://www.healio.com/hematology-oncology/news/print/hemonc-today/%7B241d62a7-fe6e-4c5b-9fed-a33cc6e4bd7c%7D/cigarettes-were-once-physician-tested-approved target=_blank rel=noopener>doctors recommended smoking</a>.</p><p>Despite the importance of domain knowledge, one can argue that scientists that specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled <em><a href=https://www.linkedin.com/pulse/why-all-scientists-data-john-hawkins target=_blank rel=noopener>why all scientists are not data scientists</a></em>:</p><blockquote><p>Those scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question &ldquo;given data with these properties, what conclusions can we draw?&rdquo;</p></blockquote><p>Finally, <a href=https://www.superdatascience.com/podcast-one-purpose-data-science-truth-analytics/ target=_blank rel=noopener>according to Eugene Dubossarsky</a>, <em>&ldquo;there&rsquo;s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. 
That should be something no one can argue with.&rdquo;</em> This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. I&rsquo;m not going to argue with him, but to put it all together, <strong>we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions</strong>.</p><h2 id=what-about-ai>What about AI?<a hidden class=anchor aria-hidden=true href=#what-about-ai>#</a></h2><p>Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite <a href=https://www.forbes.com/sites/valleyvoices/2017/01/31/the-rise-of-ai-will-force-a-new-breed-of-data-scientist/ target=_blank rel=noopener>what some people may try to tell you</a>, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren&rsquo;t made by humans, as data scientists are involved in building systems that make decisions autonomously.</p><p>The resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor&rsquo;s degree in computer science. However, by the time I graduated from my PhD, saying that I&rsquo;m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I&rsquo;d be doing exactly the same work). 
Titles are somewhat silly – I&rsquo;m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/analytics/>analytics</a></li><li><a href=https://yanirseroussi.com/tags/artificial-intelligence/>artificial intelligence</a></li><li><a href=https://yanirseroussi.com/tags/business/>business</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/statistics/>statistics</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on x" href="https://x.com/intent/tweet/?text=Defining%20data%20science%20in%202018&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&amp;hashtags=analytics%2cartificialintelligence%2cbusiness%2cdatascience%2cmachinelearning%2cstatistics"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&amp;title=Defining%20data%20science%20in%202018&amp;summary=Defining%20data%20science%20in%202018&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f&title=Defining%20data%20science%20in%202018"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 
1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on whatsapp" 
href="https://api.whatsapp.com/send?text=Defining%20data%20science%20in%202018%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on telegram" 
href="https://telegram.me/share/url?text=Defining%20data%20science%20in%202018&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Defining data science in 2018 on ycombinator" href="https://news.ycombinator.com/submitlink?t=Defining%20data%20science%20in%202018&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f07%2f22%2fdefining-data-science-in-2018%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-2905><div class=comment-header><a href=#comment-2905><img class=comment-avatar src="https://www.gravatar.com/avatar/dd6b3f4c12022dad24b54b933513ad84?s=50"><p class=comment-info><strong>Pravin</strong><br><small>2018-07-24 06:29:02</small></p></a></div><div class="comment-body post-content"><p>Great set of definitions and path of evolutions here!</p><p>There has to be chaos and confusion as it evolves surely, but the consensus as you very well mentioned is decisions. Anything done in the data world, if not leading to decisions is not quite viable in long term.</p><p>Thanks for sharing your thoughts, love reading your blog.</p></div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2018/11/03/reflections-on-remote-data-science-work/index.html b/2018/11/03/reflections-on-remote-data-science-work/index.html
index 3ba282f38..7f2ba7957 100644
--- a/2018/11/03/reflections-on-remote-data-science-work/index.html
+++ b/2018/11/03/reflections-on-remote-data-science-work/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Reflections on remote data science work | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Automattic,career,data science,remote work,WordPress"><meta name=description content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Reflections on remote data science work"><meta property="og:description" content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><meta property="og:type" content="article"><meta property="og:url" 
content="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"><meta property="og:image" content="https://yanirseroussi.com/angels-beach.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-11-03T06:33:13+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/angels-beach.jpg"><meta name=twitter:title content="Reflections on remote data science work"><meta name=twitter:description content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Reflections on remote data science work","item":"https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Reflections on remote data science work","name":"Reflections on remote data science work","description":"Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist.","keywords":["Automattic","career","data science","remote work","WordPress"],"articleBody":"It’s been about a year and a half since I joined Automattic as a remote data scientist. This is the longest I’ve been in one position since finishing my PhD in 2012. This is also the first time I’ve worked full-time with a fully-distributed team. 
In this post, I briefly discuss some of the top pluses and minuses of remote work, based on my experience so far.\n+ Flexible hours\n– Potentially boundless work By far, one of the top perks of remote work with a distributed team is truly flexible hours. I only have one or two synchronous meetings a week, and in the rest of my time I'm free to work the hours I prefer. No one expects me to be online at specific times, as long as the work gets done and I respond to pings within a reasonable time. As I'm a morning person, this means that I typically work a few hours in the early morning, take a long break (e.g., to surf or run some errands), and then work a few more hours in the afternoon or early evening. The potential downside of such flexibility is not being able to stop working, especially as most of my colleagues are in Europe and North America. I deal with this by avoiding all work communications during my designated non-work hours. For example, I don't have any work-related apps on my phone, I keep all my work tabs in a separate tab group, and I turn Slack off when I'm not working. I found that this approach sets enough of a boundary between my work and personal life, though I do end up thinking about work problems outside work hours occasionally. + More time for non-work activities\n– There’s never enough time! Not commuting freed up the equivalent of a workday in my schedule. In addition, having flexible hours means that I can make time in the middle of the day for leisure activities like surfing and diving. However, it's still a full-time job, so I'm not completely free to pursue non-work activities. It often feels like there isn't enough time in the day, as I can always think of more stuff I'd like to do. But my current situation is much better than having to commute on a daily basis. Even though it's been a relatively short time, I find the idea of going back to full-time office work hard to imagine. 
+ No need to attend an office\n– Possible isolation from colleagues (and the real world) Offices – especially open-plan offices – are not great places to get work done. This is definitely the case with work that requires a high level of concentration over uninterrupted blocks of time, like coding and data analysis. Working from home is great for avoiding distractions – there's no need for silly horse blinders here (though I do enjoy looking at the bird and lizard action outside my window). One good thing about offices is the physical availability of colleagues. It's easy to ask others for feedback, socialise over drinks or shared meals, and keep up to date with company politics. Automattic works around the lack of daily physical interaction by running a few meetups a year. The number of people attending a meetup can vary from a handful for team meetups, to hundreds for the annual Grand Meetup. In all cases, the idea is to bring employees together for up to a week at a time to work and socialise. In my experience, the everyday distance creates a craving to attend meetups. I've never worked in a place where co-workers were so enthusiastic about spending so much time together – with non-distributed companies, team building is often seen as a chore. I suppose that the physical distance makes us appreciate the opportunity to be together and make the most of this precious time – it's a bit like being in a long-distance relationship. That said, in the majority of the time, isolation can be a problem. As I'm based in Australia, I probably feel it more than others – most of my teammates are offline during my work hours, which means that there's no one to chat with on Slack. This isn't a huge issue, but I do need to ensure I get enough social interaction through other avenues. 
As the jobs page of Bandcamp (another distributed company) used to say: \"If you do not have a strong social structure outside of work then employment at Bandcamp will likely lead to heart disease and an early death. We’re hiring!\" + Most communication is written\n– Information overload As Automattic is a fully-distributed company, most of the communication is done in writing. The main tools are Slack and internal forums called P2s (emails are rarely used). This makes catching up on the latest company news easy in comparison to places that rely more heavily on synchronous meetings. The downside of so much written communication is potential information overload. It is impossible to follow all the P2 posts, and even keeping up with stuff I should know can sometimes be overwhelming. I especially feel it in the mornings, as most of my colleagues work while I'm sleeping. Therefore, catching up on everything that happened overnight and responding to pings often takes over an hour – things are rarely as I left them when I last logged off. I experience this same feeling of being overwhelmed when coming back from vacation. Depending on the length of time away, it can take days to catch up. On the plus side, this process doesn't rely on someone filling me in – it's all there for me to read. + Free trips around the world\n– Jet lag and flying As noted above, Automatticians meet in person a few times a year. Since joining, I attended meetups in Montreal, Whistler, Playa del Carmen, Bali, and Orlando. In some cases, I used the opportunity for personal trips near the meetup locations. Such trips can be a lot of fun. However, the obvious downside when travelling from Australia is that getting to meetups usually involves days of jetlag and long flights (e.g., the 17-hour Dallas to Sydney trip). Nonetheless, I still enjoy the travel opportunities. For example, I doubt I would have ever visited Florida and snorkelled with manatees if it wasn't for Automattic. 
+ Exposure to diverse opinions and people\n– Cultural differences can pose challenges Australia's population is made up of many migrants, especially in the tech industry. However, all such migrants have some familiarity with Australian culture and values. The composition of Automattic's workforce is even more diverse, and it lacks the unifying factor of everyone choosing to live in the same place. This is mostly positive, as I find the exposure to a diverse set of people interesting, and everyone tends to be friendly, welcoming, and focused on the work rather than on cultural differences. However, it's important to be aware of differences in communication styles. There's also a wider range of cultural sensitivities than when working with a more homogeneous group. Still, I haven't found it to be much of an issue, possibly because I'm already used to being a migrant. For example, moving to Australia from Israel required some adjustment of my communication style to be less direct. Closing words Overall, I like working with Automattic. For me, the positives outweigh the negatives, as evidenced by the fact that it’s the longest I’ve been in one position since 2012. Doing remote data science work doesn’t seem particularly different to doing any other sort of non-physical work remotely. 
I hope that more companies will join Automattic and the growing list of remote companies, and offer their employees the option to work from wherever they’re most productive.\nUpdate (March 2019): I also covered similar topics in a Data Science Sydney talk about a day in the life of a remote data scientist.\n","wordCount":"1321","inLanguage":"en","image":"https://yanirseroussi.com/angels-beach.jpg","datePublished":"2018-11-03T06:33:13Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" 
y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Reflections on remote data science work</h1><div class=post-meta><span title='2018-11-03 06:33:13 +0000 UTC'>November 3, 2018</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-11-03-reflections-on-remote-data-science-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg 3998w" 
sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg alt width=3998 height=2143></figure><div class=post-content><p>It&rsquo;s been about a year and a half since <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>I joined Automattic as a remote data scientist</a>. This is the longest I&rsquo;ve been in one position since finishing my PhD in 2012. This is also the first time I&rsquo;ve worked full-time with a fully-distributed team. In this post, I briefly discuss some of the top pluses and minuses of remote work, based on my experience so far.</p><h2 id=-flexible-hoursbr-potentially-boundless-work>+ Flexible hours<br>– Potentially boundless work<a hidden class=anchor aria-hidden=true href=#-flexible-hoursbr-potentially-boundless-work>#</a></h2><p class=indent-1>By far, one of the top perks of remote work with a distributed team is truly flexible hours. I only have one or two synchronous meetings a week, and in the rest of my time I'm free to work the hours I prefer. No one expects me to be online at specific times, as long as the work gets done and I respond to pings within a reasonable time. As I'm a morning person, this means that I typically work a few hours in the early morning, take a long break (e.g., to surf or run some errands), and then work a few more hours in the afternoon or early evening.</p><p class=indent-1>The potential downside of such flexibility is not being able to stop working, especially as most of my colleagues are in Europe and North America. I deal with this by avoiding all work communications during my designated non-work hours. For example, I don't have any work-related apps on my phone, I keep all my work tabs in a separate tab group, and I turn Slack off when I'm not working. 
I found that this approach sets enough of a boundary between my work and personal life, though I do end up thinking about work problems outside work hours occasionally.</p><h2 id=-more-time-for-non-work-activitiesbr-theres-never-enough-time>+ More time for non-work activities<br>– There&rsquo;s never enough time!<a hidden class=anchor aria-hidden=true href=#-more-time-for-non-work-activitiesbr-theres-never-enough-time>#</a></h2><p class=indent-1>Not commuting freed up the equivalent of a workday in my schedule. In addition, having flexible hours means that I can make time in the middle of the day for leisure activities like surfing and diving. However, it's still a full-time job, so I'm not completely free to pursue non-work activities. It often feels like there isn't enough time in the day, as I can always think of more stuff I'd like to do. But my current situation is much better than having to commute on a daily basis. Even though it's been a relatively short time, I find the idea of going back to full-time office work hard to imagine.</p><h2 id=-no-need-to-attend-an-officebr-possible-isolation-from-colleagues-and-the-real-world>+ No need to attend an office<br>– Possible isolation from colleagues (and the real world)<a hidden class=anchor aria-hidden=true href=#-no-need-to-attend-an-officebr-possible-isolation-from-colleagues-and-the-real-world>#</a></h2><p class=indent-1>Offices &ndash; especially open-plan offices &ndash; are not great places to get work done. This is definitely the case with work that requires a high level of concentration over uninterrupted blocks of time, like coding and data analysis. 
Working from home is great for avoiding distractions &ndash; there's no need for <a href=https://techcrunch.com/2018/10/17/open-offices-have-driven-panasonic-to-make-horse-blinders-for-humans/>silly horse blinders</a> here (though I do enjoy looking at the bird and lizard action outside my window).</p><p class=indent-1>One good thing about offices is the physical availability of colleagues. It's easy to ask others for feedback, socialise over drinks or shared meals, and keep up to date with company politics. Automattic works around the lack of daily physical interaction by running a few meetups a year. The number of people attending a meetup can vary from a handful for team meetups, to hundreds for the annual Grand Meetup. In all cases, the idea is to bring employees together for up to a week at a time to work and socialise. In my experience, the everyday distance creates a craving to attend meetups. I've never worked in a place where co-workers were so enthusiastic about spending so much time together &ndash; with non-distributed companies, team building is often seen as a chore. I suppose that the physical distance makes us appreciate the opportunity to be together and make the most of this precious time &ndash; it's a bit like being in a long-distance relationship.</p><p class=indent-1>That said, in the majority of the time, isolation can be a problem. As I'm based in Australia, I probably feel it more than others &ndash; most of my teammates are offline during my work hours, which means that there's no one to chat with on Slack. This isn't a huge issue, but I do need to ensure I get enough social interaction through other avenues. As <a href=https://web.archive.org/web/20160102094215/Bandcamp.com/jobs>the jobs page of Bandcamp (another distributed company) used to say</a>: <i>"If you do not have a strong social structure outside of work then employment at Bandcamp will likely lead to heart disease and an early death. 
We’re hiring!"</i></p><h2 id=-most-communication-is-writtenbr-information-overload>+ Most communication is written<br>– Information overload<a hidden class=anchor aria-hidden=true href=#-most-communication-is-writtenbr-information-overload>#</a></h2><p class=indent-1>As Automattic is a fully-distributed company, most of the communication is done in writing. The main tools are Slack and internal forums called P2s (emails are rarely used). This makes catching up on the latest company news easy in comparison to places that rely more heavily on synchronous meetings. The downside of so much written communication is potential information overload. It is impossible to follow all the P2 posts, and even keeping up with stuff I <i>should</i> know can sometimes be overwhelming. I especially feel it in the mornings, as most of my colleagues work while I'm sleeping. Therefore, catching up on everything that happened overnight and responding to pings often takes over an hour &ndash; things are rarely as I left them when I last logged off. I experience this same feeling of being overwhelmed when coming back from vacation. Depending on the length of time away, it can take days to catch up. On the plus side, this process doesn't rely on someone filling me in &ndash; it's all there for me to read.</p><h2 id=-free-trips-around-the-worldbr-jet-lag-and-flying>+ Free trips around the world<br>– Jet lag and flying<a hidden class=anchor aria-hidden=true href=#-free-trips-around-the-worldbr-jet-lag-and-flying>#</a></h2><p class=indent-1>As noted above, Automatticians meet in person a few times a year. Since joining, I attended meetups in Montreal, Whistler, Playa del Carmen, Bali, and Orlando. In some cases, I used the opportunity for personal trips near the meetup locations. Such trips can be a lot of fun. However, the obvious downside when travelling from Australia is that getting to meetups usually involves days of jetlag and long flights (e.g., the 17-hour Dallas to Sydney trip). 
Nonetheless, I still enjoy the travel opportunities. For example, I doubt I would have ever visited Florida and snorkelled with manatees if it wasn't for Automattic.</p><h2 id=-exposure-to-diverse-opinions-and-peoplebr-cultural-differences-can-pose-challenges>+ Exposure to diverse opinions and people<br>– Cultural differences can pose challenges<a hidden class=anchor aria-hidden=true href=#-exposure-to-diverse-opinions-and-peoplebr-cultural-differences-can-pose-challenges>#</a></h2><p class=indent-1>Australia's population is made up of many migrants, especially in the tech industry. However, all such migrants have some familiarity with Australian culture and values. The composition of Automattic's workforce is even more diverse, and it lacks the unifying factor of everyone choosing to live in the same place. This is mostly positive, as I find the exposure to a diverse set of people interesting, and everyone tends to be friendly, welcoming, and focused on the work rather than on cultural differences. However, it's important to be aware of differences in communication styles. There's also a wider range of cultural sensitivities than when working with a more homogeneous group. Still, I haven't found it to be much of an issue, possibly because I'm already used to being a migrant. For example, moving to Australia from Israel required some adjustment of my communication style to be less direct.</p><h2 id=closing-words>Closing words<a hidden class=anchor aria-hidden=true href=#closing-words>#</a></h2><p>Overall, I like working with Automattic. For me, the positives outweigh the negatives, as evidenced by the fact that it&rsquo;s the longest I&rsquo;ve been in one position since 2012. Doing remote data science work doesn&rsquo;t seem particularly different to doing any other sort of non-physical work remotely. 
I hope that more companies will join Automattic and <a href=https://github.com/yanirs/established-remote>the growing list of remote companies</a>, and offer their employees the option to work from wherever they&rsquo;re most productive.</p><p><strong>Update (March 2019):</strong> I also covered similar topics in a Data Science Sydney talk about <a href="https://www.youtube.com/watch?v=5qbVEEtgWcY">a day in the life of a remote data scientist</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li><li><a href=https://yanirseroussi.com/tags/wordpress/>WordPress</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on x" href="https://x.com/intent/tweet/?text=Reflections%20on%20remote%20data%20science%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cremotework%2cWordPress"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&amp;title=Reflections%20on%20remote%20data%20science%20work&amp;summary=Reflections%20on%20remote%20data%20science%20work&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&title=Reflections%20on%20remote%20data%20science%20work"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 
6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on whatsapp" 
href="https://api.whatsapp.com/send?text=Reflections%20on%20remote%20data%20science%20work%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science 
work on telegram" href="https://telegram.me/share/url?text=Reflections%20on%20remote%20data%20science%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on ycombinator" href="https://news.ycombinator.com/submitlink?t=Reflections%20on%20remote%20data%20science%20work&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Automattic,career,data science,remote work,WordPress"><meta name=description content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Reflections on remote data science work"><meta property="og:description" content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><meta property="og:type" content="article"><meta property="og:url" 
content="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"><meta property="og:image" content="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-11-03T06:33:13+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg"><meta name=twitter:title content="Reflections on remote data science work"><meta name=twitter:description content="Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Reflections on remote data science work","item":"https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Reflections on remote data science work","name":"Reflections on remote data science work","description":"Discussing the pluses and minuses of remote work eighteen months after joining Automattic as a data scientist.","keywords":["Automattic","career","data science","remote work","WordPress"],"articleBody":"It’s been about a year and a half since I joined Automattic as a remote data scientist. This is the longest I’ve been in one position since finishing my PhD in 2012. This is also the first time I’ve worked full-time with a fully-distributed team. 
In this post, I briefly discuss some of the top pluses and minuses of remote work, based on my experience so far.\n+ Flexible hours\n– Potentially boundless work By far, one of the top perks of remote work with a distributed team is truly flexible hours. I only have one or two synchronous meetings a week, and in the rest of my time I'm free to work the hours I prefer. No one expects me to be online at specific times, as long as the work gets done and I respond to pings within a reasonable time. As I'm a morning person, this means that I typically work a few hours in the early morning, take a long break (e.g., to surf or run some errands), and then work a few more hours in the afternoon or early evening. The potential downside of such flexibility is not being able to stop working, especially as most of my colleagues are in Europe and North America. I deal with this by avoiding all work communications during my designated non-work hours. For example, I don't have any work-related apps on my phone, I keep all my work tabs in a separate tab group, and I turn Slack off when I'm not working. I found that this approach sets enough of a boundary between my work and personal life, though I do end up thinking about work problems outside work hours occasionally. + More time for non-work activities\n– There’s never enough time! Not commuting freed up the equivalent of a workday in my schedule. In addition, having flexible hours means that I can make time in the middle of the day for leisure activities like surfing and diving. However, it's still a full-time job, so I'm not completely free to pursue non-work activities. It often feels like there isn't enough time in the day, as I can always think of more stuff I'd like to do. But my current situation is much better than having to commute on a daily basis. Even though it's been a relatively short time, I find the idea of going back to full-time office work hard to imagine. 
+ No need to attend an office\n– Possible isolation from colleagues (and the real world) Offices – especially open-plan offices – are not great places to get work done. This is definitely the case with work that requires a high level of concentration over uninterrupted blocks of time, like coding and data analysis. Working from home is great for avoiding distractions – there's no need for silly horse blinders here (though I do enjoy looking at the bird and lizard action outside my window). One good thing about offices is the physical availability of colleagues. It's easy to ask others for feedback, socialise over drinks or shared meals, and keep up to date with company politics. Automattic works around the lack of daily physical interaction by running a few meetups a year. The number of people attending a meetup can vary from a handful for team meetups, to hundreds for the annual Grand Meetup. In all cases, the idea is to bring employees together for up to a week at a time to work and socialise. In my experience, the everyday distance creates a craving to attend meetups. I've never worked in a place where co-workers were so enthusiastic about spending so much time together – with non-distributed companies, team building is often seen as a chore. I suppose that the physical distance makes us appreciate the opportunity to be together and make the most of this precious time – it's a bit like being in a long-distance relationship. That said, in the majority of the time, isolation can be a problem. As I'm based in Australia, I probably feel it more than others – most of my teammates are offline during my work hours, which means that there's no one to chat with on Slack. This isn't a huge issue, but I do need to ensure I get enough social interaction through other avenues. 
As the jobs page of Bandcamp (another distributed company) used to say: \"If you do not have a strong social structure outside of work then employment at Bandcamp will likely lead to heart disease and an early death. We’re hiring!\" + Most communication is written\n– Information overload As Automattic is a fully-distributed company, most of the communication is done in writing. The main tools are Slack and internal forums called P2s (emails are rarely used). This makes catching up on the latest company news easy in comparison to places that rely more heavily on synchronous meetings. The downside of so much written communication is potential information overload. It is impossible to follow all the P2 posts, and even keeping up with stuff I should know can sometimes be overwhelming. I especially feel it in the mornings, as most of my colleagues work while I'm sleeping. Therefore, catching up on everything that happened overnight and responding to pings often takes over an hour – things are rarely as I left them when I last logged off. I experience this same feeling of being overwhelmed when coming back from vacation. Depending on the length of time away, it can take days to catch up. On the plus side, this process doesn't rely on someone filling me in – it's all there for me to read. + Free trips around the world\n– Jet lag and flying As noted above, Automatticians meet in person a few times a year. Since joining, I attended meetups in Montreal, Whistler, Playa del Carmen, Bali, and Orlando. In some cases, I used the opportunity for personal trips near the meetup locations. Such trips can be a lot of fun. However, the obvious downside when travelling from Australia is that getting to meetups usually involves days of jetlag and long flights (e.g., the 17-hour Dallas to Sydney trip). Nonetheless, I still enjoy the travel opportunities. For example, I doubt I would have ever visited Florida and snorkelled with manatees if it wasn't for Automattic. 
+ Exposure to diverse opinions and people\n– Cultural differences can pose challenges Australia's population is made up of many migrants, especially in the tech industry. However, all such migrants have some familiarity with Australian culture and values. The composition of Automattic's workforce is even more diverse, and it lacks the unifying factor of everyone choosing to live in the same place. This is mostly positive, as I find the exposure to a diverse set of people interesting, and everyone tends to be friendly, welcoming, and focused on the work rather than on cultural differences. However, it's important to be aware of differences in communication styles. There's also a wider range of cultural sensitivities than when working with a more homogeneous group. Still, I haven't found it to be much of an issue, possibly because I'm already used to being a migrant. For example, moving to Australia from Israel required some adjustment of my communication style to be less direct. Closing words Overall, I like working with Automattic. For me, the positives outweigh the negatives, as evidenced by the fact that it’s the longest I’ve been in one position since 2012. Doing remote data science work doesn’t seem particularly different to doing any other sort of non-physical work remotely. 
I hope that more companies will join Automattic and the growing list of remote companies, and offer their employees the option to work from wherever they’re most productive.\nUpdate (March 2019): I also covered similar topics in a Data Science Sydney talk about a day in the life of a remote data scientist.\n","wordCount":"1321","inLanguage":"en","image":"https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg","datePublished":"2018-11-03T06:33:13Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Reflections on remote data science work</h1><div class=post-meta><span title='2018-11-03 06:33:13 +0000 UTC'>November 3, 2018</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-11-03-reflections-on-remote-data-science-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach_hu1628e10df9028ac19609d5d417782f78_974371_1500x0_resize_q75_box.jpg 1500w 
,https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg 3998w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/angels-beach.jpg alt width=3998 height=2143></figure><div class=post-content><p>It&rsquo;s been about a year and a half since <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>I joined Automattic as a remote data scientist</a>. This is the longest I&rsquo;ve been in one position since finishing my PhD in 2012. This is also the first time I&rsquo;ve worked full-time with a fully-distributed team. In this post, I briefly discuss some of the top pluses and minuses of remote work, based on my experience so far.</p><h2 id=-flexible-hoursbr-potentially-boundless-work>+ Flexible hours<br>– Potentially boundless work<a hidden class=anchor aria-hidden=true href=#-flexible-hoursbr-potentially-boundless-work>#</a></h2><p class=indent-1>By far, one of the top perks of remote work with a distributed team is truly flexible hours. I only have one or two synchronous meetings a week, and in the rest of my time I'm free to work the hours I prefer. No one expects me to be online at specific times, as long as the work gets done and I respond to pings within a reasonable time. As I'm a morning person, this means that I typically work a few hours in the early morning, take a long break (e.g., to surf or run some errands), and then work a few more hours in the afternoon or early evening.</p><p class=indent-1>The potential downside of such flexibility is not being able to stop working, especially as most of my colleagues are in Europe and North America. I deal with this by avoiding all work communications during my designated non-work hours. For example, I don't have any work-related apps on my phone, I keep all my work tabs in a separate tab group, and I turn Slack off when I'm not working. 
I found that this approach sets enough of a boundary between my work and personal life, though I do end up thinking about work problems outside work hours occasionally.</p><h2 id=-more-time-for-non-work-activitiesbr-theres-never-enough-time>+ More time for non-work activities<br>– There&rsquo;s never enough time!<a hidden class=anchor aria-hidden=true href=#-more-time-for-non-work-activitiesbr-theres-never-enough-time>#</a></h2><p class=indent-1>Not commuting freed up the equivalent of a workday in my schedule. In addition, having flexible hours means that I can make time in the middle of the day for leisure activities like surfing and diving. However, it's still a full-time job, so I'm not completely free to pursue non-work activities. It often feels like there isn't enough time in the day, as I can always think of more stuff I'd like to do. But my current situation is much better than having to commute on a daily basis. Even though it's been a relatively short time, I find the idea of going back to full-time office work hard to imagine.</p><h2 id=-no-need-to-attend-an-officebr-possible-isolation-from-colleagues-and-the-real-world>+ No need to attend an office<br>– Possible isolation from colleagues (and the real world)<a hidden class=anchor aria-hidden=true href=#-no-need-to-attend-an-officebr-possible-isolation-from-colleagues-and-the-real-world>#</a></h2><p class=indent-1>Offices &ndash; especially open-plan offices &ndash; are not great places to get work done. This is definitely the case with work that requires a high level of concentration over uninterrupted blocks of time, like coding and data analysis. 
Working from home is great for avoiding distractions &ndash; there's no need for <a href=https://techcrunch.com/2018/10/17/open-offices-have-driven-panasonic-to-make-horse-blinders-for-humans/>silly horse blinders</a> here (though I do enjoy looking at the bird and lizard action outside my window).</p><p class=indent-1>One good thing about offices is the physical availability of colleagues. It's easy to ask others for feedback, socialise over drinks or shared meals, and keep up to date with company politics. Automattic works around the lack of daily physical interaction by running a few meetups a year. The number of people attending a meetup can vary from a handful for team meetups, to hundreds for the annual Grand Meetup. In all cases, the idea is to bring employees together for up to a week at a time to work and socialise. In my experience, the everyday distance creates a craving to attend meetups. I've never worked in a place where co-workers were so enthusiastic about spending so much time together &ndash; with non-distributed companies, team building is often seen as a chore. I suppose that the physical distance makes us appreciate the opportunity to be together and make the most of this precious time &ndash; it's a bit like being in a long-distance relationship.</p><p class=indent-1>That said, in the majority of the time, isolation can be a problem. As I'm based in Australia, I probably feel it more than others &ndash; most of my teammates are offline during my work hours, which means that there's no one to chat with on Slack. This isn't a huge issue, but I do need to ensure I get enough social interaction through other avenues. As <a href=https://web.archive.org/web/20160102094215/Bandcamp.com/jobs>the jobs page of Bandcamp (another distributed company) used to say</a>: <i>"If you do not have a strong social structure outside of work then employment at Bandcamp will likely lead to heart disease and an early death. 
We’re hiring!"</i></p><h2 id=-most-communication-is-writtenbr-information-overload>+ Most communication is written<br>– Information overload<a hidden class=anchor aria-hidden=true href=#-most-communication-is-writtenbr-information-overload>#</a></h2><p class=indent-1>As Automattic is a fully-distributed company, most of the communication is done in writing. The main tools are Slack and internal forums called P2s (emails are rarely used). This makes catching up on the latest company news easy in comparison to places that rely more heavily on synchronous meetings. The downside of so much written communication is potential information overload. It is impossible to follow all the P2 posts, and even keeping up with stuff I <i>should</i> know can sometimes be overwhelming. I especially feel it in the mornings, as most of my colleagues work while I'm sleeping. Therefore, catching up on everything that happened overnight and responding to pings often takes over an hour &ndash; things are rarely as I left them when I last logged off. I experience this same feeling of being overwhelmed when coming back from vacation. Depending on the length of time away, it can take days to catch up. On the plus side, this process doesn't rely on someone filling me in &ndash; it's all there for me to read.</p><h2 id=-free-trips-around-the-worldbr-jet-lag-and-flying>+ Free trips around the world<br>– Jet lag and flying<a hidden class=anchor aria-hidden=true href=#-free-trips-around-the-worldbr-jet-lag-and-flying>#</a></h2><p class=indent-1>As noted above, Automatticians meet in person a few times a year. Since joining, I attended meetups in Montreal, Whistler, Playa del Carmen, Bali, and Orlando. In some cases, I used the opportunity for personal trips near the meetup locations. Such trips can be a lot of fun. However, the obvious downside when travelling from Australia is that getting to meetups usually involves days of jetlag and long flights (e.g., the 17-hour Dallas to Sydney trip). 
Nonetheless, I still enjoy the travel opportunities. For example, I doubt I would have ever visited Florida and snorkelled with manatees if it wasn't for Automattic.</p><h2 id=-exposure-to-diverse-opinions-and-peoplebr-cultural-differences-can-pose-challenges>+ Exposure to diverse opinions and people<br>– Cultural differences can pose challenges<a hidden class=anchor aria-hidden=true href=#-exposure-to-diverse-opinions-and-peoplebr-cultural-differences-can-pose-challenges>#</a></h2><p class=indent-1>Australia's population is made up of many migrants, especially in the tech industry. However, all such migrants have some familiarity with Australian culture and values. The composition of Automattic's workforce is even more diverse, and it lacks the unifying factor of everyone choosing to live in the same place. This is mostly positive, as I find the exposure to a diverse set of people interesting, and everyone tends to be friendly, welcoming, and focused on the work rather than on cultural differences. However, it's important to be aware of differences in communication styles. There's also a wider range of cultural sensitivities than when working with a more homogeneous group. Still, I haven't found it to be much of an issue, possibly because I'm already used to being a migrant. For example, moving to Australia from Israel required some adjustment of my communication style to be less direct.</p><h2 id=closing-words>Closing words<a hidden class=anchor aria-hidden=true href=#closing-words>#</a></h2><p>Overall, I like working with Automattic. For me, the positives outweigh the negatives, as evidenced by the fact that it&rsquo;s the longest I&rsquo;ve been in one position since 2012. Doing remote data science work doesn&rsquo;t seem particularly different to doing any other sort of non-physical work remotely. 
I hope that more companies will join Automattic and <a href=https://github.com/yanirs/established-remote>the growing list of remote companies</a>, and offer their employees the option to work from wherever they&rsquo;re most productive.</p><p><strong>Update (March 2019):</strong> I also covered similar topics in a Data Science Sydney talk about <a href="https://www.youtube.com/watch?v=5qbVEEtgWcY">a day in the life of a remote data scientist</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li><li><a href=https://yanirseroussi.com/tags/wordpress/>WordPress</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on x" href="https://x.com/intent/tweet/?text=Reflections%20on%20remote%20data%20science%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cremotework%2cWordPress"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&amp;title=Reflections%20on%20remote%20data%20science%20work&amp;summary=Reflections%20on%20remote%20data%20science%20work&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f&title=Reflections%20on%20remote%20data%20science%20work"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 
6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on whatsapp" 
href="https://api.whatsapp.com/send?text=Reflections%20on%20remote%20data%20science%20work%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science 
work on telegram" href="https://telegram.me/share/url?text=Reflections%20on%20remote%20data%20science%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Reflections on remote data science work on ycombinator" href="https://news.ycombinator.com/submitlink?t=Reflections%20on%20remote%20data%20science%20work&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f11%2f03%2freflections-on-remote-data-science-work%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p><div class=comment-level-0 id=comment-3020><div class=comment-header><a href=#comment-3020><img class=comment-avatar src="https://www.gravatar.com/avatar/9f1de35db1a9174af8885039708185c9?s=50"><p class=comment-info><strong>parentscool</strong><br><small>2018-11-04 01:37:44</small></p></a></div><div class="comment-body post-content">I have been working remotely for WRI for nearly 2 years, and I can resonate with almost everything you have said. Great blog!</div></div><div class=comment-level-0 id=comment-3022><div class=comment-header><a href=#comment-3022><img class=comment-avatar src="https://www.gravatar.com/avatar/c692696b2addd9768ec241472e4b8d6a?s=50"><p class=comment-info><strong>Triparna Ray</strong><br><small>2018-11-05 05:53:51</small></p></a></div><div class="comment-body post-content">Interested. Though not trained as Data scientist yet but as BI consultant with experience over a decade. Let me know if you have any opportunity.</div></div><div class=comment-level-0 id=comment-3024><div class=comment-header><a href=#comment-3024><img class=comment-avatar src="https://www.gravatar.com/avatar/1499c586dfa43d9ca7b63b443bfb5e95?s=50"><p class=comment-info><strong>Sreekanth Yasa</strong><br><small>2018-11-06 02:29:18</small></p></a></div><div class="comment-body post-content">I am working for Accenture as Analyst. The article is very similar to my real life.
 I pursued data science from top university and worked on few capstone projects.</div></div></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
diff --git a/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.html b/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.html
index 7920c5c00..8f27beadc 100644
--- a/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.html
+++ b/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The most practical causal inference book I’ve read (is still a draft) | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="causal inference,data science,statistics"><meta name=description content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The most practical causal inference book I’ve read (is still a draft)"><meta property="og:description" content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"><meta property="og:image" content="https://yanirseroussi.com/chicken-egg-roost.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-12-24T02:37:50+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/chicken-egg-roost.jpg"><meta name=twitter:title content="The most practical causal inference book I’ve read (is still a draft)"><meta name=twitter:description content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The most practical causal inference book I’ve read (is still a draft)","item":"https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The most practical causal inference book I’ve read (is still a draft)","name":"The most practical causal inference book I’ve read (is still a draft)","description":"Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area.","keywords":["causal inference","data science","statistics"],"articleBody":"I’ve been interested in the area of causal inference in the past few years. In my opinion it’s more exciting and relevant to everyday life than more hyped data science areas like deep learning. However, I’ve found it hard to apply what I’ve learned about causal inference to my work. 
Now, I believe I’ve finally found a book with practical techniques that I can use on real problems: Causal Inference by Miguel Hernán and Jamie Robins. It is available for free from their site, but is still in draft mode. This post is a short summary of the reasons why I think Causal Inference is a great practical resource.\nOne of the things that sets Causal Inference apart from other books on the topic is the background of its authors. Hernán and Robins are both epidemiologists, which means they often have to deal with data with strong limitations on sample size and feasibility of experiments. Decisions driven by causal inference in epidemiology can often make the difference between life and death of individuals. Hence, the book is full of practical examples.\nThe book focuses on randomised controlled trials and well-defined interventions as the basis of causal inference from both experimental and observational data. As the authors show, even with randomised experiments, the analysis often requires using observational causal inference tools due to factors like selection and measurement biases. Their insistence on well-defined interventions is particularly refreshing, as one of the things that bothers me about the writings of Judea Pearl (a prominent researcher of causal inference) is the vagueness of statements like “smoking causes cancer” and “mud doesn’t cause rain”. The need for well-defined interventions was summarised by Hernán in the article Does water kill? A call for less casual causal inferences.\nUnlike some other resources, Causal Inference doesn’t appear to be too dogmatic about the framework used for modelling causality. I’m not an expert on where each idea originated, but it seems like the authors mix elements from the potential outcomes framework and from Pearl’s graphical models. They also don’t neglect time as an important consideration in cause-and-effect relationships. 
In fact, the third part of the book is dedicated to the topic of time-varying treatments and effects.\nThe practicality of the book is also demonstrated by the fact that it comes with code examples in multiple languages. In addition, the authors don’t dwell too much on the philosophy of causality. While it is a fascinating topic, the opening paragraphs of the book make its goals clear:\nBy reading this book you are expressing an interest in learning about causal inference. But, as a human being, you have already mastered the fundamental concepts of causal inference. You certainly know what a causal effect is; you clearly understand the difference between association and causation; and you have used this knowledge constantly throughout your life. In fact, had you not understood these causal concepts, you would have not survived long enough to read this chapter–or even to learn to read. As a toddler you would have jumped right into the swimming pool after observing that those who did so were later able to reach the jam jar. As a teenager, you would have skied down the most dangerous slopes after observing that those who did so were more likely to win the next ski race. As a parent, you would have refused to give antibiotics to your sick child after observing that those children who took their medicines were less likely to be playing in the park the next day.\nSince you already understand the definition of causal effect and the difference between association and causation, do not expect to gain deep conceptual insights from this chapter. Rather, the purpose of this chapter is to introduce mathematical notation that formalizes the causal intuition that you already possess. Make sure that you can match your causal intuition with the mathematical notation introduced here. 
This notation is necessary to precisely define causal concepts, and we will use it throughout the book.\nI won’t try to summarise the technical aspects of the book – partly because I don’t fully understand it all, and partly because the book itself is already a summary of a very rich research area. However, I’m likely to go back and reread the book in the future, with the goal of applying the techniques from the book to my work. I’d also like to take Hernán’s causal inference course as a way of practising what I’ve learned from the book. For people who want a non-technical summary of the topics covered by the book, I recommend the article The c-word: Scientific euphemisms do not improve causal inference from observational data. If you’re curious about other (less practical) causality books I’ve read, check out my causal inference resource list and my two previous posts on the topic: Why you should stop worrying about deep learning and deepen your understanding of causality instead and Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions.\n","wordCount":"831","inLanguage":"en","image":"https://yanirseroussi.com/chicken-egg-roost.jpg","datePublished":"2018-12-24T02:37:50Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a 
href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The most practical causal inference book I’ve read (is still a draft)</h1><div class=post-meta><span title='2018-12-24 02:37:50 +0000 UTC'>December 24, 2018</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-12-24-the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager 
srcset="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg 4210w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg alt width=4210 height=2812></figure><div class=post-content><p>I&rsquo;ve been interested in the area of causal inference in the past few years. In my opinion <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>it&rsquo;s more exciting and relevant to everyday life than more hyped data science areas like deep learning</a>. However, I&rsquo;ve found it hard to apply what I&rsquo;ve learned about causal inference to my work. 
Now, I believe I&rsquo;ve finally found a book with practical techniques that I can use on real problems: <a href=https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ target=_blank rel=noopener><em>Causal Inference</em></a> by Miguel Hernán and Jamie Robins. It is available for free from their site, but is still in draft mode. This post is a short summary of the reasons why I think <em>Causal Inference</em> is a great practical resource.</p><p>One of the things that sets <em>Causal Inference</em> apart from other books on the topic is the background of its authors. Hernán and Robins are both epidemiologists, which means they often have to deal with data with strong limitations on sample size and feasibility of experiments. Decisions driven by causal inference in epidemiology can often make the difference between life and death of individuals. Hence, the book is full of practical examples.</p><p>The book focuses on randomised controlled trials and well-defined interventions as the basis of causal inference from both experimental and observational data. As the authors show, even with randomised experiments, the analysis often requires using observational causal inference tools due to factors like selection and measurement biases. Their insistence on well-defined interventions is particularly refreshing, as one of the things that bothers me about the writings of Judea Pearl (a prominent researcher of causal inference) is <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>the vagueness of statements like <em>&ldquo;smoking causes cancer&rdquo;</em> and <em>&ldquo;mud doesn&rsquo;t cause rain&rdquo;</em></a>. The need for well-defined interventions was summarised by Hernán in the article <a href=https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5207342/ target=_blank rel=noopener><em>Does water kill? 
A call for less casual causal inferences</em></a>.</p><p>Unlike some other resources, <em>Causal Inference</em> doesn&rsquo;t appear to be too dogmatic about the framework used for modelling causality. I&rsquo;m not an expert on where each idea originated, but it seems like the authors mix elements from the <a href=https://en.wikipedia.org/wiki/Rubin_causal_model target=_blank rel=noopener>potential outcomes framework</a> and from <a href=https://en.wikipedia.org/wiki/Structural_equation_modeling target=_blank rel=noopener>Pearl&rsquo;s graphical models</a>. They also don&rsquo;t neglect time as an important consideration in cause-and-effect relationships. In fact, the third part of the book is dedicated to the topic of time-varying treatments and effects.</p><p>The practicality of the book is also demonstrated by the fact that it comes with code examples in multiple languages. In addition, the authors don&rsquo;t dwell too much on the philosophy of causality. While it is a fascinating topic, the opening paragraphs of the book make its goals clear:</p><blockquote><p>By reading this book you are expressing an interest in learning about causal inference. But, as a human being, you have already mastered the fundamental concepts of causal inference. You certainly know what a causal effect is; you clearly understand the difference between association and causation; and you have used this knowledge constantly throughout your life. In fact, had you not understood these causal concepts, you would have not survived long enough to read this chapter–or even to learn to read. As a toddler you would have jumped right into the swimming pool after observing that those who did so were later able to reach the jam jar. As a teenager, you would have skied down the most dangerous slopes after observing that those who did so were more likely to win the next ski race. 
As a parent, you would have refused to give antibiotics to your sick child after observing that those children who took their medicines were less likely to be playing in the park the next day.</p><p>Since you already understand the definition of causal effect and the difference between association and causation, do not expect to gain deep conceptual insights from this chapter. Rather, the purpose of this chapter is to introduce mathematical notation that formalizes the causal intuition that you already possess. Make sure that you can match your causal intuition with the mathematical notation introduced here. This notation is necessary to precisely define causal concepts, and we will use it throughout the book.</p></blockquote><p>I won&rsquo;t try to summarise the technical aspects of the book – partly because I don&rsquo;t fully understand it all, and partly because the book itself is already a summary of a very rich research area. However, I&rsquo;m likely to go back and reread the book in the future, with the goal of applying the techniques from the book to my work. I&rsquo;d also like to take <a href=https://www.edx.org/course/causal-diagrams-draw-assumptions-harvardx-ph559x target=_blank rel=noopener>Hernán&rsquo;s causal inference course</a> as a way of practising what I&rsquo;ve learned from the book. For people who want a non-technical summary of the topics covered by the book, I recommend the article <a href=https://ajph.aphapublications.org/doi/10.2105/AJPH.2018.304337 target=_blank rel=noopener><em>The c-word: Scientific euphemisms do not improve causal inference from observational data</em></a>. 
If you&rsquo;re curious about other (less practical) causality books I&rsquo;ve read, check out <a href=https://yanirseroussi.com/causal-inference-resources/>my causal inference resource list</a> and my two previous posts on the topic: <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/><em>Why you should stop worrying about deep learning and deepen your understanding of causality instead</em></a> and <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/><em>Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions</em></a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/statistics/>statistics</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on x" href="https://x.com/intent/tweet/?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&amp;hashtags=causalinference%2cdatascience%2cstatistics"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share The most practical causal inference book I’ve read (is still a draft) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&amp;title=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;summary=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&title=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on whatsapp" href="https://api.whatsapp.com/send?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 
1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on telegram" href="https://telegram.me/share/url?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg width="30" height="30" viewBox="0 0 
512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="causal inference,data science,statistics"><meta name=description content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The most practical causal inference book I’ve read (is still a draft)"><meta property="og:description" content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><meta property="og:type" 
content="article"><meta property="og:url" content="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"><meta property="og:image" content="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2018-12-24T02:37:50+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg"><meta name=twitter:title content="The most practical causal inference book I’ve read (is still a draft)"><meta name=twitter:description content="Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The most practical causal inference book I’ve read (is still a draft)","item":"https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The most practical causal inference book I’ve read (is still a draft)","name":"The most practical causal inference book I’ve read (is still a draft)","description":"Causal Inference by Miguel Hernán and Jamie Robins is a must-read for anyone interested in the area.","keywords":["causal inference","data science","statistics"],"articleBody":"I’ve been interested in the area of causal inference in the past few years. 
In my opinion it’s more exciting and relevant to everyday life than more hyped data science areas like deep learning. However, I’ve found it hard to apply what I’ve learned about causal inference to my work. Now, I believe I’ve finally found a book with practical techniques that I can use on real problems: Causal Inference by Miguel Hernán and Jamie Robins. It is available for free from their site, but is still in draft mode. This post is a short summary of the reasons why I think Causal Inference is a great practical resource.\nOne of the things that sets Causal Inference apart from other books on the topic is the background of its authors. Hernán and Robins are both epidemiologists, which means they often have to deal with data with strong limitations on sample size and feasibility of experiments. Decisions driven by causal inference in epidemiology can often make the difference between life and death of individuals. Hence, the book is full of practical examples.\nThe book focuses on randomised controlled trials and well-defined interventions as the basis of causal inference from both experimental and observational data. As the authors show, even with randomised experiments, the analysis often requires using observational causal inference tools due to factors like selection and measurement biases. Their insistence on well-defined interventions is particularly refreshing, as one of the things that bothers me about the writings of Judea Pearl (a prominent researcher of causal inference) is the vagueness of statements like “smoking causes cancer” and “mud doesn’t cause rain”. The need for well-defined interventions was summarised by Hernán in the article Does water kill? A call for less casual causal inferences.\nUnlike some other resources, Causal Inference doesn’t appear to be too dogmatic about the framework used for modelling causality. 
I’m not an expert on where each idea originated, but it seems like the authors mix elements from the potential outcomes framework and from Pearl’s graphical models. They also don’t neglect time as an important consideration in cause-and-effect relationships. In fact, the third part of the book is dedicated to the topic of time-varying treatments and effects.\nThe practicality of the book is also demonstrated by the fact that it comes with code examples in multiple languages. In addition, the authors don’t dwell too much on the philosophy of causality. While it is a fascinating topic, the opening paragraphs of the book make its goals clear:\nBy reading this book you are expressing an interest in learning about causal inference. But, as a human being, you have already mastered the fundamental concepts of causal inference. You certainly know what a causal effect is; you clearly understand the difference between association and causation; and you have used this knowledge constantly throughout your life. In fact, had you not understood these causal concepts, you would have not survived long enough to read this chapter–or even to learn to read. As a toddler you would have jumped right into the swimming pool after observing that those who did so were later able to reach the jam jar. As a teenager, you would have skied down the most dangerous slopes after observing that those who did so were more likely to win the next ski race. As a parent, you would have refused to give antibiotics to your sick child after observing that those children who took their medicines were less likely to be playing in the park the next day.\nSince you already understand the definition of causal effect and the difference between association and causation, do not expect to gain deep conceptual insights from this chapter. Rather, the purpose of this chapter is to introduce mathematical notation that formalizes the causal intuition that you already possess. 
Make sure that you can match your causal intuition with the mathematical notation introduced here. This notation is necessary to precisely define causal concepts, and we will use it throughout the book.\nI won’t try to summarise the technical aspects of the book – partly because I don’t fully understand it all, and partly because the book itself is already a summary of a very rich research area. However, I’m likely to go back and reread the book in the future, with the goal of applying the techniques from the book to my work. I’d also like to take Hernán’s causal inference course as a way of practising what I’ve learned from the book. For people who want a non-technical summary of the topics covered by the book, I recommend the article The c-word: Scientific euphemisms do not improve causal inference from observational data. If you’re curious about other (less practical) causality books I’ve read, check out my causal inference resource list and my two previous posts on the topic: Why you should stop worrying about deep learning and deepen your understanding of causality instead and Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions.\n","wordCount":"831","inLanguage":"en","image":"https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg","datePublished":"2018-12-24T02:37:50Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body 
id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The most practical causal inference book I’ve read (is still a draft)</h1><div class=post-meta><span title='2018-12-24 02:37:50 +0000 UTC'>December 24, 2018</span>&nbsp;|&nbsp;<a 
href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2018-12-24-the-most-practical-causal-inference-book-ive-read-is-still-a-draft/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost_hu60b33a1bef2586fcaccb307cd6388d77_2433611_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg 4210w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/chicken-egg-roost.jpg alt width=4210 height=2812></figure><div class=post-content><p>I&rsquo;ve been interested in the area of causal inference in the past few years. In my opinion <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>it&rsquo;s more exciting and relevant to everyday life than more hyped data science areas like deep learning</a>. 
However, I&rsquo;ve found it hard to apply what I&rsquo;ve learned about causal inference to my work. Now, I believe I&rsquo;ve finally found a book with practical techniques that I can use on real problems: <a href=https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ target=_blank rel=noopener><em>Causal Inference</em></a> by Miguel Hernán and Jamie Robins. It is available for free from their site, but is still in draft mode. This post is a short summary of the reasons why I think <em>Causal Inference</em> is a great practical resource.</p><p>One of the things that sets <em>Causal Inference</em> apart from other books on the topic is the background of its authors. Hernán and Robins are both epidemiologists, which means they often have to deal with data with strong limitations on sample size and feasibility of experiments. Decisions driven by causal inference in epidemiology can often make the difference between life and death of individuals. Hence, the book is full of practical examples.</p><p>The book focuses on randomised controlled trials and well-defined interventions as the basis of causal inference from both experimental and observational data. As the authors show, even with randomised experiments, the analysis often requires using observational causal inference tools due to factors like selection and measurement biases. Their insistence on well-defined interventions is particularly refreshing, as one of the things that bothers me about the writings of Judea Pearl (a prominent researcher of causal inference) is <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>the vagueness of statements like <em>&ldquo;smoking causes cancer&rdquo;</em> and <em>&ldquo;mud doesn&rsquo;t cause rain&rdquo;</em></a>. 
The need for well-defined interventions was summarised by Hernán in the article <a href=https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5207342/ target=_blank rel=noopener><em>Does water kill? A call for less casual causal inferences</em></a>.</p><p>Unlike some other resources, <em>Causal Inference</em> doesn&rsquo;t appear to be too dogmatic about the framework used for modelling causality. I&rsquo;m not an expert on where each idea originated, but it seems like the authors mix elements from the <a href=https://en.wikipedia.org/wiki/Rubin_causal_model target=_blank rel=noopener>potential outcomes framework</a> and from <a href=https://en.wikipedia.org/wiki/Structural_equation_modeling target=_blank rel=noopener>Pearl&rsquo;s graphical models</a>. They also don&rsquo;t neglect time as an important consideration in cause-and-effect relationships. In fact, the third part of the book is dedicated to the topic of time-varying treatments and effects.</p><p>The practicality of the book is also demonstrated by the fact that it comes with code examples in multiple languages. In addition, the authors don&rsquo;t dwell too much on the philosophy of causality. While it is a fascinating topic, the opening paragraphs of the book make its goals clear:</p><blockquote><p>By reading this book you are expressing an interest in learning about causal inference. But, as a human being, you have already mastered the fundamental concepts of causal inference. You certainly know what a causal effect is; you clearly understand the difference between association and causation; and you have used this knowledge constantly throughout your life. In fact, had you not understood these causal concepts, you would have not survived long enough to read this chapter–or even to learn to read. As a toddler you would have jumped right into the swimming pool after observing that those who did so were later able to reach the jam jar. 
As a teenager, you would have skied down the most dangerous slopes after observing that those who did so were more likely to win the next ski race. As a parent, you would have refused to give antibiotics to your sick child after observing that those children who took their medicines were less likely to be playing in the park the next day.</p><p>Since you already understand the definition of causal effect and the difference between association and causation, do not expect to gain deep conceptual insights from this chapter. Rather, the purpose of this chapter is to introduce mathematical notation that formalizes the causal intuition that you already possess. Make sure that you can match your causal intuition with the mathematical notation introduced here. This notation is necessary to precisely define causal concepts, and we will use it throughout the book.</p></blockquote><p>I won&rsquo;t try to summarise the technical aspects of the book – partly because I don&rsquo;t fully understand it all, and partly because the book itself is already a summary of a very rich research area. However, I&rsquo;m likely to go back and reread the book in the future, with the goal of applying the techniques from the book to my work. I&rsquo;d also like to take <a href=https://www.edx.org/course/causal-diagrams-draw-assumptions-harvardx-ph559x target=_blank rel=noopener>Hernán&rsquo;s causal inference course</a> as a way of practising what I&rsquo;ve learned from the book. For people who want a non-technical summary of the topics covered by the book, I recommend the article <a href=https://ajph.aphapublications.org/doi/10.2105/AJPH.2018.304337 target=_blank rel=noopener><em>The c-word: Scientific euphemisms do not improve causal inference from observational data</em></a>. 
If you&rsquo;re curious about other (less practical) causality books I&rsquo;ve read, check out <a href=https://yanirseroussi.com/causal-inference-resources/>my causal inference resource list</a> and my two previous posts on the topic: <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/><em>Why you should stop worrying about deep learning and deepen your understanding of causality instead</em></a> and <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/><em>Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions</em></a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/statistics/>statistics</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on x" href="https://x.com/intent/tweet/?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&amp;hashtags=causalinference%2cdatascience%2cstatistics"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share The most practical causal inference book I’ve read (is still a draft) on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&amp;title=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;summary=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;source=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f&title=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 
512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on whatsapp" href="https://api.whatsapp.com/send?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29%20-%20https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 
1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on telegram" href="https://telegram.me/share/url?text=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&amp;url=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share The most practical causal inference book I’ve read (is still a draft) on ycombinator" href="https://news.ycombinator.com/submitlink?t=The%20most%20practical%20causal%20inference%20book%20I%e2%80%99ve%20read%20%28is%20still%20a%20draft%29&u=https%3a%2f%2fyanirseroussi.com%2f2018%2f12%2f24%2fthe-most-practical-causal-inference-book-ive-read-is-still-a-draft%2f"><svg width="30" height="30" viewBox="0 0 
512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/index.html b/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/index.html
index 7ce24914d..9e6effa78 100644
--- a/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/index.html
+++ b/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Hackers beware: Bootstrap sampling may be harmful | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="bootstrapping,data science,hackers,software engineering,statistics"><meta name=description content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. But things aren&rsquo;t that simple."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Hackers beware: Bootstrap sampling may be harmful"><meta property="og:description" content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without 
much statistical knowledge. But things aren&rsquo;t that simple."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"><meta property="og:image" content="https://yanirseroussi.com/warning-signs.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-01-07T21:07:56+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/warning-signs.jpg"><meta name=twitter:title content="Hackers beware: Bootstrap sampling may be harmful"><meta name=twitter:description content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. But things aren&rsquo;t that simple."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Hackers beware: Bootstrap sampling may be harmful","item":"https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Hackers beware: Bootstrap sampling may be harmful","name":"Hackers beware: Bootstrap sampling may be harmful","description":"Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. But things aren\u0026rsquo;t that simple.","keywords":["bootstrapping","data science","hackers","software engineering","statistics"],"articleBody":"Bootstrap sampling techniques are very appealing, as they don’t require knowing much about statistics and opaque formulas. 
Instead, all one needs to do is resample the given data many times, and calculate the desired statistics. Therefore, bootstrapping has been promoted as an easy way of modelling uncertainty to hackers who don’t have much statistical knowledge. For example, the main thesis of the excellent Statistics for Hackers talk by Jake VanderPlas is: “If you can write a for-loop, you can do statistics”. Similar ground was covered by Erik Bernhardsson in The Hacker’s Guide to Uncertainty Estimates, which provides more use cases for bootstrapping (with code examples). However, I’ve learned in the past few weeks that there are quite a few pitfalls in bootstrapping. Much of what I’ve learned is summarised in a paper titled What Teachers Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum by Tim Hesterberg. I doubt that many hackers would be motivated to read a paper with such a title, so my goal with this post is to make some of my discoveries more accessible to a wider audience. To learn more about the issues raised in this post, it’s worth reading Hesterberg’s paper and other linked resources.\nFor quick reference, here’s a summary of the advice in this post:\nUse an accurate method for estimating confidence intervals Use enough resamples – at least 10-15K Don’t compare confidence intervals visually Ensure that the basic assumptions apply to your situation Pitfall #1: Inaccurate confidence intervals Confidence intervals are a common way of quantifying the uncertainty in an estimate of a population parameter. The percentile method is one of the simplest bootstrapping approaches for generating confidence intervals. For example, let’s say we have a data sample of size n and we want to estimate a 95% confidence interval for the population mean. We take r bootstrap resamples from the original data sample, where each resample is a sample with replacement of size n. We calculate the mean of each resample and store the means in a sorted array. 
We then return the 95% confidence interval as the values that fall at the 0.025r and 0.975r indices of the sorted array (i.e., the 2.5% and 97.5% percentiles). The following table shows what the first two resamples may look like for a data sample of size n=5.\nOriginal sample Resample #1 Resample #2 … Values 10 30 20 … 12 20 20 20 12 30 30 12 30 45 45 30 Mean 23.4 23.8 26 … The percentile method is nice and simple. Any programmer should be able to easily implement it in their favourite programming language, assuming they can actually program. Unfortunately, this method is just not accurate enough for small sample sizes. Quoting Hesterberg (emphasis mine):\nThe sample sizes needed for different intervals to satisfy the “reasonably accurate” (off by no more than 10% on each side) criterion are: n ≥ 101 for the bootstrap t, 220 for the skewness-adjusted t statistic, 2,235 for expanded percentile, 2,383 for percentile, 4,815 for ordinary t (which I have rounded up to 5,000 above), 5,063 for t with bootstrap standard errors and something over 8,000 for the reverse percentile method.\nIn a shorter version of the paper cited above, Hesterberg concludes that:\nIn practice, implementing some of the more accurate bootstrap methods is difficult (especially those not described here), and people should use a package rather than attempt this themselves.\nIn short, make sure you’re using an accurate method for estimating confidence intervals when dealing with sample sizes of less than a few thousand values. Using a package is a great idea, but unfortunately I don’t know of any Python bootstrapping package that is feature-complete: ARCH and scikits-bootstrap support advanced confidence interval methods but don’t support analysis of two samples of uneven sizes, while bootstrapped works with samples of uneven sizes but only supports the percentile and the reverse percentile method (which Hesterberg found to be even less accurate). 
If you know of any better Python packages, please let me know! (I don’t use R, but I suspect the situation is better there). Update: ARCH now supports analysis of samples of uneven sizes following an issue I reported. It seems to be the best Python bootstrapping package, so I recommend using it.\nPitfall #2: Not enough resamples Accurate bootstrap estimates require a large number of resamples. Many code snippets use 1,000 resamples, probably because it looks like a large number. However, seeming large isn’t enough. Quoting Hesterberg again:\nFor both the bootstrap and permutation tests, the number of resamples needs to be 15,000 or more, for 95% probability that simulation-based one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. I recommend r = 10,000 for routine use, and more when accuracy matters.\n[…]\nWe want decisions to depend on the data, not random variation in the Monte Carlo implementation. We used r = 500,000 in the Verizon project.\nThat’s right, half a million resamples! Accuracy mattered in the Verizon case, as the results of the analysis determined whether large penalties were paid or not. In short, use at least 10-15,000 resamples to be safe. Don’t use 1,000.\nPitfall #3: Comparison of single-sample confidence intervals Confidence intervals are commonly used to decide if the difference between two samples is statistically significant. Bootstrapping provides a straightforward way of estimating confidence intervals without making assumptions about the way the data was generated. For example, given two samples, we can obtain confidence intervals for the mean of each sample and end up with a plot like this:\nWhen looking at this plot, some people may conclude that the difference between the groups isn’t statistically significant because the confidence intervals overlap. 
However, overlapping confidence intervals don’t imply a lack of statistical significance because it is possible for the confidence interval of the difference between the sample means to not contain zero. Prasanna Parasurama explained why this happens in this post. While this issue isn’t unique to bootstrapping, it’s worth remembering that when comparing two groups, we need to obtain the confidence interval for the difference in the parameter we’re comparing, not compare single-sample confidence intervals.\nFor a concrete example, consider a case where we’re looking at a binary outcomes (yes/no or 1/0), which occur in coin flips or online A/B tests. Sample A consists of 2,150 zeroes and 350 ones, while sample B consists of 2,250 zeroes and 440 ones. As these are fairly large samples, we can use the bootstrap percentile method to obtain 95% confidence intervals for the mean of each sample. As the following figure shows, these intervals overlap. If we use the same method to also obtain a 95% confidence interval for the difference in means between B and A, we see that it doesn’t include zero. Therefore, we can say that the difference between B and A is statistically significant, despite the overlap between the single-sample confidence intervals.\nIt’s worth noting that when analysing binary outcomes, we can make stronger assumptions about the data rather than use bootstrapping to obtain confidence intervals. Erik Bernhardsson suggests using the Beta distribution to obtain single-sample confidence intervals, but as we’ve seen, they don’t tell us enough about the differences between samples. I suggested using a Bayesian approach in the past, which makes explicit modelling assumptions that allow us to encode our prior knowledge on the specific environment where the data was generated. 
For example, when running online A/B tests, we often have a ballpark figure for reasonable results, which can be used in the Bayesian A/B testing calculator I built.\nPitfall #4: Unrepresentative and dependent samples While the basic bootstrap makes no assumption about the underlying distribution of the data, it is not assumption-free. For example, when dealing with correlated data points from a time series, using the basic bootstrapping approach is wrong because it assumes that the data points are independent. Instead, a block bootstrap should be used – see the ARCH package for some implementation examples. In addition, bootstrapping doesn’t solve problems with the underlying sampling approach. For example, the data sample may not be representative of the population because of its small size, or there may be selection biases and measurement errors. No amount of bootstrapping is going to help with such issues. In general, it always helps to be aware of the data’s generation process, e.g., different considerations apply when dealing with data from online experiments versus observational studies.\nConclusion and next steps While bootstrapping is a powerful method, its initial impression of simplicity is misleading. To draw valid conclusions, it’s a good idea to use a package and be aware of considerations that are specific to the analysed data sample. However, if you’re already increasing your awareness of the data and its generation process, it may make sense to explicitly encode your assumptions in the model. This is where another hacker resource would come in handy: Probabilistic Programming \u0026 Bayesian Methods for Hackers by Cam Davidson-Pilon. 
Admittedly, it’s a bit longer than the average blog post or conference talk, but it is worth reading.\nGoing down the bootstrapping rabbit hole has reminded me of an important lesson: Blog posts and talks – especially ones with the word hacker in the title – may be a good starting point, but they shouldn’t be relied on for serious work. Instead, it is better to consult peer-reviewed resources and textbooks, such as the references listed in ARCH’s documentation. In my future explorations of bootstrapping and other methods, I will heed Abraham Lincoln’s timeless advice to not trust everything I read on the internet.\nUpdate (Oct 2019): I published a post summarising a talk I gave on the topic, complete with simulation code that illustrates the issues with some bootstrapping algorithms.\n","wordCount":"1625","inLanguage":"en","image":"https://yanirseroussi.com/warning-signs.jpg","datePublished":"2019-01-07T21:07:56Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" 
fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Hackers beware: Bootstrap sampling may be harmful</h1><div class=post-meta><span title='2019-01-07 21:07:56 +0000 UTC'>January 7, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-01-08-hackers-beware-bootstrap-sampling-may-be-harmful/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg 3531w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg alt width=3531 height=1200></figure><div class=post-content><p><a href=https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 target=_blank rel=noopener>Bootstrap sampling techniques</a> are very appealing, as they don&rsquo;t require knowing much about statistics and opaque formulas. Instead, all one needs to do is resample the given data many times, and calculate the desired statistics. Therefore, bootstrapping has been promoted as an easy way of modelling uncertainty to hackers who don&rsquo;t have much statistical knowledge. For example, the main thesis of the excellent <a href=https://speakerdeck.com/jakevdp/statistics-for-hackers target=_blank rel=noopener><em>Statistics for Hackers</em></a> talk by Jake VanderPlas is: <em>&ldquo;If you can write a for-loop, you can do statistics&rdquo;</em>. Similar ground was covered by Erik Bernhardsson in <a href=https://erikbern.com/2018/10/08/the-hackers-guide-to-uncertainty-estimates.html target=_blank rel=noopener><em>The Hacker&rsquo;s Guide to Uncertainty Estimates</em></a>, which provides more use cases for bootstrapping (with code examples). 
However, I&rsquo;ve learned in the past few weeks that there are quite a few pitfalls in bootstrapping. Much of what I&rsquo;ve learned is summarised in a paper titled <a href=https://arxiv.org/abs/1411.5279 target=_blank rel=noopener><em>What Teachers Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum</em></a> by Tim Hesterberg. I doubt that many hackers would be motivated to read a paper with such a title, so my goal with this post is to make some of my discoveries more accessible to a wider audience. To learn more about the issues raised in this post, it&rsquo;s worth reading Hesterberg&rsquo;s paper and other linked resources.</p><p>For quick reference, here&rsquo;s a summary of the advice in this post:</p><ul><li>Use an accurate method for estimating confidence intervals</li><li>Use enough resamples – at least 10-15K</li><li>Don&rsquo;t compare confidence intervals visually</li><li>Ensure that the basic assumptions apply to your situation</li></ul><h2 id=pitfall-1-inaccurate-confidence-intervals>Pitfall #1: Inaccurate confidence intervals<a hidden class=anchor aria-hidden=true href=#pitfall-1-inaccurate-confidence-intervals>#</a></h2><p><a href=https://en.wikipedia.org/wiki/Confidence_interval target=_blank rel=noopener>Confidence intervals</a> are a common way of quantifying the uncertainty in an estimate of a population parameter. The percentile method is one of the simplest bootstrapping approaches for generating confidence intervals. For example, let&rsquo;s say we have a data sample of size <code>n</code> and we want to estimate a 95% confidence interval for the population mean. We take <code>r</code> bootstrap <em>resamples</em> from the original data sample, where each resample is a sample with replacement of size <code>n</code>. We calculate the mean of each resample and store the means in a sorted array. 
We then return the 95% confidence interval as the values that fall at the <code>0.025r</code> and <code>0.975r</code> indices of the sorted array (i.e., the 2.5% and 97.5% percentiles). The following table shows what the first two resamples may look like for a data sample of size <code>n=5</code>.</p><table><thead><tr><th></th><th>Original sample</th><th>Resample #1</th><th>Resample #2</th><th>&mldr;</th></tr></thead><tbody><tr><td><strong>Values</strong></td><td>10</td><td>30</td><td>20</td><td>&mldr;</td></tr><tr><td></td><td>12</td><td>20</td><td>20</td><td></td></tr><tr><td></td><td>20</td><td>12</td><td>30</td><td></td></tr><tr><td></td><td>30</td><td>12</td><td>30</td><td></td></tr><tr><td></td><td>45</td><td>45</td><td>30</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td><strong>Mean</strong></td><td><em>23.4</em></td><td><em>23.8</em></td><td><em>26</em></td><td><em>&mldr;</em></td></tr></tbody></table><p>The percentile method is nice and simple. Any programmer should be able to easily implement it in their favourite programming language, assuming <a href=https://blog.codinghorror.com/why-cant-programmers-program/ target=_blank rel=noopener>they can actually program</a>. Unfortunately, <strong>this method is just not accurate enough for small sample sizes</strong>. 
Quoting Hesterberg (emphasis mine):</p><blockquote><p>The sample sizes needed for different intervals to satisfy the &ldquo;reasonably accurate&rdquo; (off by no more than 10% on each side) criterion are: n ≥ 101 for the bootstrap t, 220 for the skewness-adjusted t statistic, 2,235 for expanded percentile, <b style=font-weight:700>2,383 for percentile</b>, 4,815 for ordinary t (which I have rounded up to 5,000 above), 5,063 for t with bootstrap standard errors and something over 8,000 for the reverse percentile method.</p></blockquote><p>In <a href=https://storage.googleapis.com/pub-tools-public-publication-data/pdf/44859.pdf target=_blank rel=noopener>a shorter version of the paper cited above</a>, Hesterberg concludes that:</p><blockquote><p>In practice, implementing some of the more accurate bootstrap methods is difficult (especially those not described here), and people should use a package rather than attempt this themselves.</p></blockquote><p>In short, <strong>make sure you&rsquo;re using an accurate method for estimating confidence intervals when dealing with sample sizes of less than a few thousand values</strong>. Using a package is a great idea, but unfortunately I don&rsquo;t know of any Python bootstrapping package that is feature-complete: <a href=https://github.com/bashtage/arch/ target=_blank rel=noopener>ARCH</a> and <a href=https://github.com/cgevans/scikits-bootstrap/ target=_blank rel=noopener>scikits-bootstrap</a> support advanced confidence interval methods but don&rsquo;t support analysis of two samples of uneven sizes, while <a href=https://github.com/facebookincubator/bootstrapped/ target=_blank rel=noopener>bootstrapped</a> works with samples of uneven sizes but only supports the percentile and the reverse percentile method (which Hesterberg found to be even less accurate). If you know of any better Python packages, please let me know! (I don&rsquo;t use R, but I suspect the situation is better there). 
<strong>Update</strong>: <a href=https://github.com/bashtage/arch/releases/tag/4.8.0 target=_blank rel=noopener>ARCH now supports</a> analysis of samples of uneven sizes <a href=https://github.com/bashtage/arch/issues/260 target=_blank rel=noopener>following an issue I reported</a>. It seems to be the best Python bootstrapping package, so I recommend using it.</p><h2 id=pitfall-2-not-enough-resamples>Pitfall #2: Not enough resamples<a hidden class=anchor aria-hidden=true href=#pitfall-2-not-enough-resamples>#</a></h2><p>Accurate bootstrap estimates require a large number of resamples. Many code snippets use 1,000 resamples, probably because it looks like a large number. However, <em>seeming</em> large isn&rsquo;t enough. Quoting Hesterberg again:</p><blockquote><p>For both the bootstrap and permutation tests, the number of resamples needs to be 15,000 or more, for 95% probability that simulation-based one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. I recommend r = 10,000 for routine use, and more when accuracy matters.</p><p>[&mldr;]</p><p>We want decisions to depend on the data, not random variation in the Monte Carlo implementation. We used r = 500,000 in the Verizon project.</p></blockquote><p>That&rsquo;s right, half a million resamples! Accuracy mattered in the Verizon case, as the results of the analysis determined whether large penalties were paid or not. In short, <strong>use at least 10-15,000 resamples to be safe</strong>. Don&rsquo;t use 1,000.</p><h2 id=pitfall-3-comparison-of-single-sample-confidence-intervals>Pitfall #3: Comparison of single-sample confidence intervals<a hidden class=anchor aria-hidden=true href=#pitfall-3-comparison-of-single-sample-confidence-intervals>#</a></h2><p>Confidence intervals are commonly used to decide if the difference between two samples is statistically significant. 
Bootstrapping provides a straightforward way of estimating confidence intervals without making assumptions about the way the data was generated. For example, given two samples, we can obtain confidence intervals for the mean of each sample and end up with a plot like this:</p><figure><a href=overlapping-confidence-intervals.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="bootstrapping,data science,hackers,software engineering,statistics"><meta name=description content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. But things aren&rsquo;t that simple."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Hackers beware: Bootstrap sampling may be harmful"><meta property="og:description" content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without 
much statistical knowledge. But things aren&rsquo;t that simple."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"><meta property="og:image" content="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-01-07T21:07:56+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg"><meta name=twitter:title content="Hackers beware: Bootstrap sampling may be harmful"><meta name=twitter:description content="Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. But things aren&rsquo;t that simple."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Hackers beware: Bootstrap sampling may be harmful","item":"https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Hackers beware: Bootstrap sampling may be harmful","name":"Hackers beware: Bootstrap sampling may be harmful","description":"Bootstrap sampling has been promoted as an easy way of modelling uncertainty to hackers without much statistical knowledge. 
But things aren\u0026rsquo;t that simple.","keywords":["bootstrapping","data science","hackers","software engineering","statistics"],"articleBody":"Bootstrap sampling techniques are very appealing, as they don’t require knowing much about statistics and opaque formulas. Instead, all one needs to do is resample the given data many times, and calculate the desired statistics. Therefore, bootstrapping has been promoted as an easy way of modelling uncertainty to hackers who don’t have much statistical knowledge. For example, the main thesis of the excellent Statistics for Hackers talk by Jake VanderPlas is: “If you can write a for-loop, you can do statistics”. Similar ground was covered by Erik Bernhardsson in The Hacker’s Guide to Uncertainty Estimates, which provides more use cases for bootstrapping (with code examples). However, I’ve learned in the past few weeks that there are quite a few pitfalls in bootstrapping. Much of what I’ve learned is summarised in a paper titled What Teachers Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum by Tim Hesterberg. I doubt that many hackers would be motivated to read a paper with such a title, so my goal with this post is to make some of my discoveries more accessible to a wider audience. To learn more about the issues raised in this post, it’s worth reading Hesterberg’s paper and other linked resources.\nFor quick reference, here’s a summary of the advice in this post:\nUse an accurate method for estimating confidence intervals Use enough resamples – at least 10-15K Don’t compare confidence intervals visually Ensure that the basic assumptions apply to your situation Pitfall #1: Inaccurate confidence intervals Confidence intervals are a common way of quantifying the uncertainty in an estimate of a population parameter. The percentile method is one of the simplest bootstrapping approaches for generating confidence intervals. 
For example, let’s say we have a data sample of size n and we want to estimate a 95% confidence interval for the population mean. We take r bootstrap resamples from the original data sample, where each resample is a sample with replacement of size n. We calculate the mean of each resample and store the means in a sorted array. We then return the 95% confidence interval as the values that fall at the 0.025r and 0.975r indices of the sorted array (i.e., the 2.5% and 97.5% percentiles). The following table shows what the first two resamples may look like for a data sample of size n=5.\nOriginal sample Resample #1 Resample #2 … Values 10 30 20 … 12 20 20 20 12 30 30 12 30 45 45 30 Mean 23.4 23.8 26 … The percentile method is nice and simple. Any programmer should be able to easily implement it in their favourite programming language, assuming they can actually program. Unfortunately, this method is just not accurate enough for small sample sizes. Quoting Hesterberg (emphasis mine):\nThe sample sizes needed for different intervals to satisfy the “reasonably accurate” (off by no more than 10% on each side) criterion are: n ≥ 101 for the bootstrap t, 220 for the skewness-adjusted t statistic, 2,235 for expanded percentile, 2,383 for percentile, 4,815 for ordinary t (which I have rounded up to 5,000 above), 5,063 for t with bootstrap standard errors and something over 8,000 for the reverse percentile method.\nIn a shorter version of the paper cited above, Hesterberg concludes that:\nIn practice, implementing some of the more accurate bootstrap methods is difficult (especially those not described here), and people should use a package rather than attempt this themselves.\nIn short, make sure you’re using an accurate method for estimating confidence intervals when dealing with sample sizes of less than a few thousand values. 
Using a package is a great idea, but unfortunately I don’t know of any Python bootstrapping package that is feature-complete: ARCH and scikits-bootstrap support advanced confidence interval methods but don’t support analysis of two samples of uneven sizes, while bootstrapped works with samples of uneven sizes but only supports the percentile and the reverse percentile method (which Hesterberg found to be even less accurate). If you know of any better Python packages, please let me know! (I don’t use R, but I suspect the situation is better there). Update: ARCH now supports analysis of samples of uneven sizes following an issue I reported. It seems to be the best Python bootstrapping package, so I recommend using it.\nPitfall #2: Not enough resamples Accurate bootstrap estimates require a large number of resamples. Many code snippets use 1,000 resamples, probably because it looks like a large number. However, seeming large isn’t enough. Quoting Hesterberg again:\nFor both the bootstrap and permutation tests, the number of resamples needs to be 15,000 or more, for 95% probability that simulation-based one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. I recommend r = 10,000 for routine use, and more when accuracy matters.\n[…]\nWe want decisions to depend on the data, not random variation in the Monte Carlo implementation. We used r = 500,000 in the Verizon project.\nThat’s right, half a million resamples! Accuracy mattered in the Verizon case, as the results of the analysis determined whether large penalties were paid or not. In short, use at least 10-15,000 resamples to be safe. Don’t use 1,000.\nPitfall #3: Comparison of single-sample confidence intervals Confidence intervals are commonly used to decide if the difference between two samples is statistically significant. Bootstrapping provides a straightforward way of estimating confidence intervals without making assumptions about the way the data was generated. 
For example, given two samples, we can obtain confidence intervals for the mean of each sample and end up with a plot like this:\nWhen looking at this plot, some people may conclude that the difference between the groups isn’t statistically significant because the confidence intervals overlap. However, overlapping confidence intervals don’t imply a lack of statistical significance because it is possible for the confidence interval of the difference between the sample means to not contain zero. Prasanna Parasurama explained why this happens in this post. While this issue isn’t unique to bootstrapping, it’s worth remembering that when comparing two groups, we need to obtain the confidence interval for the difference in the parameter we’re comparing, not compare single-sample confidence intervals.\nFor a concrete example, consider a case where we’re looking at a binary outcomes (yes/no or 1/0), which occur in coin flips or online A/B tests. Sample A consists of 2,150 zeroes and 350 ones, while sample B consists of 2,250 zeroes and 440 ones. As these are fairly large samples, we can use the bootstrap percentile method to obtain 95% confidence intervals for the mean of each sample. As the following figure shows, these intervals overlap. If we use the same method to also obtain a 95% confidence interval for the difference in means between B and A, we see that it doesn’t include zero. Therefore, we can say that the difference between B and A is statistically significant, despite the overlap between the single-sample confidence intervals.\nIt’s worth noting that when analysing binary outcomes, we can make stronger assumptions about the data rather than use bootstrapping to obtain confidence intervals. Erik Bernhardsson suggests using the Beta distribution to obtain single-sample confidence intervals, but as we’ve seen, they don’t tell us enough about the differences between samples. 
I suggested using a Bayesian approach in the past, which makes explicit modelling assumptions that allow us to encode our prior knowledge on the specific environment where the data was generated. For example, when running online A/B tests, we often have a ballpark figure for reasonable results, which can be used in the Bayesian A/B testing calculator I built.\nPitfall #4: Unrepresentative and dependent samples While the basic bootstrap makes no assumption about the underlying distribution of the data, it is not assumption-free. For example, when dealing with correlated data points from a time series, using the basic bootstrapping approach is wrong because it assumes that the data points are independent. Instead, a block bootstrap should be used – see the ARCH package for some implementation examples. In addition, bootstrapping doesn’t solve problems with the underlying sampling approach. For example, the data sample may not be representative of the population because of its small size, or there may be selection biases and measurement errors. No amount of bootstrapping is going to help with such issues. In general, it always helps to be aware of the data’s generation process, e.g., different considerations apply when dealing with data from online experiments versus observational studies.\nConclusion and next steps While bootstrapping is a powerful method, its initial impression of simplicity is misleading. To draw valid conclusions, it’s a good idea to use a package and be aware of considerations that are specific to the analysed data sample. However, if you’re already increasing your awareness of the data and its generation process, it may make sense to explicitly encode your assumptions in the model. This is where another hacker resource would come in handy: Probabilistic Programming \u0026 Bayesian Methods for Hackers by Cam Davidson-Pilon. 
Admittedly, it’s a bit longer than the average blog post or conference talk, but it is worth reading.\nGoing down the bootstrapping rabbit hole has reminded me of an important lesson: Blog posts and talks – especially ones with the word hacker in the title – may be a good starting point, but they shouldn’t be relied on for serious work. Instead, it is better to consult peer-reviewed resources and textbooks, such as the references listed in ARCH’s documentation. In my future explorations of bootstrapping and other methods, I will heed Abraham Lincoln’s timeless advice to not trust everything I read on the internet.\nUpdate (Oct 2019): I published a post summarising a talk I gave on the topic, complete with simulation code that illustrates the issues with some bootstrapping algorithms.\n","wordCount":"1625","inLanguage":"en","image":"https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg","datePublished":"2019-01-07T21:07:56Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" 
xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Hackers beware: Bootstrap sampling may be harmful</h1><div class=post-meta><span title='2019-01-07 21:07:56 +0000 UTC'>January 7, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-01-08-hackers-beware-bootstrap-sampling-may-be-harmful/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs_hu66da5e7e5a432a77b79afd3fa924437e_1490615_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg 3531w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/warning-signs.jpg alt width=3531 height=1200></figure><div class=post-content><p><a href=https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 target=_blank rel=noopener>Bootstrap sampling techniques</a> are very appealing, as they don&rsquo;t require knowing much about statistics and opaque formulas. Instead, all one needs to do is resample the given data many times, and calculate the desired statistics. Therefore, bootstrapping has been promoted as an easy way of modelling uncertainty to hackers who don&rsquo;t have much statistical knowledge. For example, the main thesis of the excellent <a href=https://speakerdeck.com/jakevdp/statistics-for-hackers target=_blank rel=noopener><em>Statistics for Hackers</em></a> talk by Jake VanderPlas is: <em>&ldquo;If you can write a for-loop, you can do statistics&rdquo;</em>. 
Similar ground was covered by Erik Bernhardsson in <a href=https://erikbern.com/2018/10/08/the-hackers-guide-to-uncertainty-estimates.html target=_blank rel=noopener><em>The Hacker&rsquo;s Guide to Uncertainty Estimates</em></a>, which provides more use cases for bootstrapping (with code examples). However, I&rsquo;ve learned in the past few weeks that there are quite a few pitfalls in bootstrapping. Much of what I&rsquo;ve learned is summarised in a paper titled <a href=https://arxiv.org/abs/1411.5279 target=_blank rel=noopener><em>What Teachers Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum</em></a> by Tim Hesterberg. I doubt that many hackers would be motivated to read a paper with such a title, so my goal with this post is to make some of my discoveries more accessible to a wider audience. To learn more about the issues raised in this post, it&rsquo;s worth reading Hesterberg&rsquo;s paper and other linked resources.</p><p>For quick reference, here&rsquo;s a summary of the advice in this post:</p><ul><li>Use an accurate method for estimating confidence intervals</li><li>Use enough resamples – at least 10-15K</li><li>Don&rsquo;t compare confidence intervals visually</li><li>Ensure that the basic assumptions apply to your situation</li></ul><h2 id=pitfall-1-inaccurate-confidence-intervals>Pitfall #1: Inaccurate confidence intervals<a hidden class=anchor aria-hidden=true href=#pitfall-1-inaccurate-confidence-intervals>#</a></h2><p><a href=https://en.wikipedia.org/wiki/Confidence_interval target=_blank rel=noopener>Confidence intervals</a> are a common way of quantifying the uncertainty in an estimate of a population parameter. The percentile method is one of the simplest bootstrapping approaches for generating confidence intervals. For example, let&rsquo;s say we have a data sample of size <code>n</code> and we want to estimate a 95% confidence interval for the population mean. 
We take <code>r</code> bootstrap <em>resamples</em> from the original data sample, where each resample is a sample with replacement of size <code>n</code>. We calculate the mean of each resample and store the means in a sorted array. We then return the 95% confidence interval as the values that fall at the <code>0.025r</code> and <code>0.975r</code> indices of the sorted array (i.e., the 2.5% and 97.5% percentiles). The following table shows what the first two resamples may look like for a data sample of size <code>n=5</code>.</p><table><thead><tr><th></th><th>Original sample</th><th>Resample #1</th><th>Resample #2</th><th>&mldr;</th></tr></thead><tbody><tr><td><strong>Values</strong></td><td>10</td><td>30</td><td>20</td><td>&mldr;</td></tr><tr><td></td><td>12</td><td>20</td><td>20</td><td></td></tr><tr><td></td><td>20</td><td>12</td><td>30</td><td></td></tr><tr><td></td><td>30</td><td>12</td><td>30</td><td></td></tr><tr><td></td><td>45</td><td>45</td><td>30</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td><strong>Mean</strong></td><td><em>23.4</em></td><td><em>23.8</em></td><td><em>26</em></td><td><em>&mldr;</em></td></tr></tbody></table><p>The percentile method is nice and simple. Any programmer should be able to easily implement it in their favourite programming language, assuming <a href=https://blog.codinghorror.com/why-cant-programmers-program/ target=_blank rel=noopener>they can actually program</a>. Unfortunately, <strong>this method is just not accurate enough for small sample sizes</strong>. 
Quoting Hesterberg (emphasis mine):</p><blockquote><p>The sample sizes needed for different intervals to satisfy the &ldquo;reasonably accurate&rdquo; (off by no more than 10% on each side) criterion are: n ≥ 101 for the bootstrap t, 220 for the skewness-adjusted t statistic, 2,235 for expanded percentile, <b style=font-weight:700>2,383 for percentile</b>, 4,815 for ordinary t (which I have rounded up to 5,000 above), 5,063 for t with bootstrap standard errors and something over 8,000 for the reverse percentile method.</p></blockquote><p>In <a href=https://storage.googleapis.com/pub-tools-public-publication-data/pdf/44859.pdf target=_blank rel=noopener>a shorter version of the paper cited above</a>, Hesterberg concludes that:</p><blockquote><p>In practice, implementing some of the more accurate bootstrap methods is difficult (especially those not described here), and people should use a package rather than attempt this themselves.</p></blockquote><p>In short, <strong>make sure you&rsquo;re using an accurate method for estimating confidence intervals when dealing with sample sizes of less than a few thousand values</strong>. Using a package is a great idea, but unfortunately I don&rsquo;t know of any Python bootstrapping package that is feature-complete: <a href=https://github.com/bashtage/arch/ target=_blank rel=noopener>ARCH</a> and <a href=https://github.com/cgevans/scikits-bootstrap/ target=_blank rel=noopener>scikits-bootstrap</a> support advanced confidence interval methods but don&rsquo;t support analysis of two samples of uneven sizes, while <a href=https://github.com/facebookincubator/bootstrapped/ target=_blank rel=noopener>bootstrapped</a> works with samples of uneven sizes but only supports the percentile and the reverse percentile method (which Hesterberg found to be even less accurate). If you know of any better Python packages, please let me know! (I don&rsquo;t use R, but I suspect the situation is better there). 
<strong>Update</strong>: <a href=https://github.com/bashtage/arch/releases/tag/4.8.0 target=_blank rel=noopener>ARCH now supports</a> analysis of samples of uneven sizes <a href=https://github.com/bashtage/arch/issues/260 target=_blank rel=noopener>following an issue I reported</a>. It seems to be the best Python bootstrapping package, so I recommend using it.</p><h2 id=pitfall-2-not-enough-resamples>Pitfall #2: Not enough resamples<a hidden class=anchor aria-hidden=true href=#pitfall-2-not-enough-resamples>#</a></h2><p>Accurate bootstrap estimates require a large number of resamples. Many code snippets use 1,000 resamples, probably because it looks like a large number. However, <em>seeming</em> large isn&rsquo;t enough. Quoting Hesterberg again:</p><blockquote><p>For both the bootstrap and permutation tests, the number of resamples needs to be 15,000 or more, for 95% probability that simulation-based one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. I recommend r = 10,000 for routine use, and more when accuracy matters.</p><p>[&mldr;]</p><p>We want decisions to depend on the data, not random variation in the Monte Carlo implementation. We used r = 500,000 in the Verizon project.</p></blockquote><p>That&rsquo;s right, half a million resamples! Accuracy mattered in the Verizon case, as the results of the analysis determined whether large penalties were paid or not. In short, <strong>use at least 10-15,000 resamples to be safe</strong>. Don&rsquo;t use 1,000.</p><h2 id=pitfall-3-comparison-of-single-sample-confidence-intervals>Pitfall #3: Comparison of single-sample confidence intervals<a hidden class=anchor aria-hidden=true href=#pitfall-3-comparison-of-single-sample-confidence-intervals>#</a></h2><p>Confidence intervals are commonly used to decide if the difference between two samples is statistically significant. 
Bootstrapping provides a straightforward way of estimating confidence intervals without making assumptions about the way the data was generated. For example, given two samples, we can obtain confidence intervals for the mean of each sample and end up with a plot like this:</p><figure><a href=overlapping-confidence-intervals.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/overlapping-confidence-intervals_hue7fc18354688a60dc90db601b41630cc_12060_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/overlapping-confidence-intervals_hue7fc18354688a60dc90db601b41630cc_12060_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/overlapping-confidence-intervals_hue7fc18354688a60dc90db601b41630cc_12060_720x0_resize_box_3.png 720w,
diff --git a/2019/10/06/bootstrapping-the-right-way/index.html b/2019/10/06/bootstrapping-the-right-way/index.html
index ff46ad225..0aeb5e26d 100644
--- a/2019/10/06/bootstrapping-the-right-way/index.html
+++ b/2019/10/06/bootstrapping-the-right-way/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Bootstrapping the right way? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="analytics,data science,software engineering,statistics"><meta name=description content="Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Bootstrapping the right way?"><meta property="og:description" content="Video and summary of a talk I gave at YOW! 
Data on bootstrap estimation of confidence intervals."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"><meta property="og:image" content="https://yanirseroussi.com/revenue-confidence-intervals.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-10-06T06:48:07+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/revenue-confidence-intervals.png"><meta name=twitter:title content="Bootstrapping the right way?"><meta name=twitter:description content="Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Bootstrapping the right way?","item":"https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Bootstrapping the right way?","name":"Bootstrapping the right way?","description":"Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals.","keywords":["analytics","data science","software engineering","statistics"],"articleBody":"Bootstrapping the right way is a talk I gave earlier this year at the YOW! Data conference in Sydney. You can now watch the video of the talk and have a look through the slides. The content of the talk is similar to a post I published on bootstrapping pitfalls, with some additional simulations.\nThe main takeaways shared in the talk are:\nDon’t compare single-sample confidence intervals by eye Use enough resamples (15K?) 
Use a solid bootstrapping package (e.g., Python ARCH) Use the right bootstrap for the job Consider going parametric Bayesian Test all the things Testing all the things typically requires writing code, which I did for the talk. You can browse through it in this notebook. The most interesting findings from my tests are summarised by the following figure.\nThe figure shows how the accuracy of confidence interval estimation varies by algorithm, sample size, and the number of bootstrapping resamples on a synthetic revenue dataset. This sort of dataset may occur in freemium scenarios, where several product variations are offered at a few price tiers, including a price of zero (i.e., free). In all cases, the dashed line denotes the requested confidence level of 95%, i.e., the true difference in means between the two revenue distributions should be inside the confidence interval in approximately 95% of the simulations for it to be accurate. Unfortunately, it is clear that both the percentile and BCa algorithms perform poorly on the simulated data. Even with a sample size of 10K, they both yield “95%” confidence intervals that contain the true difference in means less than 90% of the time, i.e., the intervals are too narrow. By contrast, the studentized algorithm gets much closer to the requested confidence level, but this comes at the price of considerably longer runtime due to the need for nested bootstrapping.\nNote that the results presented in the talk are slightly different from the figure above. The difference is due to a small bug in the simulation code: I used a constant random seed for all the bootstrapping simulation iterations (every iteration still contained different data). This has led to the surprising finding that accuracy with 10,000 resamples was lower than with 1,000 resamples. I attributed that finding to dataset quirks, and noted that my results may not generalise to all cases. 
Indeed, I recently ran a similar set of experiments on different data as part of my work at Automattic, and found that the studentized algorithm accuracy wasn’t as impressive as the results shown here.\nIn addition to synthetic data, the experiments I ran at Automattic included an implementation of an idea by my colleague, Demet Dagdelen: Test accuracy on samples from the full population for a given period (e.g., all sales over a calendar year). In such cases, the full population is well-defined. Therefore, we know the value of the “true” parameters, and we can run the same simulations as on synthetic data. While I can’t share that data, I can say that all algorithms performed much worse on real data than on simulated data. Therefore, we decided to follow the penultimate takeaway and use a parametric Bayesian approach for modelling our data. We may share insights from that line of work on data.blog in the future. In the meantime, comments are very welcome!\nUpdate: You can find more accurate simulations in this post.\n","wordCount":"559","inLanguage":"en","image":"https://yanirseroussi.com/revenue-confidence-intervals.png","datePublished":"2019-10-06T06:48:07Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for 
Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Bootstrapping the right way?</h1><div class=post-meta><span title='2019-10-06 06:48:07 +0000 UTC'>October 6, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-10-06-bootstrapping-the-right-way/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_360x0_resize_box_3.png 360w 
,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png 1765w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png alt width=1765 height=666></figure><div class=post-content><p><em>Bootstrapping the right way</em> is a talk I gave earlier this year at the YOW! Data conference in Sydney. You can now watch the video of the talk and have a look through <a href=https://yanirs.github.io/talks/bootstrapping-the-right-way/ target=_blank rel=noopener>the slides</a>. 
The content of the talk is similar to <a href=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/>a post I published on bootstrapping pitfalls</a>, with some additional simulations.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/2wZXejYz-e0 style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>The main takeaways shared in the talk are:</p><ul><li>Don&rsquo;t compare single-sample confidence intervals by eye</li><li>Use enough resamples (15K?)</li><li>Use a solid bootstrapping package (e.g., <a href=https://arch.readthedocs.io/ target=_blank rel=noopener>Python ARCH</a>)</li><li>Use the right bootstrap for the job</li><li>Consider going parametric Bayesian</li><li>Test all the things</li></ul><p>Testing all the things typically requires writing code, which I did for the talk. You can browse through it in <a href=https://github.com/yanirs/yanirs.github.io/blob/master/talks/bootstrapping-the-right-way/notebook.ipynb target=_blank rel=noopener>this notebook</a>. The most interesting findings from my tests are summarised by the following figure.</p><figure><a href=revenue-confidence-intervals.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="analytics,data science,software engineering,statistics"><meta name=description content="Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Bootstrapping the right way?"><meta property="og:description" content="Video and summary of a talk I gave at YOW! 
Data on bootstrap estimation of confidence intervals."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"><meta property="og:image" content="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-10-06T06:48:07+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png"><meta name=twitter:title content="Bootstrapping the right way?"><meta name=twitter:description content="Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Bootstrapping the right way?","item":"https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Bootstrapping the right way?","name":"Bootstrapping the right way?","description":"Video and summary of a talk I gave at YOW! Data on bootstrap estimation of confidence intervals.","keywords":["analytics","data science","software engineering","statistics"],"articleBody":"Bootstrapping the right way is a talk I gave earlier this year at the YOW! Data conference in Sydney. You can now watch the video of the talk and have a look through the slides. 
The content of the talk is similar to a post I published on bootstrapping pitfalls, with some additional simulations.\nThe main takeaways shared in the talk are:\nDon’t compare single-sample confidence intervals by eye Use enough resamples (15K?) Use a solid bootstrapping package (e.g., Python ARCH) Use the right bootstrap for the job Consider going parametric Bayesian Test all the things Testing all the things typically requires writing code, which I did for the talk. You can browse through it in this notebook. The most interesting findings from my tests are summarised by the following figure.\nThe figure shows how the accuracy of confidence interval estimation varies by algorithm, sample size, and the number of bootstrapping resamples on a synthetic revenue dataset. This sort of dataset may occur in freemium scenarios, where several product variations are offered at a few price tiers, including a price of zero (i.e., free). In all cases, the dashed line denotes the requested confidence level of 95%, i.e., the true difference in means between the two revenue distributions should be inside the confidence interval in approximately 95% of the simulations for it to be accurate. Unfortunately, it is clear that both the percentile and BCa algorithms perform poorly on the simulated data. Even with a sample size of 10K, they both yield “95%” confidence intervals that contain the true difference in means less than 90% of the time, i.e., the intervals are too narrow. By contrast, the studentized algorithm gets much closer to the requested confidence level, but this comes at the price of considerably longer runtime due to the need for nested bootstrapping.\nNote that the results presented in the talk are slightly different from the figure above. The difference is due to a small bug in the simulation code: I used a constant random seed for all the bootstrapping simulation iterations (every iteration still contained different data). 
This has led to the surprising finding that accuracy with 10,000 resamples was lower than with 1,000 resamples. I attributed that finding to dataset quirks, and noted that my results may not generalise to all cases. Indeed, I recently ran a similar set of experiments on different data as part of my work at Automattic, and found that the studentized algorithm accuracy wasn’t as impressive as the results shown here.\nIn addition to synthetic data, the experiments I ran at Automattic included an implementation of an idea by my colleague, Demet Dagdelen: Test accuracy on samples from the full population for a given period (e.g., all sales over a calendar year). In such cases, the full population is well-defined. Therefore, we know the value of the “true” parameters, and we can run the same simulations as on synthetic data. While I can’t share that data, I can say that all algorithms performed much worse on real data than on simulated data. Therefore, we decided to follow the penultimate takeaway and use a parametric Bayesian approach for modelling our data. We may share insights from that line of work on data.blog in the future. 
In the meantime, comments are very welcome!\nUpdate: You can find more accurate simulations in this post.\n","wordCount":"559","inLanguage":"en","image":"https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png","datePublished":"2019-10-06T06:48:07Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" 
x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Bootstrapping the right way?</h1><div class=post-meta><span title='2019-10-06 06:48:07 +0000 UTC'>October 6, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-10-06-bootstrapping-the-right-way/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png 1765w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals.png alt width=1765 height=666></figure><div 
class=post-content><p><em>Bootstrapping the right way</em> is a talk I gave earlier this year at the YOW! Data conference in Sydney. You can now watch the video of the talk and have a look through <a href=https://yanirs.github.io/talks/bootstrapping-the-right-way/ target=_blank rel=noopener>the slides</a>. The content of the talk is similar to <a href=https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/>a post I published on bootstrapping pitfalls</a>, with some additional simulations.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/2wZXejYz-e0 style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p><p>The main takeaways shared in the talk are:</p><ul><li>Don&rsquo;t compare single-sample confidence intervals by eye</li><li>Use enough resamples (15K?)</li><li>Use a solid bootstrapping package (e.g., <a href=https://arch.readthedocs.io/ target=_blank rel=noopener>Python ARCH</a>)</li><li>Use the right bootstrap for the job</li><li>Consider going parametric Bayesian</li><li>Test all the things</li></ul><p>Testing all the things typically requires writing code, which I did for the talk. You can browse through it in <a href=https://github.com/yanirs/yanirs.github.io/blob/master/talks/bootstrapping-the-right-way/notebook.ipynb target=_blank rel=noopener>this notebook</a>. The most interesting findings from my tests are summarised by the following figure.</p><figure><a href=revenue-confidence-intervals.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/revenue-confidence-intervals_hua7f3e259e998045c935f50c75e8eb77d_43359_720x0_resize_box_3.png 720w,
diff --git a/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/index.html b/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/index.html
index 10dc094f5..de5b3001f 100644
--- a/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/index.html
+++ b/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>A day in the life of a remote data scientist | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Automattic,career,data science,remote work"><meta name=description content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="A day in the life of a remote data scientist"><meta property="og:description" content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><meta property="og:type" content="article"><meta property="og:url" 
content="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"><meta property="og:image" content="https://yanirseroussi.com/remote-person-tossing-globe.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-12-11T22:06:19+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/remote-person-tossing-globe.jpg"><meta name=twitter:title content="A day in the life of a remote data scientist"><meta name=twitter:description content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"A day in the life of a remote data scientist","item":"https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"A day in the life of a remote data scientist","name":"A day in the life of a remote data scientist","description":"Video of a talk I gave on remote data science work at the Data Science Sydney meetup.","keywords":["Automattic","career","data science","remote work"],"articleBody":"Earlier this year, I gave a talk titled A Day in the Life of a Remote Data Scientist at the Data Science Sydney meetup. The talk covered similar ground to a post I published on remote data science work, with additional details on my daily schedule and projects, some gifs and Sydney jokes, heckling by the audience, and a Q\u0026A session. I managed to watch it a few months ago without cringing too much, so it’s about time to post it here. 
The slides are on my GitHub, as is my list of established remote companies, which you may find useful if you want to join the remote work fun.\n","wordCount":"110","inLanguage":"en","image":"https://yanirseroussi.com/remote-person-tossing-globe.jpg","datePublished":"2019-12-11T22:06:19Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" 
y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">A day in the life of a remote data scientist</h1><div class=post-meta><span title='2019-12-11 22:06:19 +0000 UTC'>December 11, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-12-12-a-day-in-the-life-of-a-remote-data-scientist/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg 4989w" sizes="(min-width: 
768px) 720px, 100vw" src=https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg alt width=4989 height=3326></figure><div class=post-content><p>Earlier this year, I gave a talk titled <em>A Day in the Life of a Remote Data Scientist</em> at <a href=https://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>the Data Science Sydney meetup</a>. The talk covered similar ground to <a href=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/>a post I published on remote data science work</a>, with additional details on my daily schedule and projects, some gifs and Sydney jokes, heckling by the audience, and a Q&amp;A session. I managed to watch it a few months ago without cringing too much, so it&rsquo;s about time to post it here. <a href=https://yanirs.github.io/talks/remote-data-scientist/ target=_blank rel=noopener>The slides are on my GitHub</a>, as is <a href=https://github.com/yanirs/established-remote/ target=_blank rel=noopener>my list of established remote companies</a>, which you may find useful if you want to join the remote work fun.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/5qbVEEtgWcY style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on x" 
href="https://x.com/intent/tweet/?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cremotework"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&amp;title=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;summary=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;source=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 
42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&title=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 
29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on whatsapp" href="https://api.whatsapp.com/send?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist%20-%20https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 
148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on telegram" href="https://telegram.me/share/url?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on ycombinator" 
href="https://news.ycombinator.com/submitlink?t=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&u=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Automattic,career,data science,remote work"><meta name=description content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="A day in the life of a remote data scientist"><meta property="og:description" content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><meta property="og:type" content="article"><meta property="og:url" 
content="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"><meta property="og:image" content="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2019-12-11T22:06:19+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg"><meta name=twitter:title content="A day in the life of a remote data scientist"><meta name=twitter:description content="Video of a talk I gave on remote data science work at the Data Science Sydney meetup."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"A day in the life of a remote data scientist","item":"https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"A day in the life of a remote data scientist","name":"A day in the life of a remote data scientist","description":"Video of a talk I gave on remote data science work at the Data Science Sydney meetup.","keywords":["Automattic","career","data science","remote work"],"articleBody":"Earlier this year, I gave a talk titled A Day in the Life of a Remote Data Scientist at the Data Science Sydney meetup. The talk covered similar ground to a post I published on remote data science work, with additional details on my daily schedule and projects, some gifs and Sydney jokes, heckling by the audience, and a Q\u0026A session. 
I managed to watch it a few months ago without cringing too much, so it’s about time to post it here. The slides are on my GitHub, as is my list of established remote companies, which you may find useful if you want to join the remote work fun.\n","wordCount":"110","inLanguage":"en","image":"https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg","datePublished":"2019-12-11T22:06:19Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" 
x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">A day in the life of a remote data scientist</h1><div class=post-meta><span title='2019-12-11 22:06:19 +0000 UTC'>December 11, 2019</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2019-12-12-a-day-in-the-life-of-a-remote-data-scientist/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe_hu3d03a01dcc18bc5be0e67db3d8d209a6_1872808_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg 4989w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/remote-person-tossing-globe.jpg alt width=4989 height=3326></figure><div class=post-content><p>Earlier this year, I gave a talk titled <em>A Day in the Life of a Remote Data Scientist</em> at <a href=https://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>the Data Science Sydney meetup</a>. The talk covered similar ground to <a href=https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/>a post I published on remote data science work</a>, with additional details on my daily schedule and projects, some gifs and Sydney jokes, heckling by the audience, and a Q&amp;A session. I managed to watch it a few months ago without cringing too much, so it&rsquo;s about time to post it here. 
<a href=https://yanirs.github.io/talks/remote-data-scientist/ target=_blank rel=noopener>The slides are on my GitHub</a>, as is <a href=https://github.com/yanirs/established-remote/ target=_blank rel=noopener>my list of established remote companies</a>, which you may find useful if you want to join the remote work fun.</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/5qbVEEtgWcY style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="YouTube Video"></iframe></div></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on x" href="https://x.com/intent/tweet/?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&amp;hashtags=Automattic%2ccareer%2cdatascience%2cremotework"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&amp;title=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;summary=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;source=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f&title=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 
29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a 
remote data scientist on whatsapp" href="https://api.whatsapp.com/send?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist%20-%20https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share A day in the life of a remote data scientist on telegram" href="https://telegram.me/share/url?text=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&amp;url=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share A day in the life of a remote data scientist on ycombinator" href="https://news.ycombinator.com/submitlink?t=A%20day%20in%20the%20life%20of%20a%20remote%20data%20scientist&u=https%3a%2f%2fyanirseroussi.com%2f2019%2f12%2f12%2fa-day-in-the-life-of-a-remote-data-scientist%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2020/01/11/software-commodities-are-eating-interesting-data-science-work/index.html b/2020/01/11/software-commodities-are-eating-interesting-data-science-work/index.html
index 8156503ca..96f19202d 100644
--- a/2020/01/11/software-commodities-are-eating-interesting-data-science-work/index.html
+++ b/2020/01/11/software-commodities-are-eating-interesting-data-science-work/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Software commodities are eating interesting data science work | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="business,career,data science,software engineering"><meta name=description content="Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. What can one do to remain relevant?"><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Software commodities are eating interesting data science work"><meta property="og:description" content="Being a data scientist can sometimes feel like a race against software commodities that replace 
interesting work. What can one do to remain relevant?"><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"><meta property="og:image" content="https://yanirseroussi.com/pacman.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2020-01-11T09:22:35+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/pacman.png"><meta name=twitter:title content="Software commodities are eating interesting data science work"><meta name=twitter:description content="Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. What can one do to remain relevant?"><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Software commodities are eating interesting data science work","item":"https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Software commodities are eating interesting data science work","name":"Software commodities are eating interesting data science work","description":"Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. What can one do to remain relevant?","keywords":["business","career","data science","software engineering"],"articleBody":" The passage of time makes wizards of us all. 
Today, any dullard can make bells ring across the ocean by tapping out phone numbers, cause inanimate toys to march by barking an order, or activate remote devices by touching a wireless screen. Thomas Edison couldn’t have managed any of this at his peak—and shortly before his time, such powers would have been considered the unique realm of God.\nRob Reid After On Being a data scientist can sometimes feel like a race against software innovations. Every interesting and useful problem is bound to become a software commodity. My story seems to reflect that: From my first steps in sentiment analysis and topic modelling, through building recommender systems while dabbling in Kaggle competitions and deep learning a few years ago, and to my present-day interest in causal inference. What can one do to remain relevant in such an environment? Read this post to find out.\nHighlights from my past When I started my PhD in 2009, the plan was to work on sentiment analysis of opinion polls. This got me into applied machine learning using Java and Weka, with which I made some modest contributions to the field. Today, researching sentiment analysis would feel somewhat pointless, given the plethora of sentiment analysis services. Sentiment analysis is a commodity – using it in practice is a software engineering problem.\nMoving forward in my PhD, I got into topic modelling. I learned about Bayesian statistics and conjugate priors. I went through the arduous process of solving integrals by hand and coding a custom Gibbs sampler for the models I specified. Today, I probably wouldn’t bother with the maths. Instead, I’d specify the model and let a probabilistic programming tool like pymc3 or Stan handle the rest. Bayesian inference is now a commodity that’s accessible to any hacker.\nA part of my PhD thesis that can probably be replaced by a probabilistic programming tool Towards the end of my PhD in 2012, I got into Kaggle competitions. 
Back then, it seemed like “real” data science consisted of building and tuning machine learning models – that’s what Kaggle was all about. While I’ve done quite well in those competitions, I’ve come to realise that the utility of fine-tuning machine learning algorithms is quite limited. In reality, problem definition and solution measurement are more challenging and important. Using machine learning in practice is typically an engineering problem: We can use an existing service or package, follow best practices, and have a great solution for most use cases. No research or custom data work is required beyond turning data into features, which is essentially a data engineering problem. In short, solid machine learning solutions are delivered by solid engineers who glue together solid commodity components. Quoting Google’s Rules of Machine Learning:\nTo make great products: do machine learning like the great engineer you are, not like the great machine learning expert you aren’t.\nMost of the problems you will face are, in fact, engineering problems. Even with all the resources of a great machine learning expert, most of the gains come from great features, not great machine learning algorithms. So, the basic approach is:\nMake sure your pipeline is solid end to end. Start with a reasonable objective. Add common-sense features in a simple way. Make sure that your pipeline stays solid. Many problems in data “science” are actually engineering problems – described best by the flow on the right (source) Some of my first jobs as a data scientist in industry involved building recommender systems. With recommender systems, much of the work is on the system around the recommendation algorithm. That is, building a recommender system was always mostly an engineering problem. However, these days we have services like AWS Personalize, which does most of the heavy lifting around recommendation. This makes the deployment of recommender systems a pure engineering problem. 
Like many other problems, recommender systems have been commodified.\nI have not done much with deep learning, but there the general trend is even more apparent: Useful innovations quickly turn into tools. Examples include library evolution from Theano to TensorFlow, and commodified prediction services from companies like Google, Amazon, and Microsoft. If you want to use a deep learning service in your application, you probably don’t need a data scientist or even a machine learning engineer. A solid software engineer who can pick the right tools should be enough.\nHow to remain relevant? So where does this leave us? It seems to be a more general phenomenon. Essentially every problem that requires specialised knowledge and is valuable ends up attracting repeatable solutions that obviate the need for deep thinking and manual work. These solutions are software commodities. Deploying them is a matter of writing some glue code and fitting them into the overall system – an engineering problem. Implementing data science components to compete with commodities may be interesting and fun, but it’s usually a waste of time when there’s a generic solution that is good enough.\nAs an individual data scientist, what can you do when your speciality becomes a software commodity? I see a few options:\nEmbrace the engineering angle. Become good (or better) at engineering solutions. Be pragmatic. Do what it takes to get the job done. This is probably easier for data scientists like me, who have an engineering background, than for more research/analysis-oriented data scientists. Such data scientists sometimes sneer at engineering work, claiming it’s “fake” data science. Fake or not, solid engineering tools can easily make stubborn data scientists obsolete. Keep building custom solutions even when viable commodities exist. While this may be more fun for the individual, I believe it isn’t a sustainable approach. 
The cost of building and maintaining custom solutions will typically be higher than the cost of commodity solutions. Insisting on custom solutions seems like a recipe for becoming irrelevant. Keep adapting and moving to non-commodity areas. Some things are easier to automate than others. For example, building a machine learning pipeline when the problem is well-defined is relatively easy, but deciding what features to create typically requires some domain expertise. In addition, new research keeps coming out in areas that are less hot than machine learning. One such area is causal inference, where there are still solutions that are yet to be commodified. Move to the cutting edge. If you want to research novel methods, a “standard” data scientist position may not be for you. Many industry positions are focused on applying proven solutions to a specific organisation. If that doesn’t sound like fun, you’re better off moving to academia or joining a commercial research group. Are there any other options I don’t see? 
Let me know in the comments!\n","wordCount":"1116","inLanguage":"en","image":"https://yanirseroussi.com/pacman.png","datePublished":"2020-01-11T09:22:35Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul 
id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Software commodities are eating interesting data science work</h1><div class=post-meta><span title='2020-01-11 09:22:35 +0000 UTC'>January 11, 2020</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2020-01-11-software-commodities-are-eating-interesting-data-science-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png 1920w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png alt width=1920 height=714></figure><div class=post-content><blockquote><p>The passage of time makes wizards of us all. Today, any dullard can make bells ring across the ocean by tapping out phone numbers, cause inanimate toys to march by barking an order, or activate remote devices by touching a wireless screen. Thomas Edison couldn&rsquo;t have managed any of this at his peak—and shortly before his time, such powers would have been considered the unique realm of God.</p><footer><strong>Rob Reid</strong>
+<meta name=keywords content="business,career,data science,software engineering"><meta name=description content="Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. What can one do to remain relevant?"><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Software commodities are eating interesting data science work"><meta property="og:description" content="Being a data scientist can sometimes feel like a race against software commodities that replace 
interesting work. What can one do to remain relevant?"><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"><meta property="og:image" content="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2020-01-11T09:22:35+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png"><meta name=twitter:title content="Software commodities are eating interesting data science work"><meta name=twitter:description content="Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. What can one do to remain relevant?"><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Software commodities are eating interesting data science work","item":"https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Software commodities are eating interesting data science work","name":"Software commodities are eating interesting data science work","description":"Being a data scientist can sometimes feel like a race against software commodities that replace interesting work. 
What can one do to remain relevant?","keywords":["business","career","data science","software engineering"],"articleBody":" The passage of time makes wizards of us all. Today, any dullard can make bells ring across the ocean by tapping out phone numbers, cause inanimate toys to march by barking an order, or activate remote devices by touching a wireless screen. Thomas Edison couldn’t have managed any of this at his peak—and shortly before his time, such powers would have been considered the unique realm of God.\nRob Reid After On Being a data scientist can sometimes feel like a race against software innovations. Every interesting and useful problem is bound to become a software commodity. My story seems to reflect that: From my first steps in sentiment analysis and topic modelling, through building recommender systems while dabbling in Kaggle competitions and deep learning a few years ago, and to my present-day interest in causal inference. What can one do to remain relevant in such an environment? Read this post to find out.\nHighlights from my past When I started my PhD in 2009, the plan was to work on sentiment analysis of opinion polls. This got me into applied machine learning using Java and Weka, with which I made some modest contributions to the field. Today, researching sentiment analysis would feel somewhat pointless, given the plethora of sentiment analysis services. Sentiment analysis is a commodity – using it in practice is a software engineering problem.\nMoving forward in my PhD, I got into topic modelling. I learned about Bayesian statistics and conjugate priors. I went through the arduous process of solving integrals by hand and coding a custom Gibbs sampler for the models I specified. Today, I probably wouldn’t bother with the maths. Instead, I’d specify the model and let a probabilistic programming tool like pymc3 or Stan handle the rest. 
Bayesian inference is now a commodity that’s accessible to any hacker.\nA part of my PhD thesis that can probably be replaced by a probabilistic programming tool Towards the end of my PhD in 2012, I got into Kaggle competitions. Back then, it seemed like “real” data science consisted of building and tuning machine learning models – that’s what Kaggle was all about. While I’ve done quite well in those competitions, I’ve come to realise that the utility of fine-tuning machine learning algorithms is quite limited. In reality, problem definition and solution measurement are more challenging and important. Using machine learning in practice is typically an engineering problem: We can use an existing service or package, follow best practices, and have a great solution for most use cases. No research or custom data work is required beyond turning data into features, which is essentially a data engineering problem. In short, solid machine learning solutions are delivered by solid engineers who glue together solid commodity components. Quoting Google’s Rules of Machine Learning:\nTo make great products: do machine learning like the great engineer you are, not like the great machine learning expert you aren’t.\nMost of the problems you will face are, in fact, engineering problems. Even with all the resources of a great machine learning expert, most of the gains come from great features, not great machine learning algorithms. So, the basic approach is:\nMake sure your pipeline is solid end to end. Start with a reasonable objective. Add common-sense features in a simple way. Make sure that your pipeline stays solid. Many problems in data “science” are actually engineering problems – described best by the flow on the right (source) Some of my first jobs as a data scientist in industry involved building recommender systems. With recommender systems, much of the work is on the system around the recommendation algorithm. 
That is, building a recommender system was always mostly an engineering problem. However, these days we have services like AWS Personalize, which does most of the heavy lifting around recommendation. This makes the deployment of recommender systems a pure engineering problem. Like many other problems, recommender systems have been commodified.\nI have not done much with deep learning, but there the general trend is even more apparent: Useful innovations quickly turn into tools. Examples include library evolution from Theano to TensorFlow, and commodified prediction services from companies like Google, Amazon, and Microsoft. If you want to use a deep learning service in your application, you probably don’t need a data scientist or even a machine learning engineer. A solid software engineer who can pick the right tools should be enough.\nHow to remain relevant? So where does this leave us? It seems to be a more general phenomenon. Essentially every problem that requires specialised knowledge and is valuable ends up attracting repeatable solutions that obviate the need for deep thinking and manual work. These solutions are software commodities. Deploying them is a matter of writing some glue code and fitting them into the overall system – an engineering problem. Implementing data science components to compete with commodities may be interesting and fun, but it’s usually a waste of time when there’s a generic solution that is good enough.\nAs an individual data scientist, what can you do when your speciality becomes a software commodity? I see a few options:\nEmbrace the engineering angle. Become good (or better) at engineering solutions. Be pragmatic. Do what it takes to get the job done. This is probably easier for data scientists like me, who have an engineering background, than for more research/analysis-oriented data scientists. Such data scientists sometimes sneer at engineering work, claiming it’s “fake” data science. 
Fake or not, solid engineering tools can easily make stubborn data scientists obsolete. Keep building custom solutions even when viable commodities exist. While this may be more fun for the individual, I believe it isn’t a sustainable approach. The cost of building and maintaining custom solutions will typically be higher than the cost of commodity solutions. Insisting on custom solutions seems like a recipe for becoming irrelevant. Keep adapting and moving to non-commodity areas. Some things are easier to automate than others. For example, building a machine learning pipeline when the problem is well-defined is relatively easy, but deciding what features to create typically requires some domain expertise. In addition, new research keeps coming out in areas that are less hot than machine learning. One such area is causal inference, where there are still solutions that are yet to be commodified. Move to the cutting edge. If you want to research novel methods, a “standard” data scientist position may not be for you. Many industry positions are focused on applying proven solutions to a specific organisation. If that doesn’t sound like fun, you’re better off moving to academia or joining a commercial research group. Are there any other options I don’t see? 
Let me know in the comments!\n","wordCount":"1116","inLanguage":"en","image":"https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png","datePublished":"2020-01-11T09:22:35Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line 
x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Software commodities are eating interesting data science work</h1><div class=post-meta><span title='2020-01-11 09:22:35 +0000 UTC'>January 11, 2020</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2020-01-11-software-commodities-are-eating-interesting-data-science-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_1080x0_resize_box_3.png 1080w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman_huf6fd963093a5068d761a419fcde11af6_17555_1500x0_resize_box_3.png 1500w ,https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png 1920w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/pacman.png alt width=1920 height=714></figure><div class=post-content><blockquote><p>The passage of time makes wizards of us all. Today, any dullard can make bells ring across the ocean by tapping out phone numbers, cause inanimate toys to march by barking an order, or activate remote devices by touching a wireless screen. Thomas Edison couldn&rsquo;t have managed any of this at his peak—and shortly before his time, such powers would have been considered the unique realm of God.</p><footer><strong>Rob Reid</strong>
 <cite><a href=https://after-on.com/after-on-novel title=https://after-on.com/after-on-novel target=_blank rel=noopener>After On</a></cite></footer></blockquote><p>Being a data scientist can sometimes feel like a race against software innovations. Every interesting and useful problem is bound to become a software commodity. My story seems to reflect that: From my first steps in sentiment analysis and topic modelling, through building recommender systems while dabbling in Kaggle competitions and deep learning a few years ago, and to <a href=https://yanirseroussi.com/causal-inference-resources/>my present-day interest in causal inference</a>. What can one do to remain relevant in such an environment? Read this post to find out.</p><h2 id=highlights-from-my-past>Highlights from my past<a hidden class=anchor aria-hidden=true href=#highlights-from-my-past>#</a></h2><p>When I started my PhD in 2009, <a href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/>the plan was to work on sentiment analysis of opinion polls</a>. This got me into applied machine learning using Java and <a href=https://www.cs.waikato.ac.nz/ml/weka/ target=_blank rel=noopener>Weka</a>, with which I made some modest contributions to the field. Today, researching sentiment analysis would feel somewhat pointless, given the plethora of sentiment analysis services. Sentiment analysis is a commodity – using it in practice is a software engineering problem.</p><p>Moving forward in my PhD, I got into topic modelling. I learned about Bayesian statistics and conjugate priors. I went through the arduous process of solving integrals by hand and coding a custom Gibbs sampler for <a href=https://yanirseroussi.com/phd-work/>the models I specified</a>. Today, I probably wouldn&rsquo;t bother with the maths. 
Instead, I&rsquo;d specify the model and let a probabilistic programming tool like <a href=https://docs.pymc.io/ target=_blank rel=noopener>pymc3</a> or <a href=https://mc-stan.org/ target=_blank rel=noopener>Stan</a> handle the rest. Bayesian inference is now a commodity that&rsquo;s <a href=http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/ target=_blank rel=noopener>accessible to any hacker</a>.</p><figure><a href=thesis-maths.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/thesis-maths_hu0ea4991cbd3b0c7c32427194623a941b_124619_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/thesis-maths_hu0ea4991cbd3b0c7c32427194623a941b_124619_480x0_resize_box_3.png 480w,
diff --git a/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.html b/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.html
index 1b79971e9..45d88dcbe 100644
--- a/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.html
+++ b/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Many is not enough: Counting simulations to bootstrap the right way | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="bootstrapping,confidence intervals,data science,statistics"><meta name=description content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Many is not enough: Counting simulations to bootstrap the right way"><meta property="og:description" content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"><meta property="og:image" content="https://yanirseroussi.com/santa-counting.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2020-08-24T01:35:17+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/santa-counting.jpg"><meta name=twitter:title content="Many is not enough: Counting simulations to bootstrap the right way"><meta name=twitter:description content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Many is not enough: Counting simulations to bootstrap the right way","item":"https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Many is not enough: Counting simulations to bootstrap the right way","name":"Many is not enough: Counting simulations to bootstrap the right way","description":"Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals.","keywords":["bootstrapping","confidence intervals","data science","statistics"],"articleBody":"Previously, I encouraged readers to test different approaches to bootstrapped confidence interval (CI) estimation. 
Such testing can done by relying on the definition of CIs: Given an infinite number of independent samples from the same population, we expect a ci_level CI to contain the population parameter in exactly ci_level percent of the samples. Therefore, we run “many” simulations (num_simulations), where each simulation generates a random sample from the same population and runs the CI algorithm on the sample. We then look at the observed CI level (i.e., the percentage of CIs that contain the true population parameter), and say that the CI algorithm works as expected if the observed CI level is “not too far” from the requested ci_level.\nKeen observers may notice that the language I used to describe the process isn’t accurate enough. How many is “many” simulations? How far is “not too far”?\nI made a mistake by not asking and answering these questions before. I decided that num_simulations=1,000 is a reasonable number of simulations, and didn’t consider how this affects the observed CI level. The decision to use num_simulations=1,000 was informed by practical concerns (i.e., wanting the simulations to finish within a reasonable timeframe), while ranges for the observed CI level were determined empirically – by observing the results of the simulations rather than by considering the properties of the problem.\nThe idea of using simulations to test bootstrapped CIs came from Tim Hesterberg’s What Teachers Should Know about the Bootstrap. The experiments presented in that paper used num_simulations=10,000, but it wasn’t made clear why this number was chosen. This may have been due to space limitations or because this point is obvious to experienced statisticians. 
Embarrassingly, my approach of using fewer simulations without considering how they affect the observed CIs can be seen as a form of Belief in The Law of Small Numbers.\nFortunately, it’s not hard to move away from belief in the law of small numbers in this case: We can see a set of simulations as sampling from Binomial(n=num_simulations, p=ci_level), where the number of “successes” is the number of simulations where the true population parameter falls in the CI returned by the CI algorithm. We can define our desired level of confidence in the simulation results as the simulation confidence, and use the simulation confidence interval of the binomial distribution to decide on a likely range for the observed CI level.\nTo make this more concrete, here’s a Python function that gives the observed CI level bounds for different values of num_simulations, given the ci_level and simulation confidence. The output from running this function with the default arguments is plotted below.\nimport numpy as np import pandas as pd import scipy.stats def get_observed_ci_bounds( all_num_simulations=(10, 100, 500, 1000, 2000, 5000, 10000), ci_level=0.95, simulation_confidence=0.99 ): return pd.DataFrame( index=pd.Series(all_num_simulations, name='num_simulations'), data=[ np.array( scipy.stats.binom.interval(simulation_confidence, n=num_simulations, p=ci_level) ) / num_simulations for num_simulations in all_num_simulations ], columns=['low', 'high'] ) * 100 \u003e\u003e\u003e print(get_observed_ci_bounds()) num_simulations low high 10 70.00 100.00 100 89.00 100.00 500 92.40 97.40 1000 93.10 96.70 2000 93.70 96.20 5000 94.18 95.78 10000 94.43 95.55 Therefore, when setting num_simulations to 1,000 (as I did in the experiments I presented previously), we can be 99% confident that the observed CI level of a perfect CI algorithm would be between 93.1% and 96.7% when asked to generate 95% CIs. 
As shown by the following figure, this doesn’t materially change my previous conclusions: On the dataset from those experiments, the Studentized algorithm delivers satisfactory results, while the Percentile and BCa algorithms are quite far from perfection. And of course, we can now quantify their distance from perfection – the CIs they yield in the best case would be acceptable if we wanted 90% CIs, where we expect the observed CI to be in the 87.5% to 92.4% range (obtained by running the function above with ci_level=0.9). As there are better alternatives, I believe that this is a good enough reason to avoid using the Percentile and BCa algorithms.\nNotes: See this notebook for code – use the same environment as the original notebook. The cover photo is by Dima D from Pexels.\n","wordCount":"684","inLanguage":"en","image":"https://yanirseroussi.com/santa-counting.jpg","datePublished":"2020-08-24T01:35:17Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 
24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Many is not enough: Counting simulations to bootstrap the right way</h1><div class=post-meta><span title='2020-08-24 01:35:17 +0000 UTC'>August 24, 2020</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2020-08-24-many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg 4896w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg alt width=4896 height=3264></figure><div class=post-content><p><a href=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/>Previously, I encouraged readers to test different approaches to bootstrapped confidence interval (CI) estimation</a>. Such testing can done by relying on <a href=https://en.wikipedia.org/wiki/Confidence_interval target=_blank rel=noopener>the definition of CIs</a>: Given an infinite number of independent samples from the same population, we expect a <code>ci_level</code> CI to contain the population parameter in exactly <code>ci_level</code> percent of the samples. Therefore, we run &ldquo;many&rdquo; simulations (<code>num_simulations</code>), where each simulation generates a random sample from the same population and runs the CI algorithm on the sample. 
We then look at the <em>observed</em> CI level (i.e., the percentage of CIs that contain the true population parameter), and say that the CI algorithm works as expected if the observed CI level is &ldquo;not too far&rdquo; from the requested <code>ci_level</code>.</p><p><strong>Keen observers may notice that the language I used to describe the process isn&rsquo;t accurate enough. How many is &ldquo;many&rdquo; simulations? How far is &ldquo;not too far&rdquo;?</strong></p><p>I made a mistake by not asking and answering these questions before. I decided that <code>num_simulations</code>=1,000 is a reasonable number of simulations, and didn&rsquo;t consider how this affects the observed CI level. The decision to use <code>num_simulations</code>=1,000 was informed by practical concerns (i.e., wanting the simulations to finish within a reasonable timeframe), while ranges for the observed CI level were determined empirically – by observing the results of the simulations rather than by considering the properties of the problem.</p><p>The idea of using simulations to test bootstrapped CIs came from Tim Hesterberg&rsquo;s <a href=https://arxiv.org/abs/1411.5279 target=_blank rel=noopener>What Teachers Should Know about the Bootstrap</a>. The experiments presented in that paper used <code>num_simulations</code>=10,000, but it wasn&rsquo;t made clear why this number was chosen. This may have been due to space limitations or because this point is obvious to experienced statisticians. 
Embarrassingly, my approach of using fewer simulations without considering how they affect the observed CIs can be seen as a form of <a href=http://stats.org.uk/statistical-inference/TverskyKahneman1971.pdf target=_blank rel=noopener>Belief in The Law of Small Numbers</a>.</p><p>Fortunately, it&rsquo;s not hard to move away from belief in the law of small numbers in this case: We can see a set of simulations as sampling from <code>Binomial(n=num_simulations, p=ci_level)</code>, where the number of &ldquo;successes&rdquo; is the number of simulations where the true population parameter falls in the CI returned by the CI algorithm. We can define our desired level of confidence in the simulation results as the <em>simulation confidence</em>, and use the simulation confidence interval of the binomial distribution to decide on a likely range for the observed CI level.</p><p>To make this more concrete, here&rsquo;s a Python function that gives the observed CI level bounds for different values of <code>num_simulations</code>, given the <code>ci_level</code> and simulation confidence. The output from running this function with the default arguments is plotted below.</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-python data-lang=python><span style=display:flex><span><span style=color:#f92672>import</span> numpy <span style=color:#66d9ef>as</span> np
+<meta name=keywords content="bootstrapping,confidence intervals,data science,statistics"><meta name=description content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Many is not enough: Counting simulations to bootstrap the right way"><meta property="og:description" content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"><meta property="og:image" content="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2020-08-24T01:35:17+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg"><meta name=twitter:title content="Many is not enough: Counting simulations to bootstrap the right way"><meta name=twitter:description content="Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Many is not enough: Counting simulations to bootstrap the right way","item":"https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Many is not enough: Counting simulations to bootstrap the right way","name":"Many is not enough: Counting simulations to bootstrap the right way","description":"Going deeper into correct testing of different methods for bootstrap estimation of confidence intervals.","keywords":["bootstrapping","confidence intervals","data science","statistics"],"articleBody":"Previously, I encouraged readers to test different approaches to bootstrapped confidence interval (CI) 
estimation. Such testing can done by relying on the definition of CIs: Given an infinite number of independent samples from the same population, we expect a ci_level CI to contain the population parameter in exactly ci_level percent of the samples. Therefore, we run “many” simulations (num_simulations), where each simulation generates a random sample from the same population and runs the CI algorithm on the sample. We then look at the observed CI level (i.e., the percentage of CIs that contain the true population parameter), and say that the CI algorithm works as expected if the observed CI level is “not too far” from the requested ci_level.\nKeen observers may notice that the language I used to describe the process isn’t accurate enough. How many is “many” simulations? How far is “not too far”?\nI made a mistake by not asking and answering these questions before. I decided that num_simulations=1,000 is a reasonable number of simulations, and didn’t consider how this affects the observed CI level. The decision to use num_simulations=1,000 was informed by practical concerns (i.e., wanting the simulations to finish within a reasonable timeframe), while ranges for the observed CI level were determined empirically – by observing the results of the simulations rather than by considering the properties of the problem.\nThe idea of using simulations to test bootstrapped CIs came from Tim Hesterberg’s What Teachers Should Know about the Bootstrap. The experiments presented in that paper used num_simulations=10,000, but it wasn’t made clear why this number was chosen. This may have been due to space limitations or because this point is obvious to experienced statisticians. 
Embarrassingly, my approach of using fewer simulations without considering how they affect the observed CIs can be seen as a form of Belief in The Law of Small Numbers.\nFortunately, it’s not hard to move away from belief in the law of small numbers in this case: We can see a set of simulations as sampling from Binomial(n=num_simulations, p=ci_level), where the number of “successes” is the number of simulations where the true population parameter falls in the CI returned by the CI algorithm. We can define our desired level of confidence in the simulation results as the simulation confidence, and use the simulation confidence interval of the binomial distribution to decide on a likely range for the observed CI level.\nTo make this more concrete, here’s a Python function that gives the observed CI level bounds for different values of num_simulations, given the ci_level and simulation confidence. The output from running this function with the default arguments is plotted below.\nimport numpy as np import pandas as pd import scipy.stats def get_observed_ci_bounds( all_num_simulations=(10, 100, 500, 1000, 2000, 5000, 10000), ci_level=0.95, simulation_confidence=0.99 ): return pd.DataFrame( index=pd.Series(all_num_simulations, name='num_simulations'), data=[ np.array( scipy.stats.binom.interval(simulation_confidence, n=num_simulations, p=ci_level) ) / num_simulations for num_simulations in all_num_simulations ], columns=['low', 'high'] ) * 100 \u003e\u003e\u003e print(get_observed_ci_bounds()) num_simulations low high 10 70.00 100.00 100 89.00 100.00 500 92.40 97.40 1000 93.10 96.70 2000 93.70 96.20 5000 94.18 95.78 10000 94.43 95.55 Therefore, when setting num_simulations to 1,000 (as I did in the experiments I presented previously), we can be 99% confident that the observed CI level of a perfect CI algorithm would be between 93.1% and 96.7% when asked to generate 95% CIs. 
As shown by the following figure, this doesn’t materially change my previous conclusions: On the dataset from those experiments, the Studentized algorithm delivers satisfactory results, while the Percentile and BCa algorithms are quite far from perfection. And of course, we can now quantify their distance from perfection – the CIs they yield in the best case would be acceptable if we wanted 90% CIs, where we expect the observed CI to be in the 87.5% to 92.4% range (obtained by running the function above with ci_level=0.9). As there are better alternatives, I believe that this is a good enough reason to avoid using the Percentile and BCa algorithms.\nNotes: See this notebook for code – use the same environment as the original notebook. The cover photo is by Dima D from Pexels.\n","wordCount":"684","inLanguage":"en","image":"https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg","datePublished":"2020-08-24T01:35:17Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg 
id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Many is not enough: Counting simulations to bootstrap the right way</h1><div class=post-meta><span title='2020-08-24 01:35:17 +0000 UTC'>August 24, 2020</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2020-08-24-many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_360x0_resize_q75_box.jpg 360w 
,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting_hu3d03a01dcc18bc5be0e67db3d8d209a6_1687432_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg 4896w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2020/08/24/many-is-not-enough-counting-simulations-to-bootstrap-the-right-way/santa-counting.jpg alt width=4896 height=3264></figure><div class=post-content><p><a href=https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/>Previously, I encouraged readers to test different approaches to bootstrapped confidence interval (CI) estimation</a>. Such testing can done by relying on <a href=https://en.wikipedia.org/wiki/Confidence_interval target=_blank rel=noopener>the definition of CIs</a>: Given an infinite number of independent samples from the same population, we expect a <code>ci_level</code> CI to contain the population parameter in exactly <code>ci_level</code> percent of the samples. Therefore, we run &ldquo;many&rdquo; simulations (<code>num_simulations</code>), where each simulation generates a random sample from the same population and runs the CI algorithm on the sample. 
We then look at the <em>observed</em> CI level (i.e., the percentage of CIs that contain the true population parameter), and say that the CI algorithm works as expected if the observed CI level is &ldquo;not too far&rdquo; from the requested <code>ci_level</code>.</p><p><strong>Keen observers may notice that the language I used to describe the process isn&rsquo;t accurate enough. How many is &ldquo;many&rdquo; simulations? How far is &ldquo;not too far&rdquo;?</strong></p><p>I made a mistake by not asking and answering these questions before. I decided that <code>num_simulations</code>=1,000 is a reasonable number of simulations, and didn&rsquo;t consider how this affects the observed CI level. The decision to use <code>num_simulations</code>=1,000 was informed by practical concerns (i.e., wanting the simulations to finish within a reasonable timeframe), while ranges for the observed CI level were determined empirically – by observing the results of the simulations rather than by considering the properties of the problem.</p><p>The idea of using simulations to test bootstrapped CIs came from Tim Hesterberg&rsquo;s <a href=https://arxiv.org/abs/1411.5279 target=_blank rel=noopener>What Teachers Should Know about the Bootstrap</a>. The experiments presented in that paper used <code>num_simulations</code>=10,000, but it wasn&rsquo;t made clear why this number was chosen. This may have been due to space limitations or because this point is obvious to experienced statisticians. 
Embarrassingly, my approach of using fewer simulations without considering how they affect the observed CIs can be seen as a form of <a href=http://stats.org.uk/statistical-inference/TverskyKahneman1971.pdf target=_blank rel=noopener>Belief in The Law of Small Numbers</a>.</p><p>Fortunately, it&rsquo;s not hard to move away from belief in the law of small numbers in this case: We can see a set of simulations as sampling from <code>Binomial(n=num_simulations, p=ci_level)</code>, where the number of &ldquo;successes&rdquo; is the number of simulations where the true population parameter falls in the CI returned by the CI algorithm. We can define our desired level of confidence in the simulation results as the <em>simulation confidence</em>, and use the simulation confidence interval of the binomial distribution to decide on a likely range for the observed CI level.</p><p>To make this more concrete, here&rsquo;s a Python function that gives the observed CI level bounds for different values of <code>num_simulations</code>, given the <code>ci_level</code> and simulation confidence. The output from running this function with the default arguments is plotted below.</p><div class=highlight><pre tabindex=0 style=color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-python data-lang=python><span style=display:flex><span><span style=color:#f92672>import</span> numpy <span style=color:#66d9ef>as</span> np
 </span></span><span style=display:flex><span><span style=color:#f92672>import</span> pandas <span style=color:#66d9ef>as</span> pd
 </span></span><span style=display:flex><span><span style=color:#f92672>import</span> scipy.stats
 </span></span><span style=display:flex><span> 
diff --git a/2021/04/05/some-highlights-from-2020/index.html b/2021/04/05/some-highlights-from-2020/index.html
index 0d7dd4526..11b8c5391 100644
--- a/2021/04/05/some-highlights-from-2020/index.html
+++ b/2021/04/05/some-highlights-from-2020/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Some highlights from 2020 | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="a/b testing,career,causal inference,environment,Reef Life Survey,remote work,sustainability"><meta name=description content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and progress on Automattic&rsquo;s Experimentation Platform."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Some highlights from 2020"><meta property="og:description" content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and 
progress on Automattic&rsquo;s Experimentation Platform."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"><meta property="og:image" content="https://yanirseroussi.com/lord-howe-island.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-04-05T06:41:48+00:00"><meta property="article:modified_time" content="2023-07-05T11:39:25+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/lord-howe-island.jpg"><meta name=twitter:title content="Some highlights from 2020"><meta name=twitter:description content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and progress on Automattic&rsquo;s Experimentation Platform."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Some highlights from 2020","item":"https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Some highlights from 2020","name":"Some highlights from 2020","description":"Sharing remote teamwork insights, my climate \u0026amp; sustainability activism, Reef Life Survey publications, and progress on Automattic\u0026rsquo;s Experimentation Platform.","keywords":["a/b testing","career","causal inference","environment","Reef Life Survey","remote work","sustainability"],"articleBody":"My track record of posting here has been pretty poor in 2020, partly because of a bunch of content I’ve contributed elsewhere. In general, my guiding principle for posting is to only add stuff I’d want to read or cite, e.g., because I haven’t seen it discussed elsewhere. 
Well, no one has compiled a meta-post of my public work from 2020 (that I know of), so it’s finally time to publish it myself.\nRemote work. I’ve been working remotely with Automattic since 2017, so I was pretty covid-ready as far as work was concerned. The main thing that’s changed for me is being unable to meet my colleagues in person. Looking back at the interview I did with BuiltIn from March 2020, it’s somewhat amusing that I was hopeful that we’d get to travel in May 2020, as business trips are still on hold a year later. Outside Automattic, it was interesting to see how quickly remote work has become commonplace, to the point where my curated list of established remote companies now seems irrelevant. Also, my June webinar with Felipe Flores on running remote teams is probably dated now that many more people have hands-on experience with remote work. The world has adapted quickly, though it seems like Automattic’s globally-distributed model is still quite unusual. Instead, many companies have switched to a locally-remote model, hiring remotely within the same country or timezone region. Considering the coordination costs of globally-distributed teams and the impact of frequent long-haul flights on employee wellbeing and on our environment, it may turn out that the locally-remote model is more sustainable in the long term. Only time will tell.\nSustainability. The Australian bushfires of 2019-20 provided me with extra motivation to help nudge Automattic to do more in the fight against climate change. The initial covid-19 lockdown provided me with extra free time to make the measurement and offsetting of Automattic’s emissions from data centre power use happen. I summarised this work in a post on the company’s blog, and discussed it in an interview with PublishPress. If there’s one key reason why I haven’t posted more here, it’s that the sustainability work always seems more worthwhile. 
I hope to continue working in the area in 2021, so the frequency of posts here is likely to remain about the same.\nWhile data from RLS dives helps global conservation efforts, diving also reminds me that there’s still so much left to save and conserve Reef Life Survey (RLS). Another distributed organisation that I’m involved with, and a worthwhile cause, is the RLS foundation. I previously posted about my experiences with RLS offline data collection and visualisation of the collected data, and have since helped with quite a few RLS surveys. Despite lockdowns and border closures, 2020 was no exception: I participated in the Lord Howe biennial surveys in February (just before the initial lockdown), and was fortunate to join a survey trip from Airlie Beach to Thursday Island in October (long after lockdown lifted in the lucky state of Queensland). I also joined the 38(!) author list of Establishing the ecological basis for conservation of shallow marine life using Reef Life Survey – a Biological Conservation journal paper covering RLS’s history, methodology, outcomes, and more. Finally, I was surprised and honoured to receive the Scoresby Shepherd Award for doing the most RLS surveys in the 2019-20 financial year. It was clearly a bit of a slow year due to the pandemic, but it’s always nice to get recognised. Overall, 2020 was definitely a good year for my participation in RLS and I’m planning on contributing more in 2021, especially with help around organising and conducting surveys in Southeast Queensland.\nTechnical work. My main “day job” focus in 2020 was on being the tech lead for Automattic’s new experimentation platform (ExPlat). This aligns well with my long-standing interest in causal inference. Among other things, it gave me an opportunity to apply my favourite approach to Bayesian A/B testing in the wild, and get excited about other interesting causal inference work we have in the pipeline. 
Now that ExPlat’s foundation is mostly in place, we are planning on sharing much of our work on data.blog. My colleague Aaron just published the first post in the series, and my post on ExPlat’s architecture will be next. Subscribe to data.blog to get updates!\n","wordCount":"723","inLanguage":"en","image":"https://yanirseroussi.com/lord-howe-island.jpg","datePublished":"2021-04-05T06:41:48Z","dateModified":"2023-07-05T11:39:25+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line 
x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Some highlights from 2020</h1><div class=post-meta><span title='2021-04-05 06:41:48 +0000 UTC'>April 5, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-04-05-some-highlights-from-2020/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg 3638w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg alt width=3638 
height=1032></figure><div class=post-content><p>My track record of posting here has been pretty poor in 2020, partly because of a bunch of content I&rsquo;ve contributed elsewhere. In general, my guiding principle for posting is to only add stuff I&rsquo;d want to read or cite, e.g., because I haven&rsquo;t seen it discussed elsewhere. Well, no one has compiled a meta-post of my public work from 2020 (that I know of), so it&rsquo;s finally time to publish it myself.</p><p><strong>Remote work.</strong> I&rsquo;ve been working remotely with <a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> since 2017, so I was pretty covid-ready as far as work was concerned. The main thing that&rsquo;s changed for me is being unable to meet my colleagues in person. Looking back at <a href=https://builtin.com/remote-work/remote-data-teams target=_blank rel=noopener>the interview I did with BuiltIn from March 2020</a>, it&rsquo;s somewhat amusing that I was hopeful that we&rsquo;d get to travel in May 2020, as business trips are still on hold a year later. Outside Automattic, it was interesting to see how quickly remote work has become commonplace, to the point where <a href=https://github.com/yanirs/established-remote/ target=_blank rel=noopener>my curated list of established remote companies</a> now seems irrelevant. Also, <a href="https://www.youtube.com/watch?v=79LfP8Kqgvw" target=_blank rel=noopener>my June webinar with Felipe Flores on running remote teams</a> is probably dated now that many more people have hands-on experience with remote work. The world has adapted quickly, though it seems like Automattic&rsquo;s globally-distributed model is still quite unusual. Instead, many companies have switched to a <em>locally-remote</em> model, hiring remotely within the same country or timezone region. 
Considering the coordination costs of globally-distributed teams and the impact of frequent long-haul flights on employee wellbeing and on our environment, it may turn out that the locally-remote model is more sustainable in the long term. Only time will tell.</p><p><strong>Sustainability.</strong> The Australian bushfires of 2019-20 provided me with extra motivation to help nudge Automattic to do more in the fight against climate change. The initial covid-19 lockdown provided me with extra free time to make the measurement and offsetting of Automattic&rsquo;s emissions from data centre power use happen. I summarised this work in <a href=https://wordpress.com/blog/2020/09/21/toward-zero-reducing-and-offsetting-our-data-center-power-emissions/ target=_blank rel=noopener>a post on the company&rsquo;s blog</a>, and discussed it in <a href="https://www.youtube.com/watch?v=tMFr_agPLJY" target=_blank rel=noopener>an interview with PublishPress</a>. If there&rsquo;s one key reason why I haven&rsquo;t posted more here, it&rsquo;s that the sustainability work always seems more worthwhile. I hope to continue working in the area in 2021, so the frequency of posts here is likely to remain about the same.</p><figure><a href=bougainville-reef-wall-dive.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="a/b testing,career,causal inference,environment,Reef Life Survey,remote work,sustainability"><meta name=description content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and progress on Automattic&rsquo;s Experimentation Platform."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Some highlights from 2020"><meta property="og:description" content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and 
progress on Automattic&rsquo;s Experimentation Platform."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"><meta property="og:image" content="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-04-05T06:41:48+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg"><meta name=twitter:title content="Some highlights from 2020"><meta name=twitter:description content="Sharing remote teamwork insights, my climate & sustainability activism, Reef Life Survey publications, and progress on Automattic&rsquo;s Experimentation Platform."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Some highlights from 2020","item":"https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Some highlights from 2020","name":"Some highlights from 2020","description":"Sharing remote teamwork insights, my climate \u0026amp; sustainability activism, Reef Life Survey publications, and progress on Automattic\u0026rsquo;s Experimentation Platform.","keywords":["a/b testing","career","causal inference","environment","Reef Life Survey","remote work","sustainability"],"articleBody":"My track record of posting here has been pretty poor in 2020, partly because of a bunch of content I’ve contributed elsewhere. 
In general, my guiding principle for posting is to only add stuff I’d want to read or cite, e.g., because I haven’t seen it discussed elsewhere. Well, no one has compiled a meta-post of my public work from 2020 (that I know of), so it’s finally time to publish it myself.\nRemote work. I’ve been working remotely with Automattic since 2017, so I was pretty covid-ready as far as work was concerned. The main thing that’s changed for me is being unable to meet my colleagues in person. Looking back at the interview I did with BuiltIn from March 2020, it’s somewhat amusing that I was hopeful that we’d get to travel in May 2020, as business trips are still on hold a year later. Outside Automattic, it was interesting to see how quickly remote work has become commonplace, to the point where my curated list of established remote companies now seems irrelevant. Also, my June webinar with Felipe Flores on running remote teams is probably dated now that many more people have hands-on experience with remote work. The world has adapted quickly, though it seems like Automattic’s globally-distributed model is still quite unusual. Instead, many companies have switched to a locally-remote model, hiring remotely within the same country or timezone region. Considering the coordination costs of globally-distributed teams and the impact of frequent long-haul flights on employee wellbeing and on our environment, it may turn out that the locally-remote model is more sustainable in the long term. Only time will tell.\nSustainability. The Australian bushfires of 2019-20 provided me with extra motivation to help nudge Automattic to do more in the fight against climate change. The initial covid-19 lockdown provided me with extra free time to make the measurement and offsetting of Automattic’s emissions from data centre power use happen. I summarised this work in a post on the company’s blog, and discussed it in an interview with PublishPress. 
If there’s one key reason why I haven’t posted more here, it’s that the sustainability work always seems more worthwhile. I hope to continue working in the area in 2021, so the frequency of posts here is likely to remain about the same.\nWhile data from RLS dives helps global conservation efforts, diving also reminds me that there’s still so much left to save and conserve Reef Life Survey (RLS). Another distributed organisation that I’m involved with, and a worthwhile cause, is the RLS foundation. I previously posted about my experiences with RLS offline data collection and visualisation of the collected data, and have since helped with quite a few RLS surveys. Despite lockdowns and border closures, 2020 was no exception: I participated in the Lord Howe biennial surveys in February (just before the initial lockdown), and was fortunate to join a survey trip from Airlie Beach to Thursday Island in October (long after lockdown lifted in the lucky state of Queensland). I also joined the 38(!) author list of Establishing the ecological basis for conservation of shallow marine life using Reef Life Survey – a Biological Conservation journal paper covering RLS’s history, methodology, outcomes, and more. Finally, I was surprised and honoured to receive the Scoresby Shepherd Award for doing the most RLS surveys in the 2019-20 financial year. It was clearly a bit of a slow year due to the pandemic, but it’s always nice to get recognised. Overall, 2020 was definitely a good year for my participation in RLS and I’m planning on contributing more in 2021, especially with help around organising and conducting surveys in Southeast Queensland.\nTechnical work. My main “day job” focus in 2020 was on being the tech lead for Automattic’s new experimentation platform (ExPlat). This aligns well with my long-standing interest in causal inference. 
Among other things, it gave me an opportunity to apply my favourite approach to Bayesian A/B testing in the wild, and get excited about other interesting causal inference work we have in the pipeline. Now that ExPlat’s foundation is mostly in place, we are planning on sharing much of our work on data.blog. My colleague Aaron just published the first post in the series, and my post on ExPlat’s architecture will be next. Subscribe to data.blog to get updates!\n","wordCount":"723","inLanguage":"en","image":"https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg","datePublished":"2021-04-05T06:41:48Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" 
stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Some highlights from 2020</h1><div class=post-meta><span title='2021-04-05 06:41:48 +0000 UTC'>April 5, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-04-05-some-highlights-from-2020/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island_hu6d31f51738dbcad4c67a5473d8db48f6_2186164_1500x0_resize_q75_box.jpg 
1500w ,https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg 3638w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/lord-howe-island.jpg alt width=3638 height=1032></figure><div class=post-content><p>My track record of posting here has been pretty poor in 2020, partly because of a bunch of content I&rsquo;ve contributed elsewhere. In general, my guiding principle for posting is to only add stuff I&rsquo;d want to read or cite, e.g., because I haven&rsquo;t seen it discussed elsewhere. Well, no one has compiled a meta-post of my public work from 2020 (that I know of), so it&rsquo;s finally time to publish it myself.</p><p><strong>Remote work.</strong> I&rsquo;ve been working remotely with <a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> since 2017, so I was pretty covid-ready as far as work was concerned. The main thing that&rsquo;s changed for me is being unable to meet my colleagues in person. Looking back at <a href=https://builtin.com/remote-work/remote-data-teams target=_blank rel=noopener>the interview I did with BuiltIn from March 2020</a>, it&rsquo;s somewhat amusing that I was hopeful that we&rsquo;d get to travel in May 2020, as business trips are still on hold a year later. Outside Automattic, it was interesting to see how quickly remote work has become commonplace, to the point where <a href=https://github.com/yanirs/established-remote/ target=_blank rel=noopener>my curated list of established remote companies</a> now seems irrelevant. Also, <a href="https://www.youtube.com/watch?v=79LfP8Kqgvw" target=_blank rel=noopener>my June webinar with Felipe Flores on running remote teams</a> is probably dated now that many more people have hands-on experience with remote work. The world has adapted quickly, though it seems like Automattic&rsquo;s globally-distributed model is still quite unusual. 
Instead, many companies have switched to a <em>locally-remote</em> model, hiring remotely within the same country or timezone region. Considering the coordination costs of globally-distributed teams and the impact of frequent long-haul flights on employee wellbeing and on our environment, it may turn out that the locally-remote model is more sustainable in the long term. Only time will tell.</p><p><strong>Sustainability.</strong> The Australian bushfires of 2019-20 provided me with extra motivation to help nudge Automattic to do more in the fight against climate change. The initial covid-19 lockdown provided me with extra free time to make the measurement and offsetting of Automattic&rsquo;s emissions from data centre power use happen. I summarised this work in <a href=https://wordpress.com/blog/2020/09/21/toward-zero-reducing-and-offsetting-our-data-center-power-emissions/ target=_blank rel=noopener>a post on the company&rsquo;s blog</a>, and discussed it in <a href="https://www.youtube.com/watch?v=tMFr_agPLJY" target=_blank rel=noopener>an interview with PublishPress</a>. If there&rsquo;s one key reason why I haven&rsquo;t posted more here, it&rsquo;s that the sustainability work always seems more worthwhile. I hope to continue working in the area in 2021, so the frequency of posts here is likely to remain about the same.</p><figure><a href=bougainville-reef-wall-dive.jpg target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/bougainville-reef-wall-dive_hufa91eac262d7ccfc888de175482140e1_5626271_360x0_resize_q75_box.jpg 360w,
 https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/bougainville-reef-wall-dive_hufa91eac262d7ccfc888de175482140e1_5626271_480x0_resize_q75_box.jpg 480w,
 https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/bougainville-reef-wall-dive_hufa91eac262d7ccfc888de175482140e1_5626271_720x0_resize_q75_box.jpg 720w,
diff --git a/2021/10/07/my-work-with-automattic/index.html b/2021/10/07/my-work-with-automattic/index.html
index 3b89f8153..50f2c4f1f 100644
--- a/2021/10/07/my-work-with-automattic/index.html
+++ b/2021/10/07/my-work-with-automattic/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>My work with Automattic | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Automattic,career,causal inference,data science,environment,machine learning,marketing,remote work,software engineering"><meta name=description content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My work with Automattic"><meta property="og:description" content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"><meta property="og:image" content="https://yanirseroussi.com/bing-yanir-seroussi-automattic-work.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-10-07T00:00:00+00:00"><meta property="article:modified_time" content="2023-07-05T16:02:07+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bing-yanir-seroussi-automattic-work.webp"><meta name=twitter:title content="My work with Automattic"><meta name=twitter:description content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My work with Automattic","item":"https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My work with Automattic","name":"My work with Automattic","description":"Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I\u0026rsquo;ve done with the company.","keywords":["Automattic","career","causal inference","data science","environment","machine learning","marketing","remote work","software engineering"],"articleBody":"Automattic is the company behind WordPress.com, Tumblr, Jetpack, WooCommerce, and several other products. I worked with Automattic as a Type B Data Scientist (i.e., I mostly built and deployed code to production) from May 2017 to October 2021. 
This post is back-dated to my last day with the company to make it fit nicely into my post timeline, but I’m actually writing this in July 2023. The magic of time travel! 🪄\nA nice perk of working with Automattic was getting to write about my work on company blogs. When my website was on WordPress.com, I used the reblogging feature to share those posts here, but they never looked great. One of the first projects I completed after leaving Automattic was migrating my site from WordPress.com to Hugo, which made the reblog posts look even worse. Now all those reblogs redirect here, thanks to Hugo’s aliases feature.\nAnyway, here are some highlights from my Automattic work along with links to the relevant posts:\nLeading the build of a unified experimentation platform and spreading causal inference best practices throughout the organisation: ExPlat: Automattic’s Experimentation Platform (by Aaron Yan – Aaron was the team lead, and I was the tech lead for the project) Architecting ExPlat: Automattic’s New Experimentation Platform (by me) ExPlat’s Development Principles and Practices (by me) Co-developing pipe, a bespoke machine learning pipeline that was mostly used for marketing tasks when I was around (and is apparently still going strong in 2023 and beyond): Introducing pipe, The Automattic Machine Learning Pipeline (by Demet Dagdelen – pipe started as a two-person project that we worked on together) How to Increase Retention and Revenue in 1,000 Nontrivial Steps (by me) Building Thousands of Reproducible ML Models with pipe, the Automattic Machine Learning Pipeline (by Demet Dagdelen) Using ML for Campaign Optimization: Our Journey to Marketing Science at Automattic (by Demet Dagdelen) End-to-end implementation of automated customer chat tagging. My colleague Charles Earl published a post on the initial steps of the project around the time I joined the company. 
I helped get it to production shortly after I joined in 2017, once I was done with my first project that included improved measurement and presentation of key engagement metrics. In other words, I spent my first few months as an analytics engineer, then a few months as a machine learning engineer (classifications that were new or nonexistent back then). Encouraging the adoption of engineering best practices in data science projects. Hosting Cameron Davidson-Pilon for a chat and running internal book clubs and learning groups. Starting and co-leading an employee resource group to promote sustainability at Automattic, which resulted in carbon offsetting based on my research. On this website, you can also read about how I ended up joining Automattic and on some of the reasons behind my decision to leave the company.\n","wordCount":"471","inLanguage":"en","image":"https://yanirseroussi.com/bing-yanir-seroussi-automattic-work.webp","datePublished":"2021-10-07T00:00:00Z","dateModified":"2023-07-05T16:02:07+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" 
height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My work with Automattic</h1><div class=post-meta><span title='2021-10-07 00:00:00 +0000 UTC'>October 7, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-10-07-my-work-with-automattic/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work_hu4e71d1d731e036718ec371daa39a901d_40330_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work_hu4e71d1d731e036718ec371daa39a901d_40330_480x0_resize_q75_h2_box_2.webp 480w 
,https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp 512w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp alt="Bing thinks I looked like this while working at Automattic." width=512 height=481><p>Bing thinks I looked like this while working at Automattic.</p></figure><div class=post-content><p><a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> is the company behind WordPress.com, Tumblr, Jetpack, WooCommerce, and several other products. I worked with Automattic as a <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>Type B Data Scientist</a> (i.e., I mostly built and deployed code to production) from May 2017 to October 2021. This post is back-dated to my last day with the company to make it fit nicely into my post timeline, but I&rsquo;m actually writing this in July 2023. The magic of time travel! 🪄</p><p>A nice perk of working with Automattic was getting to write about my work on company blogs. When my website was on WordPress.com, I used the reblogging feature to share those posts here, but they never looked great. One of the first projects I completed after leaving Automattic was <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrating my site from WordPress.com to Hugo</a>, which made the reblog posts look even worse. 
Now all those reblogs redirect here, thanks <a href=https://gohugo.io/content-management/urls/#aliases target=_blank rel=noopener>to Hugo&rsquo;s aliases feature</a>.</p><p>Anyway, here are some highlights from my Automattic work along with links to the relevant posts:</p><ul><li>Leading the build of a unified experimentation platform and spreading causal inference best practices throughout the organisation:<ul><li><a href=https://data.blog/2021/03/16/explat-automattics-experimentation-platform/ target=_blank rel=noopener>ExPlat: Automattic&rsquo;s Experimentation Platform</a> (by Aaron Yan – Aaron was the team lead, and I was the tech lead for the project)</li><li><a href=https://data.blog/2021/04/14/architecting-explat-automattics-new-experimentation-platform/ target=_blank rel=noopener>Architecting ExPlat: Automattic&rsquo;s New Experimentation Platform</a> (by me)</li><li><a href=https://data.blog/2021/08/06/explats-development-principles-and-practices/ target=_blank rel=noopener>ExPlat&rsquo;s Development Principles and Practices</a> (by me)</li></ul></li><li>Co-developing pipe, a bespoke machine learning pipeline that was mostly used for marketing tasks when I was around (and is apparently still going strong in 2023 and beyond):<ul><li><a href=https://data.blog/2018/11/15/introducing-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>Introducing pipe, The Automattic Machine Learning Pipeline</a> (by Demet Dagdelen – pipe started as a two-person project that we worked on together)</li><li><a href=https://data.blog/2019/01/15/how-to-increase-retention-and-revenue-in-1000-nontrivial-steps/ target=_blank rel=noopener>How to Increase Retention and Revenue in 1,000 Nontrivial Steps</a> (by me)</li><li><a href=https://data.blog/2019/01/08/building-thousands-of-reproducible-ml-models-with-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>Building Thousands of Reproducible ML Models with pipe, the Automattic Machine 
Learning Pipeline</a> (by Demet Dagdelen)</li><li><a href=https://data.blog/2019/06/10/using-ml-for-campaign-optimization-our-journey-to-marketing-science-at-automattic/ target=_blank rel=noopener>Using ML for Campaign Optimization: Our Journey to Marketing Science at Automattic</a> (by Demet Dagdelen)</li></ul></li><li>End-to-end implementation of automated customer chat tagging. My colleague Charles Earl published <a href=https://data.blog/2017/05/24/may-the-bot-be-with-you-how-algorithms-are-supporting-happiness-at-wordpress-com/ target=_blank rel=noopener>a post on the initial steps of the project</a> around the time I joined the company. I helped get it to production shortly after I joined in 2017, once I was done with my first project that included improved measurement and presentation of key engagement metrics. In other words, I spent my first few months as an analytics engineer, then a few months as a machine learning engineer (classifications that were new or nonexistent back then).</li><li><a href=https://data.blog/2018/03/20/engineering-data-science-at-automattic/ target=_blank rel=noopener>Encouraging the adoption of engineering best practices in data science projects</a>.</li><li><a href=https://data.blog/2019/05/23/data-science-insights-from-cameron-davidson-pilon/ target=_blank rel=noopener>Hosting Cameron Davidson-Pilon for a chat</a> and running internal book clubs and learning groups.</li><li><a href=https://wordpress.com/blog/2020/09/21/toward-zero-reducing-and-offsetting-our-data-center-power-emissions/ target=_blank rel=noopener>Starting and co-leading an employee resource group to promote sustainability at Automattic, which resulted in carbon offsetting based on my research</a>.</li></ul><p>On this website, you can also read about <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>how I ended up joining Automattic</a> and on <a 
href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>some of the reasons behind my decision to leave the company</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/environment/>environment</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on x" href="https://x.com/intent/tweet/?text=My%20work%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&amp;hashtags=Automattic%2ccareer%2ccausalinference%2cdatascience%2cenvironment%2cmachinelearning%2cmarketing%2cremotework%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&amp;title=My%20work%20with%20Automattic&amp;summary=My%20work%20with%20Automattic&amp;source=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&title=My%20work%20with%20Automattic"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 
99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on whatsapp" href="https://api.whatsapp.com/send?text=My%20work%20with%20Automattic%20-%20https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" 
fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on telegram" href="https://telegram.me/share/url?text=My%20work%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 
29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20work%20with%20Automattic&u=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="Automattic,career,causal inference,data science,environment,machine learning,marketing,remote work,software engineering"><meta name=description content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My work with Automattic"><meta property="og:description" content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"><meta property="og:image" content="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-10-07T00:00:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp"><meta name=twitter:title content="My work with Automattic"><meta name=twitter:description content="Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I&rsquo;ve done with the company."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My work with Automattic","item":"https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My work with Automattic","name":"My work with Automattic","description":"Back-dated meta-post that gathers my posts on Automattic blogs into a summary of the work I\u0026rsquo;ve done with the company.","keywords":["Automattic","career","causal inference","data science","environment","machine learning","marketing","remote work","software engineering"],"articleBody":"Automattic is the company behind WordPress.com, Tumblr, Jetpack, WooCommerce, and several other products. I worked with Automattic as a Type B Data Scientist (i.e., I mostly built and deployed code to production) from May 2017 to October 2021. 
This post is back-dated to my last day with the company to make it fit nicely into my post timeline, but I’m actually writing this in July 2023. The magic of time travel! 🪄\nA nice perk of working with Automattic was getting to write about my work on company blogs. When my website was on WordPress.com, I used the reblogging feature to share those posts here, but they never looked great. One of the first projects I completed after leaving Automattic was migrating my site from WordPress.com to Hugo, which made the reblog posts look even worse. Now all those reblogs redirect here, thanks to Hugo’s aliases feature.\nAnyway, here are some highlights from my Automattic work along with links to the relevant posts:\nLeading the build of a unified experimentation platform and spreading causal inference best practices throughout the organisation: ExPlat: Automattic’s Experimentation Platform (by Aaron Yan – Aaron was the team lead, and I was the tech lead for the project) Architecting ExPlat: Automattic’s New Experimentation Platform (by me) ExPlat’s Development Principles and Practices (by me) Co-developing pipe, a bespoke machine learning pipeline that was mostly used for marketing tasks when I was around (and is apparently still going strong in 2023 and beyond): Introducing pipe, The Automattic Machine Learning Pipeline (by Demet Dagdelen – pipe started as a two-person project that we worked on together) How to Increase Retention and Revenue in 1,000 Nontrivial Steps (by me) Building Thousands of Reproducible ML Models with pipe, the Automattic Machine Learning Pipeline (by Demet Dagdelen) Using ML for Campaign Optimization: Our Journey to Marketing Science at Automattic (by Demet Dagdelen) End-to-end implementation of automated customer chat tagging. My colleague Charles Earl published a post on the initial steps of the project around the time I joined the company. 
I helped get it to production shortly after I joined in 2017, once I was done with my first project that included improved measurement and presentation of key engagement metrics. In other words, I spent my first few months as an analytics engineer, then a few months as a machine learning engineer (classifications that were new or nonexistent back then). Encouraging the adoption of engineering best practices in data science projects. Hosting Cameron Davidson-Pilon for a chat and running internal book clubs and learning groups. Starting and co-leading an employee resource group to promote sustainability at Automattic, which resulted in carbon offsetting based on my research. On this website, you can also read about how I ended up joining Automattic and on some of the reasons behind my decision to leave the company.\n","wordCount":"471","inLanguage":"en","image":"https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp","datePublished":"2021-10-07T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/10/07/my-work-with-automattic/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" 
xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My work with Automattic</h1><div class=post-meta><span title='2021-10-07 00:00:00 +0000 UTC'>October 7, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-10-07-my-work-with-automattic/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work_hu4e71d1d731e036718ec371daa39a901d_40330_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work_hu4e71d1d731e036718ec371daa39a901d_40330_480x0_resize_q75_h2_box_2.webp 480w 
,https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp 512w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/bing-yanir-seroussi-automattic-work.webp alt="Bing thinks I looked like this while working at Automattic." width=512 height=481><p>Bing thinks I looked like this while working at Automattic.</p></figure><div class=post-content><p><a href=https://automattic.com/ target=_blank rel=noopener>Automattic</a> is the company behind WordPress.com, Tumblr, Jetpack, WooCommerce, and several other products. I worked with Automattic as a <a href=https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/>Type B Data Scientist</a> (i.e., I mostly built and deployed code to production) from May 2017 to October 2021. This post is back-dated to my last day with the company to make it fit nicely into my post timeline, but I&rsquo;m actually writing this in July 2023. The magic of time travel! 🪄</p><p>A nice perk of working with Automattic was getting to write about my work on company blogs. When my website was on WordPress.com, I used the reblogging feature to share those posts here, but they never looked great. One of the first projects I completed after leaving Automattic was <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrating my site from WordPress.com to Hugo</a>, which made the reblog posts look even worse. 
Now all those reblogs redirect here, thanks <a href=https://gohugo.io/content-management/urls/#aliases target=_blank rel=noopener>to Hugo&rsquo;s aliases feature</a>.</p><p>Anyway, here are some highlights from my Automattic work along with links to the relevant posts:</p><ul><li>Leading the build of a unified experimentation platform and spreading causal inference best practices throughout the organisation:<ul><li><a href=https://data.blog/2021/03/16/explat-automattics-experimentation-platform/ target=_blank rel=noopener>ExPlat: Automattic&rsquo;s Experimentation Platform</a> (by Aaron Yan – Aaron was the team lead, and I was the tech lead for the project)</li><li><a href=https://data.blog/2021/04/14/architecting-explat-automattics-new-experimentation-platform/ target=_blank rel=noopener>Architecting ExPlat: Automattic&rsquo;s New Experimentation Platform</a> (by me)</li><li><a href=https://data.blog/2021/08/06/explats-development-principles-and-practices/ target=_blank rel=noopener>ExPlat&rsquo;s Development Principles and Practices</a> (by me)</li></ul></li><li>Co-developing pipe, a bespoke machine learning pipeline that was mostly used for marketing tasks when I was around (and is apparently still going strong in 2023 and beyond):<ul><li><a href=https://data.blog/2018/11/15/introducing-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>Introducing pipe, The Automattic Machine Learning Pipeline</a> (by Demet Dagdelen – pipe started as a two-person project that we worked on together)</li><li><a href=https://data.blog/2019/01/15/how-to-increase-retention-and-revenue-in-1000-nontrivial-steps/ target=_blank rel=noopener>How to Increase Retention and Revenue in 1,000 Nontrivial Steps</a> (by me)</li><li><a href=https://data.blog/2019/01/08/building-thousands-of-reproducible-ml-models-with-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>Building Thousands of Reproducible ML Models with pipe, the Automattic Machine 
Learning Pipeline</a> (by Demet Dagdelen)</li><li><a href=https://data.blog/2019/06/10/using-ml-for-campaign-optimization-our-journey-to-marketing-science-at-automattic/ target=_blank rel=noopener>Using ML for Campaign Optimization: Our Journey to Marketing Science at Automattic</a> (by Demet Dagdelen)</li></ul></li><li>End-to-end implementation of automated customer chat tagging. My colleague Charles Earl published <a href=https://data.blog/2017/05/24/may-the-bot-be-with-you-how-algorithms-are-supporting-happiness-at-wordpress-com/ target=_blank rel=noopener>a post on the initial steps of the project</a> around the time I joined the company. I helped get it to production shortly after I joined in 2017, once I was done with my first project that included improved measurement and presentation of key engagement metrics. In other words, I spent my first few months as an analytics engineer, then a few months as a machine learning engineer (classifications that were new or nonexistent back then).</li><li><a href=https://data.blog/2018/03/20/engineering-data-science-at-automattic/ target=_blank rel=noopener>Encouraging the adoption of engineering best practices in data science projects</a>.</li><li><a href=https://data.blog/2019/05/23/data-science-insights-from-cameron-davidson-pilon/ target=_blank rel=noopener>Hosting Cameron Davidson-Pilon for a chat</a> and running internal book clubs and learning groups.</li><li><a href=https://wordpress.com/blog/2020/09/21/toward-zero-reducing-and-offsetting-our-data-center-power-emissions/ target=_blank rel=noopener>Starting and co-leading an employee resource group to promote sustainability at Automattic, which resulted in carbon offsetting based on my research</a>.</li></ul><p>On this website, you can also read about <a href=https://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/>how I ended up joining Automattic</a> and on <a 
href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>some of the reasons behind my decision to leave the company</a>.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/automattic/>Automattic</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/environment/>environment</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/remote-work/>remote work</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on x" href="https://x.com/intent/tweet/?text=My%20work%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&amp;hashtags=Automattic%2ccareer%2ccausalinference%2cdatascience%2cenvironment%2cmachinelearning%2cmarketing%2cremotework%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on linkedin" 
href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&amp;title=My%20work%20with%20Automattic&amp;summary=My%20work%20with%20Automattic&amp;source=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f&title=My%20work%20with%20Automattic"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 
99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on whatsapp" href="https://api.whatsapp.com/send?text=My%20work%20with%20Automattic%20-%20https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="0 0 512 512" height="30" width="30" 
fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on telegram" href="https://telegram.me/share/url?text=My%20work%20with%20Automattic&amp;url=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 
29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My work with Automattic on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20work%20with%20Automattic&u=https%3a%2f%2fyanirseroussi.com%2f2021%2f10%2f07%2fmy-work-with-automattic%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.html b/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.html
index 683c84d7a..0cb177c1c 100644
--- a/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.html
+++ b/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Migrating from WordPress.com to Hugo on GitHub + Cloudflare | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Cloudflare,GitHub,Hugo,sustainability,web development,WordPress"><meta name=description content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Migrating from WordPress.com to Hugo on GitHub + Cloudflare"><meta property="og:description" content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a 
summary of the solution components and migration process."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"><meta property="og:image" content="https://yanirseroussi.com/bird-migration.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-11-10T06:30:00+00:00"><meta property="article:modified_time" content="2022-07-31T16:16:05+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/bird-migration.jpg"><meta name=twitter:title content="Migrating from WordPress.com to Hugo on GitHub + Cloudflare"><meta name=twitter:description content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Migrating from WordPress.com to Hugo on GitHub + Cloudflare","item":"https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Migrating from WordPress.com to Hugo on GitHub + Cloudflare","name":"Migrating from WordPress.com to Hugo on GitHub \u002b Cloudflare","description":"My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process.","keywords":["Cloudflare","GitHub","Hugo","sustainability","web development","WordPress"],"articleBody":"Last month, I left Automattic (the company behind WordPress.com) after about 4.5 years of working there as a data scientist. 
As I am moving back into independent consulting, I decided it was time to give my website a facelift and start posting more often. The biggest part of the facelift was migrating off WordPress.com – I now use Hugo for site generation and GitHub + Cloudflare for hosting. This post summarises my reasons for switching and some technical choices I made, which may be useful for people who are considering a similar migration.\nWhy switch from WordPress.com to Hugo? The easiest short-term choice would have been to stick with WordPress.com and spend more time on publishing new posts and working on other projects. However, if I were to start a new personal site today, it’s unlikely I would choose WordPress.com, i.e., not migrating would have been due to inertia. Given that I had the free time to invest in the migration, it seemed worth doing for the following long-term benefits:\nMore control over the styling, content, and layout. On WordPress.com, I was on the AU$60 / year Personal plan (which I got for free while working with Automattic). Adding custom CSS requires upgrading to the AU$120 / year Premium plan. More advanced customisation via WordPress plugins requires paying for the AU$396 / year Business plan. With Hugo, I have full control over the website’s source code – for free. Indeed, it felt liberating to customise my chosen Hugo theme and eliminate inline styling in old posts (which I previously used to work around the custom CSS limitation on WordPress.com).\nBetter editing experience. Since 2014 and through the years when I published the most content, I used the Classic WordPress editor. Not being a fan of heavy WYSIWYG editors, I used to edit posts in HTML mode and focus on the content with minimal markup. Since December 2018, WordPress has shipped with Gutenberg as the default editor. While it is possible to use the Classic editor as a block within Gutenberg, I find the experience too clunky. 
And I’m not the only one: As of November 2021, the Gutenberg plugin has an average rating of 2.1 stars (including many recent one-star reviews), and the Gutenberg repository has over 700 open bugs.\nMoving to Hugo means that I’m free to write my posts in Markdown or HTML using any offline text editor, and experience no surprises when my posts are published. As a bonus, you can see the source of this and all other posts on GitHub. Given that plain text files have been around for far longer than WordPress and Gutenberg, the approach of relying on Markdown for this website is likely to continue working well for decades, even if I end up replacing Hugo. And for a bit of fun, going with Markdown means that GitHub Copilot can try to help with suggestions that range from laughable to eerily insightful.\nGitHub Copilot trying to help with this post Platform-independent follower list. On WordPress.com, about half my followers subscribed via email. The other half used the WordPress.com Reader. Reader subscribers can only be ported to other WordPress sites. With the migration, all subscribers join a single mailing list that is easy to port across service providers.\nLower running costs. Hosting a site with a custom domain on GitHub Pages is free, but mapping a custom domain to a WordPress.com site requires payment. While the plan cost is low compared to the time cost of switching, it’s nice to eliminate recurring payments to WordPress.com.\nLearning opportunity. Not being a web developer, I don’t follow changes in the web development world too closely. Taking more ownership and control over my personal site means that I have to refresh some of my knowledge, i.e., the time spent on the migration wasn’t completely wasted on mindless work.\nNaturally, I identified some potential risks: Hugo is younger and more likely to be abandoned by its developers than WordPress, maintenance tasks may end up being too time-consuming, and I might miss some features offered by WordPress.com. 
Ultimately, I decided that the benefits outweigh the risks, which was just the first in a string of decisions on the journey to move off WordPress.com.\nDecisions, decisions… (or: solution components) WordPress.com is an integrated solution, where many useful features are included even on the Free plan. As such, I would still recommend it for people who are less technically inclined, or to those who aren’t interested in fine control over their website. A good way to appreciate what’s included in WordPress.com is to try to migrate an existing site off the platform. I found the process a bit overwhelming at first, but ultimately I persevered and ended up with the following solution components.\nSite generator: Hugo. The biggest change was in the site generation approach – from the dynamic WordPress to the static Hugo. This switch makes sense for my website: I write a new post every once in a while, and it remains unchanged for years. Hence, the same content gets served tens of thousands of times. Moving to a static site generator obviates the need for a traditional database – my posts are simple Markdown files that Hugo turns to HTML. Together with a bit of CSS and JS, that’s enough to serve the same content forever.\nOf the many static site generation options, I chose Hugo because it seems popular and well-maintained, and because I find its focus on speed attractive. I also like that it’s simple to install and deploy to many hosts, and that its documentation is clear and comprehensive.\nHugo theme: PaperMod. When initially testing Hugo, I went with the Ananke theme from the quick start manual. Then I switched to Beautiful Hugo for its built-in Staticman comment support. When I realised that this support is limited and easy to mimic in other themes, I switched to PaperMod after seeing it on Dan C Williams’s site. PaperMod has a few quirks, but it’s easy to override anything I don’t like (see my tweaks on this site’s repo).\nHost: GitHub Pages. 
I wanted to avoid opening new accounts where possible, so hosting the site on GitHub Pages was a natural and safe choice: It’s backed by a massive company and has been free to use since 2008. I also like GitHub’s sustainability policy, though this should be the standard – any tech company can and should get to at least net zero this decade.\nDNS, CDN, and more: Cloudflare. I’ve used Cloudflare in the past and was impressed with the range of high-quality services they provide for free or for a low price. Therefore, making Cloudflare the DNS and CDN provider for this site was a no-brainer. I’m also planning to use it for my domain registration, as Cloudflare now provides registrar services at wholesale prices. On the sustainability front, Cloudflare is committed to powering its network with 100% renewables – it’s going in the right direction, but it’s not as clear on Scope 3 emissions as GitHub.\nComments: Static display + GitHub issues. By far, the most annoying part of the migration was settling on a solution for comments. The Hugo docs suggest Disqus as the default, but they also note many other options. With about 150 comments over nearly eight years, this site is hardly a vibrant discussion forum – using Disqus feels like an overkill. After a bit of research, I learned about Staticman, which can be self-hosted to turn every comment into a static YAML file that gets rendered by Hugo. I liked the static generation aspect of the Staticman approach, but I didn’t like the idea of complicating things by running another service. Therefore, I settled on my own comment layout and stylesheet, which includes buttons to add new comments as GitHub issues. For this, I found the posts by Khalid Yasoob and Dan C Williams helpful, though I deviated from their solutions.\nAs I was already moderating comments on my WordPress.com site, I doubt that the additional overhead of manually turning issues into YAML files would be unmanageable. 
In any case, I can iterate on my solution by adding issue templates and automating the conversion of issues to YAML. Other than the added processing overhead, a downside of my approach in comparison to Staticman is that it requires commenters to have a GitHub account. Given my audience, I think it’s a reasonable requirement, and it should help mitigate spam. In any case, I’m not married to this solution – I can always switch to Staticman, Disqus, or any other commenting system. That’s the beauty of gaining control over my website.\nContact form: Google. I had a WordPress.com contact form on my About page, which I replaced with an embedded Google Form. As Google Forms don’t have built-in spam protection from anonymous users, my form requires users to log in to Google. This limits options for potential contacts, but I’m also contactable via LinkedIn or GitHub. While all these options require an account, they’re free and backed by companies that are serious about fighting spam. And of course, Google is a sustainability leader.\nMailing list: TinyLetter. As TinyLetter has been around for years and is owned by MailChimp, it feels like a safe choice for managing my current email subscriber list (unless it grows beyond TinyLetter’s limits). In any case, porting an email list is easy, as no one owns email. Unfortunately, I’m unsure about TinyLetter’s sustainability, but with Mailchimp’s recent acquisition by Intuit, I hope it will be covered by Intuit’s ambitious sustainability goals.\nAnalytics: Cloudflare. I considered installing Google Analytics, which I didn’t have on my WordPress.com site because it requires a Premium plan. However, I decided against it given the prevalence of Google Analytics blockers (especially among tech-savvy audiences). Taking a bit of my own advice, I asked myself why I needed analytics? The main reasons are: Verifying the site works as expected, and getting a broad idea of where traffic is coming from and which posts are popular. 
For this, “accurate” view counts are unnecessary, as is close tracking of individuals. Therefore, I went with the lightweight web analytics provided by Cloudflare, which doesn’t collect personal user data. In some respects, it is more limited than the free stats offered by WordPress.com, e.g., Cloudflare’s data retention period is 30 days. But since my focus is operational, I don’t need to retain stats from past months and years – they feel like vanity metrics that won’t change my behaviour.\nSource: Measuring what matters: How to pick a good metric Making the Big Switch The migration process was similar to that described by Yasoob Khalid. Notable changes from Yasoob’s post were excluding the Staticman setup, tweaking the comment conversion script, and importing images into page bundles rather than using the resized images produced by the WordPress-to-Hugo Exporter. Since I had to go post by post to fix various things that broke in the process (e.g., YouTube embeds), I also took the opportunity to manually clean up the image filenames. Once I was happy with the result, I switched the domain mapping on GitHub and Cloudflare, left a note to followers on WordPress.com, and started monitoring traffic via Cloudflare Web Analytics.\nOverall, I’m satisfied with the result. The new layout feels much lighter and less cluttered, but it’s also enriched by features like a dark mode toggle. Lost functionality includes Like buttons, “reblog” options, and the top and bottom menu shown to logged-in WordPress.com users. But these bits feel superfluous – people can still like my posts without a Like button.\nBefore and after look of a recent post As I was eager to finish the initial migration, I avoided spending too much time on non-critical tasks. These include following all the SEO best practices, increasing page speed, applying various style tweaks, and other small changes. 
With more control over my site, I now have the power to incrementally address such tasks over time.\nIn summary, I found the migration rewarding and educational. It was also fun to go through old posts and get motivated to publish more frequently. I’m looking forward to shifting my focus to the content – stay tuned for new posts!\n","wordCount":"2036","inLanguage":"en","image":"https://yanirseroussi.com/bird-migration.jpg","datePublished":"2021-11-10T06:30:00Z","dateModified":"2022-07-31T16:16:05+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Migrating from WordPress.com to Hugo on GitHub + Cloudflare</h1><div class=post-meta><span title='2021-11-10 06:30:00 +0000 UTC'>November 10, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-11-10-migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg alt width=1920 height=938></figure><div class=post-content><p>Last month, I left Automattic (the company behind WordPress.com) after about 4.5 years of working there as a data scientist. As I am moving back into independent consulting, I decided it was time to give my website a facelift and start posting more often. The biggest part of the facelift was migrating off WordPress.com – I now use <a href=https://gohugo.io/ target=_blank rel=noopener>Hugo</a> for site generation and GitHub + Cloudflare for hosting. This post summarises my reasons for switching and some technical choices I made, which may be useful for people who are considering a similar migration.</p><h2 id=why-switch-from-wordpresscom-to-hugo>Why switch from WordPress.com to Hugo?<a hidden class=anchor aria-hidden=true href=#why-switch-from-wordpresscom-to-hugo>#</a></h2><p>The easiest short-term choice would have been to stick with WordPress.com and spend more time on publishing new posts and working on other projects. However, if I were to start a new personal site today, it&rsquo;s unlikely I would choose WordPress.com, i.e., <em>not</em> migrating would have been due to inertia. Given that I had the free time to invest in the migration, it seemed worth doing for the following long-term benefits:</p><ul><li><p><strong>More control over the styling, content, and layout.</strong> On WordPress.com, I was on the AU$60 / year Personal plan (which I got for free while working with Automattic). 
Adding custom CSS requires upgrading to the AU$120 / year Premium plan. More advanced customisation via WordPress plugins requires paying for the AU$396 / year Business plan. With Hugo, I have full control over the website&rsquo;s source code – for free. Indeed, it felt liberating to customise my chosen Hugo theme and eliminate inline styling in old posts (which I previously used to work around the custom CSS limitation on WordPress.com).</p></li><li><p><strong>Better editing experience.</strong> Since 2014 and through the years when I published the most content, I used the Classic WordPress editor. Not being a fan of heavy <a href=https://en.wikipedia.org/wiki/WYSIWYG title="what you see is what you get" target=_blank rel=noopener>WYSIWYG</a> editors, I used to edit posts in HTML mode and focus on the content with minimal markup. Since December 2018, WordPress has shipped with Gutenberg as the default editor. While it is possible to use the Classic editor as a block within Gutenberg, I find the experience too clunky. And I&rsquo;m not the only one: As of November 2021, <a href=https://wordpress.org/support/plugin/gutenberg/reviews/ target=_blank rel=noopener>the Gutenberg plugin has an average rating of 2.1 stars</a> (including many recent one-star reviews), and <a href="https://github.com/WordPress/gutenberg/issues?q=is%3Aissue+is%3Aopen+label%3A%22%5BType%5D+Bug%22" target=_blank rel=noopener>the Gutenberg repository has over 700 open bugs</a>.</p><p>Moving to Hugo means that I&rsquo;m free to write my posts in Markdown or HTML using any offline text editor, and experience no surprises when my posts are published. As a bonus, you can see the source of <a href=https://github.com/yanirs/yanirseroussi.com/tree/master/content/posts target=_blank rel=noopener>this and all other posts</a> on GitHub. 
Given that plain text files have been around for far longer than WordPress and Gutenberg, the approach of relying on Markdown for this website is likely to continue working well for decades, even if I end up replacing Hugo. And for a bit of fun, going with Markdown means that <a href=https://copilot.github.com/ target=_blank rel=noopener>GitHub Copilot</a> can try to help with suggestions that range from laughable to eerily insightful.</p><figure><a href=github-copilot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="Cloudflare,GitHub,Hugo,sustainability,web development,WordPress"><meta name=description content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Migrating from WordPress.com to Hugo on GitHub + Cloudflare"><meta property="og:description" content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a 
summary of the solution components and migration process."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"><meta property="og:image" content="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-11-10T06:30:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg"><meta name=twitter:title content="Migrating from WordPress.com to Hugo on GitHub + Cloudflare"><meta name=twitter:description content="My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Migrating from WordPress.com to Hugo on GitHub + Cloudflare","item":"https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Migrating from WordPress.com to Hugo on GitHub + Cloudflare","name":"Migrating from WordPress.com to Hugo on GitHub \u002b Cloudflare","description":"My reasons for switching from WordPress.com to Hugo on GitHub + Cloudflare, along with a summary of the solution components and migration process.","keywords":["Cloudflare","GitHub","Hugo","sustainability","web development","WordPress"],"articleBody":"Last 
month, I left Automattic (the company behind WordPress.com) after about 4.5 years of working there as a data scientist. As I am moving back into independent consulting, I decided it was time to give my website a facelift and start posting more often. The biggest part of the facelift was migrating off WordPress.com – I now use Hugo for site generation and GitHub + Cloudflare for hosting. This post summarises my reasons for switching and some technical choices I made, which may be useful for people who are considering a similar migration.\nWhy switch from WordPress.com to Hugo? The easiest short-term choice would have been to stick with WordPress.com and spend more time on publishing new posts and working on other projects. However, if I were to start a new personal site today, it’s unlikely I would choose WordPress.com, i.e., not migrating would have been due to inertia. Given that I had the free time to invest in the migration, it seemed worth doing for the following long-term benefits:\nMore control over the styling, content, and layout. On WordPress.com, I was on the AU$60 / year Personal plan (which I got for free while working with Automattic). Adding custom CSS requires upgrading to the AU$120 / year Premium plan. More advanced customisation via WordPress plugins requires paying for the AU$396 / year Business plan. With Hugo, I have full control over the website’s source code – for free. Indeed, it felt liberating to customise my chosen Hugo theme and eliminate inline styling in old posts (which I previously used to work around the custom CSS limitation on WordPress.com).\nBetter editing experience. Since 2014 and through the years when I published the most content, I used the Classic WordPress editor. Not being a fan of heavy WYSIWYG editors, I used to edit posts in HTML mode and focus on the content with minimal markup. Since December 2018, WordPress has shipped with Gutenberg as the default editor. 
While it is possible to use the Classic editor as a block within Gutenberg, I find the experience too clunky. And I’m not the only one: As of November 2021, the Gutenberg plugin has an average rating of 2.1 stars (including many recent one-star reviews), and the Gutenberg repository has over 700 open bugs.\nMoving to Hugo means that I’m free to write my posts in Markdown or HTML using any offline text editor, and experience no surprises when my posts are published. As a bonus, you can see the source of this and all other posts on GitHub. Given that plain text files have been around for far longer than WordPress and Gutenberg, the approach of relying on Markdown for this website is likely to continue working well for decades, even if I end up replacing Hugo. And for a bit of fun, going with Markdown means that GitHub Copilot can try to help with suggestions that range from laughable to eerily insightful.\nGitHub Copilot trying to help with this post Platform-independent follower list. On WordPress.com, about half my followers subscribed via email. The other half used the WordPress.com Reader. Reader subscribers can only be ported to other WordPress sites. With the migration, all subscribers join a single mailing list that is easy to port across service providers.\nLower running costs. Hosting a site with a custom domain on GitHub Pages is free, but mapping a custom domain to a WordPress.com site requires payment. While the plan cost is low compared to the time cost of switching, it’s nice to eliminate recurring payments to WordPress.com.\nLearning opportunity. Not being a web developer, I don’t follow changes in the web development world too closely. 
Taking more ownership and control over my personal site means that I have to refresh some of my knowledge, i.e., the time spent on the migration wasn’t completely wasted on mindless work.\nNaturally, I identified some potential risks: Hugo is younger and more likely to be abandoned by its developers than WordPress, maintenance tasks may end up being too time-consuming, and I might miss some features offered by WordPress.com. Ultimately, I decided that the benefits outweigh the risks, which was just the first in a string of decisions on the journey to move off WordPress.com.\nDecisions, decisions… (or: solution components) WordPress.com is an integrated solution, where many useful features are included even on the Free plan. As such, I would still recommend it for people who are less technically inclined, or to those who aren’t interested in fine control over their website. A good way to appreciate what’s included in WordPress.com is to try to migrate an existing site off the platform. I found the process a bit overwhelming at first, but ultimately I persevered and ended up with the following solution components.\nSite generator: Hugo. The biggest change was in the site generation approach – from the dynamic WordPress to the static Hugo. This switch makes sense for my website: I write a new post every once in a while, and it remains unchanged for years. Hence, the same content gets served tens of thousands of times. Moving to a static site generator obviates the need for a traditional database – my posts are simple Markdown files that Hugo turns to HTML. Together with a bit of CSS and JS, that’s enough to serve the same content forever.\nOf the many static site generation options, I chose Hugo because it seems popular and well-maintained, and because I find its focus on speed attractive. I also like that it’s simple to install and deploy to many hosts, and that its documentation is clear and comprehensive.\nHugo theme: PaperMod. 
When initially testing Hugo, I went with the Ananke theme from the quick start manual. Then I switched to Beautiful Hugo for its built-in Staticman comment support. When I realised that this support is limited and easy to mimic in other themes, I switched to PaperMod after seeing it on Dan C Williams’s site. PaperMod has a few quirks, but it’s easy to override anything I don’t like (see my tweaks on this site’s repo).\nHost: GitHub Pages. I wanted to avoid opening new accounts where possible, so hosting the site on GitHub Pages was a natural and safe choice: It’s backed by a massive company and has been free to use since 2008. I also like GitHub’s sustainability policy, though this should be the standard – any tech company can and should get to at least net zero this decade.\nDNS, CDN, and more: Cloudflare. I’ve used Cloudflare in the past and was impressed with the range of high-quality services they provide for free or for a low price. Therefore, making Cloudflare the DNS and CDN provider for this site was a no-brainer. I’m also planning to use it for my domain registration, as Cloudflare now provides registrar services at wholesale prices. On the sustainability front, Cloudflare is committed to powering its network with 100% renewables – it’s going in the right direction, but it’s not as clear on Scope 3 emissions as GitHub.\nComments: Static display + GitHub issues. By far, the most annoying part of the migration was settling on a solution for comments. The Hugo docs suggest Disqus as the default, but they also note many other options. With about 150 comments over nearly eight years, this site is hardly a vibrant discussion forum – using Disqus feels like an overkill. After a bit of research, I learned about Staticman, which can be self-hosted to turn every comment into a static YAML file that gets rendered by Hugo. I liked the static generation aspect of the Staticman approach, but I didn’t like the idea of complicating things by running another service. 
Therefore, I settled on my own comment layout and stylesheet, which includes buttons to add new comments as GitHub issues. For this, I found the posts by Khalid Yasoob and Dan C Williams helpful, though I deviated from their solutions.\nAs I was already moderating comments on my WordPress.com site, I doubt that the additional overhead of manually turning issues into YAML files would be unmanageable. In any case, I can iterate on my solution by adding issue templates and automating the conversion of issues to YAML. Other than the added processing overhead, a downside of my approach in comparison to Staticman is that it requires commenters to have a GitHub account. Given my audience, I think it’s a reasonable requirement, and it should help mitigate spam. In any case, I’m not married to this solution – I can always switch to Staticman, Disqus, or any other commenting system. That’s the beauty of gaining control over my website.\nContact form: Google. I had a WordPress.com contact form on my About page, which I replaced with an embedded Google Form. As Google Forms don’t have built-in spam protection from anonymous users, my form requires users to log in to Google. This limits options for potential contacts, but I’m also contactable via LinkedIn or GitHub. While all these options require an account, they’re free and backed by companies that are serious about fighting spam. And of course, Google is a sustainability leader.\nMailing list: TinyLetter. As TinyLetter has been around for years and is owned by MailChimp, it feels like a safe choice for managing my current email subscriber list (unless it grows beyond TinyLetter’s limits). In any case, porting an email list is easy, as no one owns email. Unfortunately, I’m unsure about TinyLetter’s sustainability, but with Mailchimp’s recent acquisition by Intuit, I hope it will be covered by Intuit’s ambitious sustainability goals.\nAnalytics: Cloudflare. 
I considered installing Google Analytics, which I didn’t have on my WordPress.com site because it requires a Premium plan. However, I decided against it given the prevalence of Google Analytics blockers (especially among tech-savvy audiences). Taking a bit of my own advice, I asked myself why I needed analytics? The main reasons are: Verifying the site works as expected, and getting a broad idea of where traffic is coming from and which posts are popular. For this, “accurate” view counts are unnecessary, as is close tracking of individuals. Therefore, I went with the lightweight web analytics provided by Cloudflare, which doesn’t collect personal user data. In some respects, it is more limited than the free stats offered by WordPress.com, e.g., Cloudflare’s data retention period is 30 days. But since my focus is operational, I don’t need to retain stats from past months and years – they feel like vanity metrics that won’t change my behaviour.\nSource: Measuring what matters: How to pick a good metric Making the Big Switch The migration process was similar to that described by Yasoob Khalid. Notable changes from Yasoob’s post were excluding the Staticman setup, tweaking the comment conversion script, and importing images into page bundles rather than using the resized images produced by the WordPress-to-Hugo Exporter. Since I had to go post by post to fix various things that broke in the process (e.g., YouTube embeds), I also took the opportunity to manually clean up the image filenames. Once I was happy with the result, I switched the domain mapping on GitHub and Cloudflare, left a note to followers on WordPress.com, and started monitoring traffic via Cloudflare Web Analytics.\nOverall, I’m satisfied with the result. The new layout feels much lighter and less cluttered, but it’s also enriched by features like a dark mode toggle. Lost functionality includes Like buttons, “reblog” options, and the top and bottom menu shown to logged-in WordPress.com users. 
But these bits feel superfluous – people can still like my posts without a Like button.\nBefore and after look of a recent post As I was eager to finish the initial migration, I avoided spending too much time on non-critical tasks. These include following all the SEO best practices, increasing page speed, applying various style tweaks, and other small changes. With more control over my site, I now have the power to incrementally address such tasks over time.\nIn summary, I found the migration rewarding and educational. It was also fun to go through old posts and get motivated to publish more frequently. I’m looking forward to shifting my focus to the content – stay tuned for new posts!\n","wordCount":"2036","inLanguage":"en","image":"https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg","datePublished":"2021-11-10T06:30:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Migrating from WordPress.com to Hugo on GitHub + Cloudflare</h1><div class=post-meta><span title='2021-11-10 06:30:00 +0000 UTC'>November 10, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-11-10-migrating-from-wordpress-com-to-hugo-on-github-cloudflare/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration_hu6b7664f523075193f9f11d79c1c9dcfa_399617_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bird-migration.jpg alt width=1920 height=938></figure><div class=post-content><p>Last month, I left Automattic (the company behind WordPress.com) after about 4.5 years of working there as a data scientist. As I am moving back into independent consulting, I decided it was time to give my website a facelift and start posting more often. The biggest part of the facelift was migrating off WordPress.com – I now use <a href=https://gohugo.io/ target=_blank rel=noopener>Hugo</a> for site generation and GitHub + Cloudflare for hosting. This post summarises my reasons for switching and some technical choices I made, which may be useful for people who are considering a similar migration.</p><h2 id=why-switch-from-wordpresscom-to-hugo>Why switch from WordPress.com to Hugo?<a hidden class=anchor aria-hidden=true href=#why-switch-from-wordpresscom-to-hugo>#</a></h2><p>The easiest short-term choice would have been to stick with WordPress.com and spend more time on publishing new posts and working on other projects. However, if I were to start a new personal site today, it&rsquo;s unlikely I would choose WordPress.com, i.e., <em>not</em> migrating would have been due to inertia. 
Given that I had the free time to invest in the migration, it seemed worth doing for the following long-term benefits:</p><ul><li><p><strong>More control over the styling, content, and layout.</strong> On WordPress.com, I was on the AU$60 / year Personal plan (which I got for free while working with Automattic). Adding custom CSS requires upgrading to the AU$120 / year Premium plan. More advanced customisation via WordPress plugins requires paying for the AU$396 / year Business plan. With Hugo, I have full control over the website&rsquo;s source code – for free. Indeed, it felt liberating to customise my chosen Hugo theme and eliminate inline styling in old posts (which I previously used to work around the custom CSS limitation on WordPress.com).</p></li><li><p><strong>Better editing experience.</strong> Since 2014 and through the years when I published the most content, I used the Classic WordPress editor. Not being a fan of heavy <a href=https://en.wikipedia.org/wiki/WYSIWYG title="what you see is what you get" target=_blank rel=noopener>WYSIWYG</a> editors, I used to edit posts in HTML mode and focus on the content with minimal markup. Since December 2018, WordPress has shipped with Gutenberg as the default editor. While it is possible to use the Classic editor as a block within Gutenberg, I find the experience too clunky. And I&rsquo;m not the only one: As of November 2021, <a href=https://wordpress.org/support/plugin/gutenberg/reviews/ target=_blank rel=noopener>the Gutenberg plugin has an average rating of 2.1 stars</a> (including many recent one-star reviews), and <a href="https://github.com/WordPress/gutenberg/issues?q=is%3Aissue+is%3Aopen+label%3A%22%5BType%5D+Bug%22" target=_blank rel=noopener>the Gutenberg repository has over 700 open bugs</a>.</p><p>Moving to Hugo means that I&rsquo;m free to write my posts in Markdown or HTML using any offline text editor, and experience no surprises when my posts are published. 
As a bonus, you can see the source of <a href=https://github.com/yanirs/yanirseroussi.com/tree/master/content/posts target=_blank rel=noopener>this and all other posts</a> on GitHub. Given that plain text files have been around for far longer than WordPress and Gutenberg, the approach of relying on Markdown for this website is likely to continue working well for decades, even if I end up replacing Hugo. And for a bit of fun, going with Markdown means that <a href=https://copilot.github.com/ target=_blank rel=noopener>GitHub Copilot</a> can try to help with suggestions that range from laughable to eerily insightful.</p><figure><a href=github-copilot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/github-copilot.png 1652w," src=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/github-copilot_hub3762b96d7343316087ed952f41599f3_192815_800x0_resize_box_3.png alt="GitHub Copilot trying to help with this post" loading=lazy></a><figcaption><p>GitHub Copilot trying to help with this post</p></figcaption></figure></li><li><p><strong>Platform-independent follower list.</strong> On WordPress.com, about half my followers subscribed via email. The other half used the WordPress.com Reader. <a href=https://wordpress.com/support/moving-a-blog/moving-your-subscribers/ target=_blank rel=noopener>Reader subscribers can only be ported to other WordPress sites</a>. With the migration, all subscribers join a single mailing list that is easy to port across service providers.</p></li><li><p><strong>Lower running costs.</strong> Hosting a site with a custom domain on GitHub Pages is free, but mapping a custom domain to a WordPress.com site requires payment. While the plan cost is low compared to the time cost of switching, it&rsquo;s nice to eliminate recurring payments to WordPress.com.</p></li><li><p><strong>Learning opportunity.</strong> Not being a web developer, I don&rsquo;t follow changes in the web development world too closely. Taking more ownership and control over my personal site means that I have to refresh some of my knowledge, i.e., the time spent on the migration wasn&rsquo;t completely wasted on mindless work.</p></li></ul><p>Naturally, I identified some potential risks: Hugo is younger and more likely to be abandoned by its developers than WordPress, maintenance tasks may end up being too time-consuming, and I might miss some features offered by WordPress.com. 
Ultimately, I decided that the benefits outweigh the risks, which was just the first in a string of decisions on the journey to move off WordPress.com.</p><h2 id=decisions-decisions-or-solution-components>Decisions, decisions&mldr; (or: solution components)<a hidden class=anchor aria-hidden=true href=#decisions-decisions-or-solution-components>#</a></h2><p>WordPress.com is an integrated solution, where many useful features are included even on the Free plan. As such, I would still recommend it for people who are less technically inclined, or to those who aren&rsquo;t interested in fine control over their website. A good way to appreciate what&rsquo;s included in WordPress.com is to try to migrate an existing site off the platform. I found the process a bit overwhelming at first, but ultimately I persevered and ended up with the following solution components.</p><p><strong>Site generator: Hugo.</strong> The biggest change was in the site generation approach – from the dynamic WordPress to the static Hugo. This switch makes sense for my website: I write a new post every once in a while, and it remains unchanged for years. Hence, the same content gets served tens of thousands of times. Moving to a static site generator obviates the need for a traditional database – <a href=https://github.com/yanirs/yanirseroussi.com/tree/master/content/posts target=_blank rel=noopener>my posts are simple Markdown files</a> that Hugo turns to HTML. Together with a bit of CSS and JS, that&rsquo;s enough to serve the same content <em>forever</em>.</p><p>Of <a href=https://jamstack.org/generators/ target=_blank rel=noopener>the many static site generation options</a>, I chose Hugo because it seems popular and well-maintained, and because I find its focus on speed attractive. 
I also like that it&rsquo;s <a href=https://gohugo.io/getting-started/installing/ target=_blank rel=noopener>simple to install</a> and <a href=https://gohugo.io/hosting-and-deployment/ target=_blank rel=noopener>deploy to many hosts</a>, and that its documentation is clear and comprehensive.</p><p><strong>Hugo theme: PaperMod.</strong> When initially testing Hugo, I went with <a href=https://gohugo.io/getting-started/quick-start/ target=_blank rel=noopener>the Ananke theme from the quick start manual</a>. Then I switched to <a href=https://themes.gohugo.io/themes/beautifulhugo/ target=_blank rel=noopener>Beautiful Hugo</a> for its built-in Staticman comment support. When I realised that this support is limited and easy to mimic in other themes, I switched to <a href=https://github.com/adityatelange/hugo-PaperMod/ target=_blank rel=noopener>PaperMod</a> after seeing it on <a href=https://dancwilliams.com/ target=_blank rel=noopener>Dan C Williams&rsquo;s site</a>. PaperMod has a few quirks, but it&rsquo;s easy to override anything I don&rsquo;t like (see my tweaks <a href=https://github.com/yanirs/yanirseroussi.com target=_blank rel=noopener>on this site&rsquo;s repo</a>).</p><p><strong>Host: GitHub Pages.</strong> I wanted to avoid opening new accounts where possible, so hosting the site on <a href=https://pages.github.com/ target=_blank rel=noopener>GitHub Pages</a> was a natural and safe choice: It&rsquo;s backed by a massive company and has been free to use since 2008. I also like <a href=https://github.blog/2021-04-22-environmental-sustainability-github/ target=_blank rel=noopener>GitHub&rsquo;s sustainability policy</a>, though this should be the standard – <em>any tech company can and should get to at least net zero this decade</em>.</p><p><strong>DNS, CDN, and more: Cloudflare.</strong> I&rsquo;ve used Cloudflare in the past and was impressed with the range of high-quality services they provide for free or for a low price. 
Therefore, making Cloudflare the DNS and CDN provider for this site was a no-brainer. I&rsquo;m also planning to use it for my domain registration, as <a href=https://blog.cloudflare.com/cloudflare-registrar/ target=_blank rel=noopener>Cloudflare now provides registrar services at wholesale prices</a>. On the sustainability front, <a href=https://blog.cloudflare.com/cloudflare-committed-to-building-a-greener-internet/ target=_blank rel=noopener>Cloudflare is committed to powering its network with 100% renewables</a> – it&rsquo;s going in the right direction, but it&rsquo;s not as clear on <a href=https://ghgprotocol.org/sites/default/files/standards_supporting/FAQ.pdf target=_blank rel=noopener>Scope 3 emissions</a> as GitHub.</p><p><strong>Comments: Static display + GitHub issues.</strong> By far, the most annoying part of the migration was settling on a solution for comments. <a href=https://gohugo.io/content-management/comments/ target=_blank rel=noopener>The Hugo docs suggest Disqus as the default</a>, but they also note many other options. With about 150 comments over nearly eight years, this site is hardly a vibrant discussion forum – using Disqus feels like an overkill. After a bit of research, I learned about <a href=https://staticman.net/ target=_blank rel=noopener>Staticman</a>, which can be self-hosted to turn every comment into a static YAML file that gets rendered by Hugo. I liked the static generation aspect of the Staticman approach, but I didn&rsquo;t like the idea of complicating things by running another service. Therefore, I settled on <a href=https://github.com/yanirs/yanirseroussi.com/blob/master/layouts/partials/comments.html target=_blank rel=noopener>my own comment layout</a> and <a href=https://github.com/yanirs/yanirseroussi.com/blob/master/assets/css/extended/comments.css target=_blank rel=noopener>stylesheet</a>, which includes buttons to add new comments as GitHub issues. 
For this, I found the posts by <a href=https://yasoob.me/posts/running_staticman_on_static_hugo_blog_with_nested_comments/ target=_blank rel=noopener>Khalid Yasoob</a> and <a href=https://dancwilliams.com/hugo-staticman-nested-replies-and-email-notifications/ target=_blank rel=noopener>Dan C Williams</a> helpful, though I deviated from their solutions.</p><p>As I was already moderating comments on my WordPress.com site, I doubt that the additional overhead of manually turning issues into YAML files would be unmanageable. In any case, I can iterate on my solution by adding issue templates and automating the conversion of issues to YAML. Other than the added processing overhead, a downside of my approach in comparison to Staticman is that it requires commenters to have a GitHub account. Given my audience, I think it&rsquo;s a reasonable requirement, and it should help mitigate spam. In any case, I&rsquo;m not married to this solution – I can always switch to Staticman, Disqus, or any other commenting system. That&rsquo;s the beauty of gaining control over my website.</p><p><strong>Contact form: Google.</strong> I had a WordPress.com contact form on <a href=https://yanirseroussi.com/about/>my About page</a>, which I replaced with an embedded Google Form. As <a href=https://xfanatical.com/blog/3-ways-to-protect-google-forms-from-spamming/ target=_blank rel=noopener>Google Forms don&rsquo;t have built-in spam protection from anonymous users</a>, my form requires users to log in to Google. This limits options for potential contacts, but I&rsquo;m also contactable via LinkedIn or GitHub. While all these options require an account, they&rsquo;re free and backed by companies that are serious about fighting spam. 
And of course, <a href=https://sustainability.google/ target=_blank rel=noopener>Google is a sustainability leader</a>.</p><p><strong>Mailing list: TinyLetter.</strong> As TinyLetter has been around for years and is owned by MailChimp, it feels like a safe choice for managing my current email subscriber list (unless it grows beyond TinyLetter&rsquo;s limits). In any case, porting an email list is easy, as <a href=https://www.cgpgrey.com/blog/the-professional-sharer target=_blank rel=noopener>no one owns email</a>. Unfortunately, I&rsquo;m unsure about TinyLetter&rsquo;s sustainability, but with <a href=https://techcrunch.com/2021/09/13/intuit-confirms-12b-deal-to-buy-mailchimp/ target=_blank rel=noopener>Mailchimp&rsquo;s recent acquisition by Intuit</a>, I hope it will be covered by <a href=https://www.intuit.com/company/corporate-responsibility/climate/ target=_blank rel=noopener>Intuit&rsquo;s ambitious sustainability goals</a>.</p><p><strong>Analytics: Cloudflare.</strong> I considered installing Google Analytics, which I didn&rsquo;t have on my WordPress.com site because it requires a Premium plan. However, I decided against it given the prevalence of Google Analytics blockers (<a href=https://plausible.io/blog/google-analytics-adblockers-missing-data target=_blank rel=noopener>especially among tech-savvy audiences</a>). Taking a bit of <a href=https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/>my own advice</a>, I asked myself <em>why</em> I needed analytics? The main reasons are: Verifying the site works as expected, and getting a broad idea of where traffic is coming from and which posts are popular. For this, &ldquo;accurate&rdquo; view counts are unnecessary, as is close tracking of individuals. Therefore, I went with <a href=https://developers.cloudflare.com/analytics/web-analytics target=_blank rel=noopener>the lightweight web analytics provided by Cloudflare</a>, which doesn&rsquo;t collect personal user data. 
In some respects, it is more limited than the free stats offered by WordPress.com, e.g., Cloudflare&rsquo;s data retention period is 30 days. But since my focus is operational, I don&rsquo;t need to retain stats from past months and years – they feel like vanity metrics that won&rsquo;t change my behaviour.</p><figure><a href=bad-metric.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bad-metric_hu35a9560a7e4074b1b75b8e55f467b6b1_377557_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/bad-metric_hu35a9560a7e4074b1b75b8e55f467b6b1_377557_480x0_resize_box_3.png 480w,
diff --git a/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.html b/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.html
index 9e03531b3..a4aaf5f29 100644
--- a/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.html
+++ b/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Use your human brain to avoid artificial intelligence disasters | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,data science,deep learning,ethics,fast.ai,machine learning"><meta name=description content="Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Use your human brain to avoid artificial intelligence disasters"><meta property="og:description" content="Overview of a talk I gave at a deep learning course, 
focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"><meta property="og:image" content="https://yanirseroussi.com/think-about-your-modelling-context.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-11-22T03:45:00+00:00"><meta property="article:modified_time" content="2021-11-22T13:52:18+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/think-about-your-modelling-context.png"><meta name=twitter:title content="Use your human brain to avoid artificial intelligence disasters"><meta name=twitter:description content="Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Use your human brain to avoid artificial intelligence disasters","item":"https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Use your human brain to avoid artificial intelligence disasters","name":"Use your human brain to avoid artificial intelligence disasters","description":"Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and consequences of applying AI.","keywords":["artificial intelligence","data science","deep learning","ethics","fast.ai","machine 
learning"],"articleBody":"Earlier this year, I helped mentor a local edition of fast.ai’s Practical Deep Learning for Coders. Each mentor gave a brief talk on a given week’s subject, adding to the material covered in the recorded lectures. My talk (embedded below) supplemented the data ethics lesson. While the mere mention of the word ethics can elicit instant yawns from some people, the main message for me is that it’s critical for humans to think about the context and consequences of deploying machine learning models.\nUnfortunately, this message sometimes gets muddied amidst the outrage about specific applications that conflict with the values of the outraged parties. But I believe it’s possible to transcend narrow moralities and agree that better outcomes arise when humans think deeply about their deep learning systems. Or to put it more bluntly, any fool can build machine learning models, but it takes thoughtful humans to build good artificial intelligence applications.\nSource: Three Panel Soul - dog philosophy Of course, what constitutes good is an open question, which I touched on in the talk. Other key points include:\nThe modelling context is much broader than any machine learning model. Considering context is where human brains shine. Thoughtlessness can have a negative impact on society and on your career. Moral values vary across time, space, cultures, and individuals, e.g., along five moral foundations. Any data scientist, machine learning engineer, or modern human should develop their critical thinking skills. The Calling Bullshit course from the University of Washington is a great starting point – essentially Data Literacy 101. Bullshit is easier to detect than call. Deciding on a level of bullshit calling is like tuning a model’s learning rate. A good chunk of the talk was spent on the case study on criminal machine learning from the Calling Bullshit website. 
I was pleased with the level of engagement on this segment, especially since a lockdown forced us to deliver the class online at short notice. You can watch the full talk below (my part ends after 24 minutes), view the slides here, and check out supplementary materials from all mentors on GitHub.\n","wordCount":"351","inLanguage":"en","image":"https://yanirseroussi.com/think-about-your-modelling-context.png","datePublished":"2021-11-22T03:45:00Z","dateModified":"2021-11-22T13:52:18+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line 
x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Use your human brain to avoid artificial intelligence disasters</h1><div class=post-meta><span title='2021-11-22 03:45:00 +0000 UTC'>November 22, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-11-22-use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/think-about-your-modelling-context.png alt="If you don't think about your modelling context, you're gonna have a bad time."></figure><div class=post-content><p>Earlier this year, I helped mentor a local edition of <a href=https://course.fast.ai/ target=_blank rel=noopener>fast.ai&rsquo;s <em>Practical Deep Learning for Coders</em></a>. Each mentor gave a brief talk on a given week&rsquo;s subject, adding to the material covered in the recorded lectures. My talk (embedded below) supplemented <a href="https://www.youtube.com/watch?v=krIVOb23EH8" target=_blank rel=noopener>the data ethics lesson</a>. 
While the mere mention of the word <em>ethics</em> can elicit instant yawns from some people, the main message for me is that <strong>it&rsquo;s critical for humans to think about the context and consequences of deploying machine learning models</strong>.</p><p>Unfortunately, this message sometimes gets muddied amidst the outrage about specific applications that conflict with the values of the outraged parties. But I believe it&rsquo;s possible to transcend narrow moralities and agree that better outcomes arise when humans think deeply about their deep learning systems. Or to put it more bluntly, <strong>any fool can build machine learning models, but it takes thoughtful humans to build <em>good</em> artificial intelligence applications.</strong></p><figure><a href=dog-philosophy.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,data science,deep learning,ethics,fast.ai,machine learning"><meta name=description content="Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Use your human brain to avoid artificial intelligence disasters"><meta property="og:description" content="Overview of a talk I gave at a deep learning course, 
focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"><meta property="og:image" content="https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/think-about-your-modelling-context.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2021-11-22T03:45:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/think-about-your-modelling-context.png"><meta name=twitter:title content="Use your human brain to avoid artificial intelligence disasters"><meta name=twitter:description content="Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and consequences of applying AI."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Use your human brain to avoid artificial intelligence disasters","item":"https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Use your human brain to avoid artificial intelligence disasters","name":"Use your human brain to avoid artificial intelligence disasters","description":"Overview of a talk I gave at a deep learning course, focusing on AI ethics as the need for humans to think on the context and 
consequences of applying AI.","keywords":["artificial intelligence","data science","deep learning","ethics","fast.ai","machine learning"],"articleBody":"Earlier this year, I helped mentor a local edition of fast.ai’s Practical Deep Learning for Coders. Each mentor gave a brief talk on a given week’s subject, adding to the material covered in the recorded lectures. My talk (embedded below) supplemented the data ethics lesson. While the mere mention of the word ethics can elicit instant yawns from some people, the main message for me is that it’s critical for humans to think about the context and consequences of deploying machine learning models.\nUnfortunately, this message sometimes gets muddied amidst the outrage about specific applications that conflict with the values of the outraged parties. But I believe it’s possible to transcend narrow moralities and agree that better outcomes arise when humans think deeply about their deep learning systems. Or to put it more bluntly, any fool can build machine learning models, but it takes thoughtful humans to build good artificial intelligence applications.\nSource: Three Panel Soul - dog philosophy Of course, what constitutes good is an open question, which I touched on in the talk. Other key points include:\nThe modelling context is much broader than any machine learning model. Considering context is where human brains shine. Thoughtlessness can have a negative impact on society and on your career. Moral values vary across time, space, cultures, and individuals, e.g., along five moral foundations. Any data scientist, machine learning engineer, or modern human should develop their critical thinking skills. The Calling Bullshit course from the University of Washington is a great starting point – essentially Data Literacy 101. Bullshit is easier to detect than call. Deciding on a level of bullshit calling is like tuning a model’s learning rate. 
A good chunk of the talk was spent on the case study on criminal machine learning from the Calling Bullshit website. I was pleased with the level of engagement on this segment, especially since a lockdown forced us to deliver the class online at short notice. You can watch the full talk below (my part ends after 24 minutes), view the slides here, and check out supplementary materials from all mentors on GitHub.\n","wordCount":"351","inLanguage":"en","image":"https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/think-about-your-modelling-context.png","datePublished":"2021-11-22T03:45:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Use your human brain to avoid artificial intelligence disasters</h1><div class=post-meta><span title='2021-11-22 03:45:00 +0000 UTC'>November 22, 2021</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2021-11-22-use-your-human-brain-to-avoid-artificial-intelligence-disasters/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager src=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/think-about-your-modelling-context.png alt="If you don't think about your modelling context, you're gonna have a bad time."></figure><div class=post-content><p>Earlier this year, I helped mentor a local edition of <a href=https://course.fast.ai/ target=_blank rel=noopener>fast.ai&rsquo;s <em>Practical Deep Learning for Coders</em></a>. Each mentor gave a brief talk on a given week&rsquo;s subject, adding to the material covered in the recorded lectures. 
My talk (embedded below) supplemented <a href="https://www.youtube.com/watch?v=krIVOb23EH8" target=_blank rel=noopener>the data ethics lesson</a>. While the mere mention of the word <em>ethics</em> can elicit instant yawns from some people, the main message for me is that <strong>it&rsquo;s critical for humans to think about the context and consequences of deploying machine learning models</strong>.</p><p>Unfortunately, this message sometimes gets muddied amidst the outrage about specific applications that conflict with the values of the outraged parties. But I believe it&rsquo;s possible to transcend narrow moralities and agree that better outcomes arise when humans think deeply about their deep learning systems. Or to put it more bluntly, <strong>any fool can build machine learning models, but it takes thoughtful humans to build <em>good</em> artificial intelligence applications.</strong></p><figure><a href=dog-philosophy.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/dog-philosophy_huc3cf4931803219559fd375fea6b748c2_70831_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/dog-philosophy_huc3cf4931803219559fd375fea6b748c2_70831_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/dog-philosophy_huc3cf4931803219559fd375fea6b748c2_70831_720x0_resize_box_3.png 720w,
diff --git a/2022/01/14/analysis-strategies-in-online-a-b-experiments/index.html b/2022/01/14/analysis-strategies-in-online-a-b-experiments/index.html
index 78e6d4af5..87d61502f 100644
--- a/2022/01/14/analysis-strategies-in-online-a-b-experiments/index.html
+++ b/2022/01/14/analysis-strategies-in-online-a-b-experiments/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="a/b testing,causal inference,data science,marketing,statistics"><meta name=description content="Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials"><meta property="og:description" content="Epidemiologists analyse clinical 
trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"><meta property="og:image" content="https://yanirseroussi.com/online-drug-experiment.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-01-14T00:05:40+00:00"><meta property="article:modified_time" content="2022-01-17T09:00:05+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/online-drug-experiment.jpg"><meta name=twitter:title content="Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials"><meta name=twitter:description content="Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. 
This post applies their strategies to online experiments."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","item":"https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","name":"Analysis strategies in online A\/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","description":"Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments.","keywords":["a/b testing","causal inference","data science","marketing","statistics"],"articleBody":" In theory, there is no difference between theory and practice. In practice, there is.\nBenjamin Brewster Many discussions of online A/B experiments deal with the sunny day scenario: You randomly assign users to groups A and B, expose group A to the control variant and group B to the treatment variant, run statistical tests on your chosen metrics, and assume that metric differences between the groups that aren’t explained by randomness are due to exposure to the treatment.\nHowever, it’s not always a sunny day for the online experimenter. Challenges include dealing with bot traffic and malicious users, and implementation realities that may make users experience both variants or neither of them. While many of these problems have parallels in clinical trials, I haven’t found many resources that explore these parallels. 
In this post, I share some lessons I learned from the rich clinical trial literature while building Automattic’s experimentation platform, focusing on analysis strategies that deal with deviations from the ideal experiment scenario.\nReminder: Why we run A/B experiments Uncontrolled versus controlled experiment While the practice of running online A/B experiments is now commonplace, it’s worth reflecting on why such experiments work. Why can’t we just roll out any treatments we think of, measure the metric changes, and assume that differences beyond what we expect from random variation are due to the genius (or folly) of our implemented treatments?\nWell, it’s not that simple because the world isn’t static. Even if we don’t make any changes, we’re likely to see different outcomes from month to month and day to day, as the world and our user population change. This is represented by the top part of the diagram above: While we’re interested in the causal impact of the Treatment on the Outcome, many Unknowns may affect both. That is, without an A/B experiment, the Unknowns act as confounders that make it impossible to estimate the causal effect without further assumptions.\nWith an ideal A/B experiment, we make exposure to the Treatment depend only on our randomisation mechanism – the Assigner on the bottom part of the diagram. Assuming everything goes to plan, we end up with two distinct groups for which exposure to the Treatment is only due to our randomisation mechanism. This allows us to conclude that any differences in the Outcome across the groups beyond what’s expected from randomness are due to the Treatment.\nHowever, reality is often different from this ideal scenario.\nRunning example To make things more concrete, let’s take a simple example: You run a crypto exchange, and you want to maximise signups from one of your landing pages. The current call-to-action text is “sign up”. 
You’re wondering whether changing it to “sign up today!” would instill a sense of urgency and increase the signup conversion rate (signups divided by unique visitors).\nsign up OR sign up today! A simplified mockup of the variants. Which one would you choose?\nPlacing this scenario into the above diagram, if we were to simply change the text, i.e., apply the Treatment to everyone, we wouldn’t be able to confidently tell whether the text change was the cause of any observed difference in the conversion rate. For example, if our release coincided with a surge of interest in cryptocurrency, this surge may be one of the Unknowns that would cause more motivated users to come to our exchange and sign up. That is, the surge would affect both exposure to the Treatment and the Outcome.\nWhen we run an ideal A/B experiment, we don’t have this problem. Factors like a surge of interest in crypto don’t affect the assignment of users to the control group A (“sign up”) and the treatment group B (“sign up today!”). We can compare the conversion rates across the groups, estimate random variability with our favourite A/B testing calculator, and rejoice. Right?\nWell, not so fast…\nProblems, problems… In the ideal scenario, all the users that were assigned to one of the experiment groups experience their assigned variant and produce a measurable outcome. In our running example, the groups are A: control and B: treatment with a simple exposure of seeing “sign up” for the former and “sign up today!” for the latter. The outcome is a successful signup or an absence of a signup. To make the outcome well-defined, it’s often a good idea to limit outcome measurement to events that happen (or don’t happen) within a reasonable attribution window from exposure or assignment. 
In our example, a reasonable attribution window is probably on the order of hours, as we don’t expect the call-to-action text to have long-lasting effects.\nPotential deviations from the ideal scenario include:\nAssignment of ineligible users. In our running example, these may be bots or users that already have an account. If we include many ineligible users in our analysis, we may underestimate the effect size even if their distribution across groups is uniform. Crossovers. These are users that manage to experience both variants. For example, they may come across our site on mobile with the “sign up today!” text, and then switch to desktop and see the “sign up” message. Depending on the instrumentation we have in place, we may not be able to detect such users, or we may only detect them if they sign up on one device and then log in on the other device. Assignment without exposure. Due to implementation constraints, we may not be guaranteed that assigned users are actually exposed to the treatment and control. In our running example, it may be that the assignment is done on the backend while exposure happens conditionally and asynchronously on the frontend – some users may bounce in the gap between assignment and exposure, and never see the call-to-action text. Multiple exposures. Once a user has been assigned, they may get exposed to the treatment and control multiple times (without crossing over). In our example, they may visit the landing page repeatedly and see the “sign up” or “sign up today!” text multiple times before deciding to sign up. Epidemiologist jargon and analysis strategies While clinical trials are more tightly controlled than online A/B experiments, they are also susceptible to problems like assignment of ineligible patients and non-adherence to treatment (e.g., crossover, non-exposure, and multiple exposures). Hence, much has been written on addressing these problems at the analysis stage. 
However, when researching the topic, overcoming the domain-specific language barrier was a bit of a challenge, as the terminology used by online experimenters is different from the terminology used by epidemiologists. Fortunately, I came across the term intention-to-treat at some point, which opened the door to decades of research on the topic.\nTwo papers I found useful are Intention-to-treat concept: A review (Gupta, 2011) and Guidelines for estimating causal effects in pragmatic randomized trials (Murray, Swanson, and Hernán, 2019). Seeing Miguel Hernán on the author list was an especially positive signal for me, as he is responsible for some of my favourite resources on causal inference, including the most practical book I’ve read on the topic.\nThe definitions and guidelines from these two papers provide a solid foundation for thinking about problems of ineligibility and non-adherence. Specifically, Gupta defines intention-to-treat as an analysis strategy “that includes all randomized patients in the groups to which they were randomly assigned, regardless of their adherence with the entry criteria, regardless of the treatment they actually received, and regardless of subsequent withdrawal from treatment or deviation from the protocol.”\nThere are often good reasons to exclude some randomised participants from analysis. Depending on the exclusions, this may or may not bias the results. The use of conservative exclusions can be described as modified intention-to-treat, which according to Gupta “allows the exclusion of some randomized subjects in a justified way (such as patients who were deemed ineligible after randomization or certain patients who never started treatment). However, the definition given to the modified ITT (mITT) in randomized controlled trials has been found to be irregular and arbitrary because there is a lack of consistent guidelines for its application. 
The mITT analysis allows a subjective approach in entry criteria, which may lead to confusion, inaccurate results and bias.”\nExclusions and further adjustments are usually an attempt to estimate the per-protocol effect, which is defined by Murray, Swanson, and Hernán as “the effect of receiving the assigned treatment strategies throughout the follow-up as specified in the study protocol.” Unfortunately, obtaining a valid estimate of the per-protocol effect isn’t trivial: “To validly estimate the per-protocol effect, baseline variables which predict adherence and are prognostic for the outcome need to be accounted for, either through direct adjustment or via an instrumental variable analysis. Yet two commonly used analytic approaches do not incorporate any such adjustment: (1) Naïve per-protocol analysis, that is, restricting the analytic subset to adherent individuals; and (2) As-treated analysis, that is, comparing individuals based on the treatment they choose.” In other words, if we’re not careful, the per-protocol analysis may become analogous to an uncontrolled experiment, as depicted at the top of the diagram above.\nWhat should be done in practice? From my reading of the clinical trial literature, the tendency is to use multiple analysis strategies. For example, the first guideline noted by Murray, Swanson, and Hernán is: “To adequately guide decision making by all stakeholders, report estimates of both the intention-to-treat effect and the per-protocol effect, as well as methods and key conditions underlying the estimation procedures.” This echoes the 1988 US FDA guidelines that require applicants to provide an intention-to-treat analysis in addition to the applicant’s preferred per-protocol analyses. 
Similarly, the 1998 European Medicines Agency guidelines provide more details on the intention-to-treat, modified intention-to-treat, and per-protocol strategies, stating that: “In general, it is advantageous to demonstrate a lack of sensitivity of the principal trial results to alternative choices of the set of subjects analysed. […] When the full analysis set and the per protocol set lead to essentially the same conclusions, confidence in the trial results is increased, bearing in mind, however, that the need to exclude a substantial proportion of subjects from the per protocol analysis throws some doubt on the overall validity of the trial.”\nWhile the stakes in online experiments are typically much lower than in human drug approval, I believe that applying multiple analysis strategies is still a great idea. We did that for Automattic’s experimentation platform, where we flagged discrepancies between the strategies if they led to conflicting conclusions. One downside of this approach is that it complicates the presentation of results in comparison to using a single strategy. If you face the same challenge, you may draw inspiration from seeing how it’s addressed by the open source frontend of Automattic’s experimentation platform.\nGoing back to our running example, we can perform the following analyses to deal with the deviations noted above:\nIntention-to-treat. Includes all users based on their initial group assignment, regardless of what variant they were exposed to. Modified intention-to-treat: No ineligible users. This applies to cases where we detect the ineligibility after assignment, but the eligibility criteria are based on factors that could have been known before the experiment. Hence, it should be safe to exclude the ineligible users after the fact. In our example, excluding bots and existing users should increase the observed effect size, but not change the preferred variant. Modified intention-to-treat: No crossovers. 
If we have a mechanism to detect some crossovers, excluding them and comparing the results to the intention-to-treat analysis may uncover implementation bugs. It’s worth noting that crossovers shouldn’t occur in cases where we can uniquely identify users at all stages of the experiment – it is a problem that is more likely to occur when dealing with anonymous users, as in our landing page example. As such, and given the inability to detect all crossovers, A/B experiments should be avoided when users are highly motivated to cross over. For example, displaying different price levels based on anonymous and transient identifiers like cookies is often a bad idea. Naive per-protocol: Exposed users. For this analysis, we’d only include users that were exposed to the control and treatment texts. As noted by Murray, Swanson, and Hernán, this is naive because we should adjust our estimates based on variables that predict exposure. However, if missing exposures are only due to the inherent limitations of online experiments, this falls more under the modified intention-to-treat criterion noted by Gupta, of excluding “patients who never started treatment”. Things get more complicated if we wish to use each exposure as a distinct starting point for measuring multiple assignment windows (the multiple exposures scenario above), which is akin to patients choosing their own dosage – far from a controlled experiment. For automated analysis, it’s better to use the first exposure as the attribution window start, as it should be unaffected by the experiment variants. For all analysis approaches, it’s critical to verify that there is no sample ratio mismatch in the analysed population, i.e., that the distribution of users across variants matches what we expect from a random assignment. If this isn’t the case, manual analysis by a qualified data scientist is needed. 
The result of this manual analysis may be that the results should be discarded, as sample ratio mismatches are a common indicator of implementation bugs. This is discussed in detail in the book Trustworthy Online Controlled Experiments, which also includes a chapter on exposure-based analysis (called triggering in the book). Among other recommendations, the authors suggest analysing the unexposed users. If everything goes as expected, metrics for the assigned-but-unexposed populations would behave like A/A experiment metrics, i.e., any differences between the groups should be due to random variability.\nHaving rigorous consistency checks in place and falling back to manual analysis when any discrepancies are detected should help avoid the pitfalls of unsafe user exclusions that’d bias the results. Given the need for careful adjustments to get a valid per-protocol estimate in case anything goes wrong, it is often best to fix any underlying issues and rerun the experiment. Usually, this is much cheaper to do in an online setting than in clinical trials.\nClosing thoughts and further reading Once you move from the theory of experimentation to the practice of running experiments in the real world, you discover the many complexities involved in doing it well. This applies whether you’re an epidemiologist or an online experimenter. As noted in the preface to the trustworthy experiments book: “Getting numbers is easy; getting numbers you can trust is hard!”\nThis post only scratched the surface of one area of experimentation: Deciding what population to analyse once the experiment was run. There is, of course, a lot more to online experimentation and causal inference than what I could cover here. 
But I hope that this message is clear: Approach experimentation with humility, and aim to learn from a broad set of teachers rather than limit yourself to the relatively-recent developments in online experiments.\nAs mentioned above, some resources that are worth reading to learn more include my favourite causal inference book, the trustworthy experiments book, and the guidelines for pragmatic trials. There are also a bunch of resources on my causal inference list, and my post on Bayesian A/B testing should be of interest if you made it to this point. Finally, I’m always happy to discuss these topics, so feel free to contact me or leave a comment with your thoughts.\nCover image by Tumisu from Pixabay\n","wordCount":"2567","inLanguage":"en","image":"https://yanirseroussi.com/online-drug-experiment.jpg","datePublished":"2022-01-14T00:05:40Z","dateModified":"2022-01-17T09:00:05+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials</h1><div class=post-meta><span title='2022-01-14 00:05:40 +0000 UTC'>January 14, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-01-14-analysis-strategies-in-online-a-b-experiments/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg alt width=1920 height=1152></figure><div class=post-content><blockquote><p>In theory, there is no difference between theory and practice. In practice, there is.</p><footer><strong>Benjamin Brewster</strong></footer></blockquote><p>Many discussions of online A/B experiments deal with the sunny day scenario: You randomly assign users to groups A and B, expose group A to the control variant and group B to the treatment variant, run statistical tests on your chosen metrics, and assume that metric differences between the groups that aren&rsquo;t explained by randomness are due to exposure to the treatment.</p><p>However, it&rsquo;s not always a sunny day for the online experimenter. Challenges include dealing with bot traffic and malicious users, and implementation realities that may make users experience both variants or neither of them. While many of these problems have parallels in clinical trials, I haven&rsquo;t found many resources that explore these parallels. 
In this post, I share some lessons I learned from the rich clinical trial literature while building <a href=https://data.blog/category/experimentation-platform/ target=_blank rel=noopener>Automattic&rsquo;s experimentation platform</a>, focusing on analysis strategies that deal with deviations from the ideal experiment scenario.</p><h2 id=reminder-why-we-run-ab-experiments>Reminder: Why we run A/B experiments<a hidden class=anchor aria-hidden=true href=#reminder-why-we-run-ab-experiments>#</a></h2><figure><a href=uncontrolled-versus-controlled-experiment.svg target=_blank rel=noopener><img src=uncontrolled-versus-controlled-experiment.svg alt="Uncontrolled versus controlled experiment" loading=lazy></a><figcaption><p>Uncontrolled versus controlled experiment</p></figcaption></figure><p>While the practice of running online A/B experiments is now commonplace, it&rsquo;s worth reflecting on why such experiments work. Why can&rsquo;t we just roll out any treatments we think of, measure the metric changes, and assume that differences beyond what we expect from random variation are due to the genius (or folly) of our implemented treatments?</p><p>Well, it&rsquo;s not that simple because the world isn&rsquo;t static. Even if we don&rsquo;t make any changes, <a href=https://www.linkedin.com/pulse/how-identify-your-marketing-lies-start-telling-truth-tiberio-caetano/ target=_blank rel=noopener>we&rsquo;re likely to see different outcomes from month to month and day to day</a>, as the world and our user population change. This is represented by the top part of the diagram above: While we&rsquo;re interested in the causal impact of the <code>Treatment</code> on the <code>Outcome</code>, many <code>Unknowns</code> may affect both. 
That is, without an A/B experiment, the <code>Unknowns</code> act as confounders that make it impossible to estimate the causal effect without <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>further assumptions</a>.</p><p>With an ideal A/B experiment, we make exposure to the <code>Treatment</code> depend only on our randomisation mechanism – the <code>Assigner</code> on the bottom part of the diagram. Assuming everything goes to plan, we end up with two distinct groups for which exposure to the <code>Treatment</code> is only due to our randomisation mechanism. This allows us to conclude that any differences in the <code>Outcome</code> across the groups beyond what&rsquo;s expected from randomness are due to the <code>Treatment</code>.</p><p>However, reality is often different from this ideal scenario.</p><h2 id=running-example>Running example<a hidden class=anchor aria-hidden=true href=#running-example>#</a></h2><p>To make things more concrete, let&rsquo;s take a simple example: You run a crypto exchange, and you want to maximise signups from one of your landing pages. The current call-to-action text is <em>&ldquo;sign up&rdquo;</em>. You&rsquo;re wondering whether changing it to <em>&ldquo;sign up today!&rdquo;</em> would instill a sense of urgency and increase the signup conversion rate (signups divided by unique visitors).</p><figure><a class=comment-button href=# onclick='alert("This is variant A: control")' style=float:unset>sign up</a>
+<meta name=keywords content="a/b testing,causal inference,data science,marketing,statistics"><meta name=description content="Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials"><meta property="og:description" content="Epidemiologists analyse clinical 
trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"><meta property="og:image" content="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-01-14T00:05:40+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg"><meta name=twitter:title content="Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials"><meta name=twitter:description content="Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. 
This post applies their strategies to online experiments."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","item":"https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","name":"Analysis strategies in online A\/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials","description":"Epidemiologists analyse clinical trials to estimate the intention-to-treat and per-protocol effects. This post applies their strategies to online experiments.","keywords":["a/b testing","causal inference","data science","marketing","statistics"],"articleBody":" In theory, there is no difference between theory and practice. In practice, there is.\nBenjamin Brewster Many discussions of online A/B experiments deal with the sunny day scenario: You randomly assign users to groups A and B, expose group A to the control variant and group B to the treatment variant, run statistical tests on your chosen metrics, and assume that metric differences between the groups that aren’t explained by randomness are due to exposure to the treatment.\nHowever, it’s not always a sunny day for the online experimenter. Challenges include dealing with bot traffic and malicious users, and implementation realities that may make users experience both variants or neither of them. While many of these problems have parallels in clinical trials, I haven’t found many resources that explore these parallels. 
In this post, I share some lessons I learned from the rich clinical trial literature while building Automattic’s experimentation platform, focusing on analysis strategies that deal with deviations from the ideal experiment scenario.\nReminder: Why we run A/B experiments Uncontrolled versus controlled experiment While the practice of running online A/B experiments is now commonplace, it’s worth reflecting on why such experiments work. Why can’t we just roll out any treatments we think of, measure the metric changes, and assume that differences beyond what we expect from random variation are due to the genius (or folly) of our implemented treatments?\nWell, it’s not that simple because the world isn’t static. Even if we don’t make any changes, we’re likely to see different outcomes from month to month and day to day, as the world and our user population change. This is represented by the top part of the diagram above: While we’re interested in the causal impact of the Treatment on the Outcome, many Unknowns may affect both. That is, without an A/B experiment, the Unknowns act as confounders that make it impossible to estimate the causal effect without further assumptions.\nWith an ideal A/B experiment, we make exposure to the Treatment depend only on our randomisation mechanism – the Assigner on the bottom part of the diagram. Assuming everything goes to plan, we end up with two distinct groups for which exposure to the Treatment is only due to our randomisation mechanism. This allows us to conclude that any differences in the Outcome across the groups beyond what’s expected from randomness are due to the Treatment.\nHowever, reality is often different from this ideal scenario.\nRunning example To make things more concrete, let’s take a simple example: You run a crypto exchange, and you want to maximise signups from one of your landing pages. The current call-to-action text is “sign up”. 
You’re wondering whether changing it to “sign up today!” would instill a sense of urgency and increase the signup conversion rate (signups divided by unique visitors).\nsign up OR sign up today! A simplified mockup of the variants. Which one would you choose?\nPlacing this scenario into the above diagram, if we were to simply change the text, i.e., apply the Treatment to everyone, we wouldn’t be able to confidently tell whether the text change was the cause of any observed difference in the conversion rate. For example, if our release coincided with a surge of interest in cryptocurrency, this surge may be one of the Unknowns that would cause more motivated users to come to our exchange and sign up. That is, the surge would affect both exposure to the Treatment and the Outcome.\nWhen we run an ideal A/B experiment, we don’t have this problem. Factors like a surge of interest in crypto don’t affect the assignment of users to the control group A (“sign up”) and the treatment group B (“sign up today!”). We can compare the conversion rates across the groups, estimate random variability with our favourite A/B testing calculator, and rejoice. Right?\nWell, not so fast…\nProblems, problems… In the ideal scenario, all the users that were assigned to one of the experiment groups experience their assigned variant and produce a measurable outcome. In our running example, the groups are A: control and B: treatment with a simple exposure of seeing “sign up” for the former and “sign up today!” for the latter. The outcome is a successful signup or an absence of a signup. To make the outcome well-defined, it’s often a good idea to limit outcome measurement to events that happen (or don’t happen) within a reasonable attribution window from exposure or assignment. 
In our example, a reasonable attribution window is probably on the order of hours, as we don’t expect the call-to-action text to have long-lasting effects.\nPotential deviations from the ideal scenario include:\nAssignment of ineligible users. In our running example, these may be bots or users that already have an account. If we include many ineligible users in our analysis, we may underestimate the effect size even if their distribution across groups is uniform. Crossovers. These are users that manage to experience both variants. For example, they may come across our site on mobile with the “sign up today!” text, and then switch to desktop and see the “sign up” message. Depending on the instrumentation we have in place, we may not be able to detect such users, or we may only detect them if they sign up on one device and then log in on the other device. Assignment without exposure. Due to implementation constraints, we may not be guaranteed that assigned users are actually exposed to the treatment and control. In our running example, it may be that the assignment is done on the backend while exposure happens conditionally and asynchronously on the frontend – some users may bounce in the gap between assignment and exposure, and never see the call-to-action text. Multiple exposures. Once a user has been assigned, they may get exposed to the treatment and control multiple times (without crossing over). In our example, they may visit the landing page repeatedly and see the “sign up” or “sign up today!” text multiple times before deciding to sign up. Epidemiologist jargon and analysis strategies While clinical trials are more tightly controlled than online A/B experiments, they are also susceptible to problems like assignment of ineligible patients and non-adherence to treatment (e.g., crossover, non-exposure, and multiple exposures). Hence, much has been written on addressing these problems at the analysis stage. 
However, when researching the topic, overcoming the domain-specific language barrier was a bit of a challenge, as the terminology used by online experimenters is different from the terminology used by epidemiologists. Fortunately, I came across the term intention-to-treat at some point, which opened the door to decades of research on the topic.\nTwo papers I found useful are Intention-to-treat concept: A review (Gupta, 2011) and Guidelines for estimating causal effects in pragmatic randomized trials (Murray, Swanson, and Hernán, 2019). Seeing Miguel Hernán on the author list was an especially positive signal for me, as he is responsible for some of my favourite resources on causal inference, including the most practical book I’ve read on the topic.\nThe definitions and guidelines from these two papers provide a solid foundation for thinking about problems of ineligibility and non-adherence. Specifically, Gupta defines intention-to-treat as an analysis strategy “that includes all randomized patients in the groups to which they were randomly assigned, regardless of their adherence with the entry criteria, regardless of the treatment they actually received, and regardless of subsequent withdrawal from treatment or deviation from the protocol.”\nThere are often good reasons to exclude some randomised participants from analysis. Depending on the exclusions, this may or may not bias the results. The use of conservative exclusions can be described as modified intention-to-treat, which according to Gupta “allows the exclusion of some randomized subjects in a justified way (such as patients who were deemed ineligible after randomization or certain patients who never started treatment). However, the definition given to the modified ITT (mITT) in randomized controlled trials has been found to be irregular and arbitrary because there is a lack of consistent guidelines for its application. 
The mITT analysis allows a subjective approach in entry criteria, which may lead to confusion, inaccurate results and bias.”\nExclusions and further adjustments are usually an attempt to estimate the per-protocol effect, which is defined by Murray, Swanson, and Hernán as “the effect of receiving the assigned treatment strategies throughout the follow-up as specified in the study protocol.” Unfortunately, obtaining a valid estimate of the per-protocol effect isn’t trivial: “To validly estimate the per-protocol effect, baseline variables which predict adherence and are prognostic for the outcome need to be accounted for, either through direct adjustment or via an instrumental variable analysis. Yet two commonly used analytic approaches do not incorporate any such adjustment: (1) Naïve per-protocol analysis, that is, restricting the analytic subset to adherent individuals; and (2) As-treated analysis, that is, comparing individuals based on the treatment they choose.” In other words, if we’re not careful, the per-protocol analysis may become analogous to an uncontrolled experiment, as depicted at the top of the diagram above.\nWhat should be done in practice? From my reading of the clinical trial literature, the tendency is to use multiple analysis strategies. For example, the first guideline noted by Murray, Swanson, and Hernán is: “To adequately guide decision making by all stakeholders, report estimates of both the intention-to-treat effect and the per-protocol effect, as well as methods and key conditions underlying the estimation procedures.” This echoes the 1988 US FDA guidelines that require applicants to provide an intention-to-treat analysis in addition to the applicant’s preferred per-protocol analyses. 
Similarly, the 1998 European Medicines Agency guidelines provide more details on the intention-to-treat, modified intention-to-treat, and per-protocol strategies, stating that: “In general, it is advantageous to demonstrate a lack of sensitivity of the principal trial results to alternative choices of the set of subjects analysed. […] When the full analysis set and the per protocol set lead to essentially the same conclusions, confidence in the trial results is increased, bearing in mind, however, that the need to exclude a substantial proportion of subjects from the per protocol analysis throws some doubt on the overall validity of the trial.”\nWhile the stakes in online experiments are typically much lower than in human drug approval, I believe that applying multiple analysis strategies is still a great idea. We did that for Automattic’s experimentation platform, where we flagged discrepancies between the strategies if they led to conflicting conclusions. One downside of this approach is that it complicates the presentation of results in comparison to using a single strategy. If you face the same challenge, you may draw inspiration from seeing how it’s addressed by the open source frontend of Automattic’s experimentation platform.\nGoing back to our running example, we can perform the following analyses to deal with the deviations noted above:\nIntention-to-treat. Includes all users based on their initial group assignment, regardless of what variant they were exposed to. Modified intention-to-treat: No ineligible users. This applies to cases where we detect the ineligibility after assignment, but the eligibility criteria are based on factors that could have been known before the experiment. Hence, it should be safe to exclude the ineligible users after the fact. In our example, excluding bots and existing users should increase the observed effect size, but not change the preferred variant. Modified intention-to-treat: No crossovers. 
If we have a mechanism to detect some crossovers, excluding them and comparing the results to the intention-to-treat analysis may uncover implementation bugs. It’s worth noting that crossovers shouldn’t occur in cases where we can uniquely identify users at all stages of the experiment – it is a problem that is more likely to occur when dealing with anonymous users, as in our landing page example. As such, and given the inability to detect all crossovers, A/B experiments should be avoided when users are highly motivated to cross over. For example, displaying different price levels based on anonymous and transient identifiers like cookies is often a bad idea. Naive per-protocol: Exposed users. For this analysis, we’d only include users that were exposed to the control and treatment texts. As noted by Murray, Swanson, and Hernán, this is naive because we should adjust our estimates based on variables that predict exposure. However, if missing exposures are only due to the inherent limitations of online experiments, this falls more under the modified intention-to-treat criterion noted by Gupta, of excluding “patients who never started treatment”. Things get more complicated if we wish to use each exposure as a distinct starting point for measuring multiple assignment windows (the multiple exposures scenario above), which is akin to patients choosing their own dosage – far from a controlled experiment. For automated analysis, it’s better to use the first exposure as the attribution window start, as it should be unaffected by the experiment variants. For all analysis approaches, it’s critical to verify that there is no sample ratio mismatch in the analysed population, i.e., that the distribution of users across variants matches what we expect from a random assignment. If this isn’t the case, manual analysis by a qualified data scientist is needed. 
The result of this manual analysis may be that the results should be discarded, as sample ratio mismatches are a common indicator of implementation bugs. This is discussed in detail in the book Trustworthy Online Controlled Experiments, which also includes a chapter on exposure-based analysis (called triggering in the book). Among other recommendations, the authors suggest analysing the unexposed users. If everything goes as expected, metrics for the assigned-but-unexposed populations would behave like A/A experiment metrics, i.e., any differences between the groups should be due to random variability.\nHaving rigorous consistency checks in place and falling back to manual analysis when any discrepancies are detected should help avoid the pitfalls of unsafe user exclusions that’d bias the results. Given the need for careful adjustments to get a valid per-protocol estimate in case anything goes wrong, it is often best to fix any underlying issues and rerun the experiment. Usually, this is much cheaper to do in an online setting than in clinical trials.\nClosing thoughts and further reading Once you move from the theory of experimentation to the practice of running experiments in the real world, you discover the many complexities involved in doing it well. This applies whether you’re an epidemiologist or an online experimenter. As noted in the preface to the trustworthy experiments book: “Getting numbers is easy; getting numbers you can trust is hard!”\nThis post only scratched the surface of one area of experimentation: Deciding what population to analyse once the experiment was run. There is, of course, a lot more to online experimentation and causal inference than what I could cover here. 
But I hope that this message is clear: Approach experimentation with humility, and aim to learn from a broad set of teachers rather than limit yourself to the relatively-recent developments in online experiments.\nAs mentioned above, some resources that are worth reading to learn more include my favourite causal inference book, the trustworthy experiments book, and the guidelines for pragmatic trials. There are also a bunch of resources on my causal inference list, and my post on Bayesian A/B testing should be of interest if you made it to this point. Finally, I’m always happy to discuss these topics, so feel free to contact me or leave a comment with your thoughts.\nCover image by Tumisu from Pixabay\n","wordCount":"2567","inLanguage":"en","image":"https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg","datePublished":"2022-01-14T00:05:40Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" 
stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials</h1><div class=post-meta><span title='2022-01-14 00:05:40 +0000 UTC'>January 14, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-01-14-analysis-strategies-in-online-a-b-experiments/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment_hu383e813c0f222ce1ae47728263e063c0_165739_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg 1920w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/online-drug-experiment.jpg alt width=1920 height=1152></figure><div class=post-content><blockquote><p>In theory, there is no difference between theory and practice. In practice, there is.</p><footer><strong>Benjamin Brewster</strong></footer></blockquote><p>Many discussions of online A/B experiments deal with the sunny day scenario: You randomly assign users to groups A and B, expose group A to the control variant and group B to the treatment variant, run statistical tests on your chosen metrics, and assume that metric differences between the groups that aren&rsquo;t explained by randomness are due to exposure to the treatment.</p><p>However, it&rsquo;s not always a sunny day for the online experimenter. Challenges include dealing with bot traffic and malicious users, and implementation realities that may make users experience both variants or neither of them. While many of these problems have parallels in clinical trials, I haven&rsquo;t found many resources that explore these parallels. 
In this post, I share some lessons I learned from the rich clinical trial literature while building <a href=https://data.blog/category/experimentation-platform/ target=_blank rel=noopener>Automattic&rsquo;s experimentation platform</a>, focusing on analysis strategies that deal with deviations from the ideal experiment scenario.</p><h2 id=reminder-why-we-run-ab-experiments>Reminder: Why we run A/B experiments<a hidden class=anchor aria-hidden=true href=#reminder-why-we-run-ab-experiments>#</a></h2><figure><a href=uncontrolled-versus-controlled-experiment.svg target=_blank rel=noopener><img src=uncontrolled-versus-controlled-experiment.svg alt="Uncontrolled versus controlled experiment" loading=lazy></a><figcaption><p>Uncontrolled versus controlled experiment</p></figcaption></figure><p>While the practice of running online A/B experiments is now commonplace, it&rsquo;s worth reflecting on why such experiments work. Why can&rsquo;t we just roll out any treatments we think of, measure the metric changes, and assume that differences beyond what we expect from random variation are due to the genius (or folly) of our implemented treatments?</p><p>Well, it&rsquo;s not that simple because the world isn&rsquo;t static. Even if we don&rsquo;t make any changes, <a href=https://www.linkedin.com/pulse/how-identify-your-marketing-lies-start-telling-truth-tiberio-caetano/ target=_blank rel=noopener>we&rsquo;re likely to see different outcomes from month to month and day to day</a>, as the world and our user population change. This is represented by the top part of the diagram above: While we&rsquo;re interested in the causal impact of the <code>Treatment</code> on the <code>Outcome</code>, many <code>Unknowns</code> may affect both. 
That is, without an A/B experiment, the <code>Unknowns</code> act as confounders that make it impossible to estimate the causal effect without <a href=https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/>further assumptions</a>.</p><p>With an ideal A/B experiment, we make exposure to the <code>Treatment</code> depend only on our randomisation mechanism – the <code>Assigner</code> on the bottom part of the diagram. Assuming everything goes to plan, we end up with two distinct groups for which exposure to the <code>Treatment</code> is only due to our randomisation mechanism. This allows us to conclude that any differences in the <code>Outcome</code> across the groups beyond what&rsquo;s expected from randomness are due to the <code>Treatment</code>.</p><p>However, reality is often different from this ideal scenario.</p><h2 id=running-example>Running example<a hidden class=anchor aria-hidden=true href=#running-example>#</a></h2><p>To make things more concrete, let&rsquo;s take a simple example: You run a crypto exchange, and you want to maximise signups from one of your landing pages. The current call-to-action text is <em>&ldquo;sign up&rdquo;</em>. You&rsquo;re wondering whether changing it to <em>&ldquo;sign up today!&rdquo;</em> would instill a sense of urgency and increase the signup conversion rate (signups divided by unique visitors).</p><figure><a class=comment-button href=# onclick='alert("This is variant A: control")' style=float:unset>sign up</a>
 <small>OR</small>
 <a class=comment-button href=# onclick='alert("This is variant B: treatment")' style=float:unset>sign up today!</a><figcaption><p>A simplified mockup of the variants. Which one would you choose?</p></figcaption></figure><p>Placing this scenario into the above diagram, if we were to simply change the text, i.e., apply the <code>Treatment</code> to everyone, we wouldn&rsquo;t be able to confidently tell whether the text change was the <em>cause</em> of any observed difference in the conversion rate. For example, if our release coincided with a surge of interest in cryptocurrency, this surge may be one of the <code>Unknowns</code> that would cause more motivated users to come to our exchange and sign up. That is, the surge would affect both exposure to the <code>Treatment</code> and the <code>Outcome</code>.</p><p>When we run an ideal A/B experiment, we don&rsquo;t have this problem. Factors like a surge of interest in crypto don&rsquo;t affect the assignment of users to the control group A (<em>&ldquo;sign up&rdquo;</em>) and the treatment group B (<em>&ldquo;sign up today!&rdquo;</em>). We can compare the conversion rates across the groups, estimate random variability with <a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>our favourite A/B testing calculator</a>, and rejoice. Right?</p><p>Well, not so fast&mldr;</p><h2 id=problems-problems>Problems, problems&mldr;<a hidden class=anchor aria-hidden=true href=#problems-problems>#</a></h2><p>In the ideal scenario, all the users that were assigned to one of the experiment groups experience their assigned variant and produce a measurable outcome. In our running example, the groups are <code>A: control</code> and <code>B: treatment</code> with a simple exposure of seeing <em>&ldquo;sign up&rdquo;</em> for the former and <em>&ldquo;sign up today!&rdquo;</em> for the latter. The outcome is a successful signup or an absence of a signup. 
To make the outcome well-defined, it&rsquo;s often a good idea to limit outcome measurement to events that happen (or don&rsquo;t happen) within a reasonable <em>attribution window</em> from exposure or assignment. In our example, a reasonable attribution window is probably on the order of hours, as we don&rsquo;t expect the call-to-action text to have long-lasting effects.</p><p>Potential deviations from the ideal scenario include:</p><ul><li><strong>Assignment of ineligible users.</strong> In our running example, these may be bots or users that already have an account. If we include many ineligible users in our analysis, we may underestimate the effect size even if their distribution across groups is uniform.</li><li><strong>Crossovers.</strong> These are users that manage to experience both variants. For example, they may come across our site on mobile with the <em>&ldquo;sign up today!&rdquo;</em> text, and then switch to desktop and see the <em>&ldquo;sign up&rdquo;</em> message. Depending on the instrumentation we have in place, we may not be able to detect such users, or we may only detect them if they sign up on one device and then log in on the other device.</li><li><strong>Assignment without exposure.</strong> Due to implementation constraints, we may not be guaranteed that assigned users are actually exposed to the treatment and control. In our running example, it may be that the assignment is done on the backend while exposure happens conditionally and asynchronously on the frontend – some users may bounce in the gap between assignment and exposure, and never see the call-to-action text.</li><li><strong>Multiple exposures.</strong> Once a user has been assigned, they may get exposed to the treatment and control multiple times (without crossing over). 
In our example, they may visit the landing page repeatedly and see the <em>&ldquo;sign up&rdquo;</em> or <em>&ldquo;sign up today!&rdquo;</em> text multiple times before deciding to sign up.</li></ul><h2 id=epidemiologist-jargon-and-analysis-strategies>Epidemiologist jargon and analysis strategies<a hidden class=anchor aria-hidden=true href=#epidemiologist-jargon-and-analysis-strategies>#</a></h2><p>While clinical trials are more tightly controlled than online A/B experiments, they are also susceptible to problems like assignment of ineligible patients and non-adherence to treatment (e.g., crossover, non-exposure, and multiple exposures). Hence, much has been written on addressing these problems at the analysis stage. However, when researching the topic, overcoming the domain-specific language barrier was a bit of a challenge, as the terminology used by online experimenters is different from the terminology used by epidemiologists. Fortunately, I came across the term <em>intention-to-treat</em> at some point, which opened the door to decades of research on the topic.</p><p>Two papers I found useful are <a href=https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159210/ target=_blank rel=noopener><em>Intention-to-treat concept: A review</em></a> (Gupta, 2011) and <a href=https://arxiv.org/abs/1911.06030 target=_blank rel=noopener><em>Guidelines for estimating causal effects in pragmatic randomized trials</em></a> (Murray, Swanson, and Hernán, 2019). 
Seeing <a href=https://www.hsph.harvard.edu/miguel-hernan/ target=_blank rel=noopener>Miguel Hernán</a> on the author list was an especially positive signal for me, as he is responsible for <a href=https://yanirseroussi.com/causal-inference-resources/>some of my favourite resources on causal inference</a>, including <a href=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/>the most practical book I&rsquo;ve read on the topic</a>.</p><p>The definitions and guidelines from these two papers provide a solid foundation for thinking about problems of ineligibility and non-adherence. Specifically, Gupta defines intention-to-treat as an analysis strategy <em>&ldquo;that includes all randomized patients in the groups to which they were randomly assigned, regardless of their adherence with the entry criteria, regardless of the treatment they actually received, and regardless of subsequent withdrawal from treatment or deviation from the protocol.&rdquo;</em></p><p>There are often good reasons to exclude some randomised participants from analysis. Depending on the exclusions, this may or may not bias the results. The use of conservative exclusions can be described as modified intention-to-treat, which according to Gupta <em>&ldquo;allows the exclusion of some randomized subjects in a justified way (such as patients who were deemed ineligible after randomization or certain patients who never started treatment). However, the definition given to the modified ITT (mITT) in randomized controlled trials has been found to be irregular and arbitrary because there is a lack of consistent guidelines for its application. 
The mITT analysis allows a subjective approach in entry criteria, which may lead to confusion, inaccurate results and bias.&rdquo;</em></p><p>Exclusions and further adjustments are usually an attempt to estimate the per-protocol effect, which is defined by Murray, Swanson, and Hernán as <em>&ldquo;the effect of receiving the assigned treatment strategies throughout the follow-up as specified in the study protocol.&rdquo;</em> Unfortunately, obtaining a valid estimate of the per-protocol effect isn&rsquo;t trivial: <em>&ldquo;To validly estimate the per-protocol effect, baseline variables which predict adherence and are prognostic for the outcome need to be accounted for, either through direct adjustment or via an instrumental variable analysis. Yet two commonly used analytic approaches do not incorporate any such adjustment: (1) Naïve per-protocol analysis, that is, restricting the analytic subset to adherent individuals; and (2) As-treated analysis, that is, comparing individuals based on the treatment they choose.&rdquo;</em> In other words, if we&rsquo;re not careful, the per-protocol analysis may become analogous to an uncontrolled experiment, as depicted at the top of the diagram above.</p><h2 id=what-should-be-done-in-practice>What should be done in practice?<a hidden class=anchor aria-hidden=true href=#what-should-be-done-in-practice>#</a></h2><p>From my reading of the clinical trial literature, the tendency is to use multiple analysis strategies. 
For example, the first guideline noted by Murray, Swanson, and Hernán is: <em>&ldquo;To adequately guide decision making by all stakeholders, report estimates of both the intention-to-treat effect and the per-protocol effect, as well as methods and key conditions underlying the estimation procedures.&rdquo;</em> This echoes <a href=https://www.fda.gov/regulatory-information/search-fda-guidance-documents/format-and-content-clinical-and-statistical-sections-application target=_blank rel=noopener>the 1988 US FDA guidelines</a> that require applicants to provide an intention-to-treat analysis in addition to the applicant&rsquo;s preferred per-protocol analyses. Similarly, <a href=https://www.ema.europa.eu/en/documents/scientific-guideline/ich-e-9-statistical-principles-clinical-trials-step-5_en.pdf target=_blank rel=noopener>the 1998 European Medicines Agency guidelines</a> provide more details on the intention-to-treat, modified intention-to-treat, and per-protocol strategies, stating that: <em>&ldquo;In general, it is advantageous to demonstrate a lack of sensitivity of the principal trial results to alternative choices of the set of subjects analysed. [&mldr;] When the full analysis set and the per protocol set lead to essentially the same conclusions, confidence in the trial results is increased, bearing in mind, however, that the need to exclude a substantial proportion of subjects from the per protocol analysis throws some doubt on the overall validity of the trial.&rdquo;</em></p><p>While the stakes in online experiments are typically much lower than in human drug approval, I believe that applying multiple analysis strategies is still a great idea. We did that for Automattic&rsquo;s experimentation platform, where we flagged discrepancies between the strategies if they led to conflicting conclusions. One downside of this approach is that it complicates the presentation of results in comparison to using a single strategy. 
If you face the same challenge, you may draw inspiration from seeing how it&rsquo;s addressed by the <a href=https://github.com/Automattic/abacus target=_blank rel=noopener>open source frontend of Automattic&rsquo;s experimentation platform</a>.</p><p>Going back to our running example, we can perform the following analyses to deal with the deviations noted above:</p><ul><li><strong>Intention-to-treat.</strong> Includes all users based on their initial group assignment, regardless of what variant they were exposed to.</li><li><strong>Modified intention-to-treat: No ineligible users.</strong> This applies to cases where we detect the ineligibility after assignment, but the eligibility criteria are based on factors that could have been known before the experiment. Hence, it <em>should</em> be safe to exclude the ineligible users after the fact. In our example, excluding bots and existing users should increase the observed effect size, but not change the preferred variant.</li><li><strong>Modified intention-to-treat: No crossovers.</strong> If we have a mechanism to detect <em>some</em> crossovers, excluding them and comparing the results to the intention-to-treat analysis may uncover implementation bugs. It&rsquo;s worth noting that crossovers shouldn&rsquo;t occur in cases where we can uniquely identify users at all stages of the experiment – it is a problem that is more likely to occur when dealing with anonymous users, as in our landing page example. As such, and given the inability to detect all crossovers, A/B experiments should be avoided when users are highly motivated to cross over. For example, displaying different price levels based on anonymous and transient identifiers like cookies is often a bad idea.</li><li><strong>Naive per-protocol: Exposed users.</strong> For this analysis, we&rsquo;d only include users that were exposed to the control and treatment texts. 
As noted by Murray, Swanson, and Hernán, this is naive because we <em>should</em> adjust our estimates based on variables that predict exposure. However, if missing exposures are only due to the inherent limitations of online experiments, this falls more under the modified intention-to-treat criterion noted by Gupta, of excluding <em>&ldquo;patients who never started treatment&rdquo;</em>. Things get more complicated if we wish to use each exposure as a distinct starting point for measuring multiple attribution windows (the <em>multiple exposures</em> scenario above), which is akin to patients choosing their own dosage – far from a controlled experiment. For automated analysis, it&rsquo;s better to use the first exposure as the attribution window start, as it should be unaffected by the experiment variants.</li></ul><p>For all analysis approaches, it&rsquo;s critical to verify that there is no <em>sample ratio mismatch</em> in the analysed population, i.e., that the distribution of users across variants matches what we expect from a random assignment. If this isn&rsquo;t the case, manual analysis by a qualified data scientist is needed. The result of this manual analysis may be that the results should be discarded, as sample ratio mismatches are a common indicator of implementation bugs. This is discussed in detail in the book <a href=https://experimentguide.com/ target=_blank rel=noopener><em>Trustworthy Online Controlled Experiments</em></a>, which also includes a chapter on exposure-based analysis (called <em>triggering</em> in the book). Among other recommendations, the authors suggest analysing the <em>unexposed</em> users. 
If everything goes as expected, metrics for the assigned-but-unexposed populations would behave like A/A experiment metrics, i.e., any differences between the groups should be due to random variability.</p><p>Having rigorous consistency checks in place and falling back to manual analysis when any discrepancies are detected should help avoid the pitfalls of unsafe user exclusions that&rsquo;d bias the results. Given the need for careful adjustments to get a valid per-protocol estimate in case anything goes wrong, it is often best to fix any underlying issues and rerun the experiment. Usually, this is much cheaper to do in an online setting than in clinical trials.</p><h2 id=closing-thoughts-and-further-reading>Closing thoughts and further reading<a hidden class=anchor aria-hidden=true href=#closing-thoughts-and-further-reading>#</a></h2><p>Once you move from the theory of experimentation to the practice of running experiments in the real world, you discover the many complexities involved in doing it well. This applies whether you&rsquo;re an epidemiologist or an online experimenter. As noted in the preface to the trustworthy experiments book: <em>&ldquo;Getting numbers is easy; getting numbers you can trust is hard!&rdquo;</em></p><p>This post only scratched the surface of one area of experimentation: Deciding what population to analyse once the experiment was run. There is, of course, a lot more to online experimentation and causal inference than what I could cover here. 
But I hope that this message is clear: <strong>Approach experimentation with humility, and aim to learn from a broad set of teachers rather than limit yourself to the relatively-recent developments in online experiments.</strong></p><p>As mentioned above, some resources that are worth reading to learn more include <a href=https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ target=_blank rel=noopener>my favourite causal inference book</a>, <a href=https://experimentguide.com/ target=_blank rel=noopener>the trustworthy experiments book</a>, and <a href=https://arxiv.org/abs/1911.06030 target=_blank rel=noopener>the guidelines for pragmatic trials</a>. There are also a bunch of resources on <a href=https://yanirseroussi.com/causal-inference-resources/>my causal inference list</a>, and <a href=https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/>my post on Bayesian A/B testing</a> should be of interest if you made it to this point. Finally, I&rsquo;m always happy to discuss these topics, so feel free to <a href=https://yanirseroussi.com/about/>contact me</a> or leave a comment with your thoughts.</p><hr><p><small>Cover image by <a href=https://pixabay.com/photos/online-pharmacy-pills-click-3962209/ target=_blank rel=noopener>Tumisu from Pixabay</a></small></p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/a/b-testing/>a/b testing</a></li><li><a href=https://yanirseroussi.com/tags/causal-inference/>causal inference</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/marketing/>marketing</a></li><li><a href=https://yanirseroussi.com/tags/statistics/>statistics</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on x" 
href="https://x.com/intent/tweet/?text=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials&amp;url=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f&amp;hashtags=a%2fbtesting%2ccausalinference%2cdatascience%2cmarketing%2cstatistics"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f&amp;title=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials&amp;summary=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials&amp;source=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 
423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f&title=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 
373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on whatsapp" href="https://api.whatsapp.com/send?text=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials%20-%20https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f"><svg viewBox="0 0 512 512" height="30" 
width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on telegram" 
href="https://telegram.me/share/url?text=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials&amp;url=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials on ycombinator" href="https://news.ycombinator.com/submitlink?t=Analysis%20strategies%20in%20online%20A%2fB%20experiments%3a%20Intention-to-treat%2c%20per-protocol%2c%20and%20other%20lessons%20from%20clinical%20trials&u=https%3a%2f%2fyanirseroussi.com%2f2022%2f01%2f14%2fanalysis-strategies-in-online-a-b-experiments%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. 
Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
diff --git a/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html b/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
index 0f53bb366..ff01dc54d 100644
--- a/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
+++ b/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Building useful machine learning tools keeps getting easier: A fish ID case study | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,data science,deep learning,fast.ai,machine learning,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building useful machine learning tools keeps getting easier: A fish ID case 
study"><meta property="og:description" content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"><meta property="og:image" content="https://yanirseroussi.com/cardinals.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-03-20T04:30:00+00:00"><meta property="article:modified_time" content="2023-07-10T16:35:18+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/cardinals.jpg"><meta name=twitter:title content="Building useful machine learning tools keeps getting easier: A fish ID case study"><meta name=twitter:description content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building useful machine learning tools keeps getting easier: A fish ID case study","item":"https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building useful machine learning tools keeps getting easier: A fish ID case study","name":"Building useful machine learning tools keeps getting easier: A fish ID case study","description":"Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out 
on the latest deep learning developments.","keywords":["artificial intelligence","data science","deep learning","fast.ai","machine learning","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"Being a data scientist is a constant struggle with FOMO (fear of missing out): While you spend your time and attention on one tool, technique, or domain, dozens of other areas keep advancing at breakneck speed. It is impossible to keep up with everything. Fortunately, some advancements make it easy for a single person to accomplish tasks that previously required a team of experts. I covered some aspects of this phenomenon in a previous post: Software commodities are eating interesting data science work. Today’s post covers a specific case study, of how I recently overcame some of my deep learning FOMO by building a fish ID web app.\nBackground Until October last year, I was working as a data scientist with Automattic. I was with the company for about 4.5 years in total. In my final two years, I was the tech lead for the company’s unified experimentation platform. In the two years prior to that, I co-led the implementation of the company’s machine learning pipeline. My interest in causal inference was one of the reasons I got involved with the unified experimentation platform, but this involvement meant I neglected my machine learning skills. Similarly, the machine learning pipeline I worked on was focused on marketing applications with tabular data. This meant that there was no need for me to do anything in computer vision or deep learning for many years. In fact, the last time I touched computer vision was due to deep learning FOMO, back in 2015.\nAround the middle of last year, I helped mentor a local edition of the fast.ai Practical Deep Learning for Coders course. I figured it’d help me catch up on some recent developments, while helping others in the community. 
Given my hobby of volunteering as a scuba diver with the Reef Life Survey (RLS) project, it seemed like a good opportunity to do a side project around automated fish ID. However, the reality of full-time remote work meant that I had little motivation to spend extra time in front of the computer, so that side project never got off the ground.\nFortunately, I decided to leave Automattic and pursue work that better aligns with my values and interests. Rather than jumping into another full-time role, I decided to spend some time exploring and learning – a great antidote to the data science FOMO. First on the agenda after migrating my site off WordPress.com was making progress on the automated fish ID project. While it is still experimental, it’s now live on the RLS website, with the code available in my deep-fish repo.\nThe fish ID tool As far as machine learning applications go, the tool I built isn’t groundbreaking – and that’s exactly the point. Many machine learning apps are boring and “uncool” (fulfilling fast.ai’s goal of making neural nets uncool again). But such apps are often useful. In my case, the tool scratches an itch felt by many RLS volunteers and other divers: Given a photo taken at a certain location, what fish is in the photo?\nThe tool relies on a classification model trained on images from the RLS website. In addition to the model, it lets users filter results based on previously-observed species at RLS sites. The following video demonstrates how it works:\nI built the computer vision model with fast.ai, and the web app with Streamlit. It only took a couple of weeks to put everything together, and it could have easily been faster if I hadn’t taken the time to understand the underlying modelling code and tinker with various things. I’m sure that the model can be improved – my initial modelling attempts yielded a top-10 accuracy of about 60%, which I subsequently improved to about 72%. 
The main challenge is that there are 6,628 images and 2,167 species in the dataset I used, so it’s likely that some species can’t be identified reliably from the available training images.\nYou can read through my modelling experiments in the project’s notebooks. Copyright for the images belongs to the photographers, so I can’t share the full dataset.\nLessons learned Rather than writing too much about the model and the code, which aren’t too unusual, I’d like to share a few lessons I learned while working on this project.\n1. Getting reasonable performance out of a deep learning model can be cheap and easy. This lesson is highlighted in the introduction to the fast.ai course: With a few lines of code (and the right data), it’s easy to train reasonable models. It can also be cheap: I only used my laptop’s GPU for most of the experiments, and relied on Kaggle’s free notebook environment for experiments that I couldn’t run locally. On my dataset, I found that training a bigger (ResNet50) model with Kaggle didn’t improve accuracy in comparison to the smaller (ResNet18) model I could fit into my laptop’s GPU memory. This would definitely vary by dataset, but the point is that reasonable performance doesn’t necessarily require much human or computer work. In fact, much of the time I spent on modelling was for my own benefit, to better understand the material taught by fast.ai. Conceptually, I was pleased to discover that many things remained the same since my last foray into computer vision: Reasonable performance can be obtained by using established techniques and pre-trained architectures, while focusing on the data, the modelling pipeline, and augmentations. In my experience, this principle applies to many machine learning problems. This is summarised well by the directive from Google’s Rules of Machine Learning to “do machine learning like the great engineer you are, not like the great machine learning expert you aren’t.”\n2. 
Building a Streamlit UI feels like magic. I’ve heard about Streamlit years ago, but this was my first time using it. I was impressed with how quickly I could put together a useful app using only Python. I went from a vague idea to a pretty complete implementation in a day (with some additional tinkering in subsequent days). It really is a game changer for data scientists.\n3. Deploying a Streamlit app is a bit less magical. Streamlit Cloud seemed like a straightforward way to deploy Streamlit apps, but I ran into issues because I used a Conda environment. I managed to work around those issues, but it seems like the environment installed on Cloud isn’t truly isolated: Judging by the logs, Streamlit Cloud reads the Conda file and installs the required packages into an existing environment. This results in weird error messages that are hard to debug. I also ran into memory issues, which seem to be un-debuggable with the information provided by Streamlit Cloud. Still, I decided to initially deploy the app to Streamlit Cloud’s free tier and wrap it in an iframe for the RLS website. Given the steep increase in price from the free tier to the lowest paid tier (US$250 / month), it’s likely I’d switch to self-hosting if I run into more issues. This is a disappointing contrast to the magical experience of building the UI, but I hope that Streamlit Cloud would become easier to use with time.\n4. The fast.ai library is a great starting point, despite its quirks. Using fast.ai felt a bit like cheating, in the sense captured by xkcd’s Real Programmers comic. Given the hype, it feels like it should be harder to build useful models – real data scientists use PyTorch directly! But no, in reality it makes sense to use the best tool for the job. And there’s nothing wrong with something being easy or fast, as it lets you spend more time elsewhere. 
In the words of the principles behind the agile manifesto: “Simplicity – the art of maximizing the amount of work not done – is essential.”\nReal Programmers don’t do easy things. Source: xkcd. That said, the fast.ai library isn’t perfect. Debugging can be a bit frustrating, as it tries to do a lot of things automatically and mutates many objects in surprising ways. Its documentation is also somewhat lacking (perhaps due to the use of notebooks as the primary development environment), and its naming conventions can be a bit odd (especially the overuse of acronyms). But these are minor annoyances rather than blockers. It does work well for its main use cases, and it’s possible to go down to the PyTorch level when necessary.\n5. As always, it’s all about the data and how you use it. This is hardly a new lesson for me, but it’s worth reiterating. Given the maturity of computer vision and other machine learning packages, data scientists should focus on getting relevant data and understanding the problem well. As Andrej Karpathy noted in his 2019 recipe for training neural nets, and I said in my 2014 Kaggle tips, you should aim to become one with the data.\n6. FOMO will always be there, but it can be lessened. In general, I care more about making useful things than about using the latest techniques. This is why I prioritised working with RLS to get my tool deployed. Still, FOMO in data science is a well-documented phenomenon, and I suffer from it too. It’s encouraging that – given some free time and a clear head – it’s not that hard to catch up on recent developments. This is made especially easy by the availability of many free resources, like fast.ai. 
The main thing to remember is to focus on principles rather than worry about the million methods and tools that are out there – it was true in 1911, and it’s still true today.\n","wordCount":"1595","inLanguage":"en","image":"https://yanirseroussi.com/cardinals.jpg","datePublished":"2022-03-20T04:30:00Z","dateModified":"2023-07-10T16:35:18+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" 
y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building useful machine learning tools keeps getting easier: A fish ID case study</h1><div class=post-meta><span title='2022-03-20 04:30:00 +0000 UTC'>March 20, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-03-20-building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg 3066w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg alt width=3066 height=1214></figure><div class=post-content><p>Being a data scientist is a constant struggle with FOMO (<em>fear of missing out</em>): While you spend your time and attention on one tool, technique, or domain, dozens of other areas keep advancing at breakneck speed. It is impossible to keep up with everything. Fortunately, some advancements make it easy for a single person to accomplish tasks that previously required a team of experts. I covered some aspects of this phenomenon in a previous post: <a href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/>Software commodities are eating interesting data science work</a>. Today&rsquo;s post covers a specific case study, of how I recently overcame some of my deep learning FOMO by building a fish ID web app.</p><h2 id=background>Background<a hidden class=anchor aria-hidden=true href=#background>#</a></h2><p>Until October last year, I was working as a data scientist with Automattic. I was with the company for about 4.5 years in total. In my final two years, I was the tech lead for <a href=https://data.blog/2021/04/14/architecting-explat-automattics-new-experimentation-platform/ target=_blank rel=noopener>the company&rsquo;s unified experimentation platform</a>. 
In the two years prior to that, I co-led the implementation of <a href=https://data.blog/2018/11/15/introducing-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>the company&rsquo;s machine learning pipeline</a>. My interest in causal inference was one of the reasons I got involved with the unified experimentation platform, but this involvement meant I neglected my machine learning skills. Similarly, the machine learning pipeline I worked on was focused on marketing applications with tabular data. This meant that there was no need for me to do anything in computer vision or deep learning for many years. In fact, the last time I touched computer vision was <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>due to deep learning FOMO, back in 2015</a>.</p><p>Around the middle of last year, <a href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/>I helped mentor a local edition of the fast.ai <em>Practical Deep Learning for Coders</em> course</a>. I figured it&rsquo;d help me catch up on some recent developments, while helping others in the community. Given my hobby of <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>volunteering as a scuba diver with the Reef Life Survey (RLS) project</a>, it seemed like a good opportunity to do a side project around automated fish ID. However, the reality of full-time remote work meant that I had little motivation to spend extra time in front of the computer, so that side project never got off the ground.</p><p>Fortunately, I decided to leave Automattic and pursue work that better aligns with my values and interests. Rather than jumping into another full-time role, I decided to spend some time exploring and learning – a great antidote to the data science FOMO. 
First on the agenda after <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrating my site off WordPress.com</a> was making progress on the automated fish ID project. While it is still experimental, <a href=https://reeflifesurvey.com/fish-id/ target=_blank rel=noopener>it&rsquo;s now live on the RLS website</a>, with the code available in <a href=https://github.com/yanirs/deep-fish target=_blank rel=noopener>my deep-fish repo</a>.</p><h2 id=the-fish-id-tool>The fish ID tool<a hidden class=anchor aria-hidden=true href=#the-fish-id-tool>#</a></h2><p>As far as machine learning applications go, the tool I built isn&rsquo;t groundbreaking – and that&rsquo;s exactly the point. Many machine learning apps are boring and &ldquo;uncool&rdquo; (fulling <a href=https://www.fast.ai/about/#slogan target=_blank rel=noopener>fast.ai&rsquo;s goal</a> of <em>making neural nets uncool again</em>). But such apps are often useful. In my case, the tool scratches an itch felt by many RLS volunteers and other divers: <em>Given a photo taken at a certain location, what fish is in the photo?</em></p><p>The tool relies on a classification model trained on images from the RLS website. In addition to the model, it lets users filter results based on previously-observed species at RLS sites. The following video demonstrates how it works:</p><p style=text-align:center><iframe src=https://drive.google.com/file/d/1hpfRY26ZQXHzhYpAIP-3ShsqpuCSXfth/preview width=640 height=480 allow=autoplay></iframe></p><p>I built the computer vision model with fast.ai, and the web app with <a href=https://streamlit.io/ target=_blank rel=noopener>Streamlit</a>. It only took a couple of weeks to put everything together, and it could have easily been faster if I hadn&rsquo;t taken the time to understand the underlying modelling code and tinker with various things. 
I&rsquo;m sure that the model can be improved – my initial modelling attempts yielded a top-10 accuracy of about 60%, which I subsequently improved to about 72%. The main challenge is that there are 6,628 images and 2,167 species in the dataset I used, so it&rsquo;s likely that some species can&rsquo;t be identified reliably from the available training images.</p><p>You can read through my modelling experiments in <a href=https://github.com/yanirs/deep-fish/tree/master/notebooks target=_blank rel=noopener>the project&rsquo;s notebooks</a>. Copyright for the images belongs to the photographers, so I can&rsquo;t share the full dataset.</p><h2 id=lessons-learned>Lessons learned<a hidden class=anchor aria-hidden=true href=#lessons-learned>#</a></h2><p>Rather than writing too much about the model and the code, which aren&rsquo;t too unusual, I&rsquo;d like to share a few lessons I learned while working on this project.</p><p><strong>1. Getting reasonable performance out of a deep learning model can be cheap and easy.</strong> This lesson is highlighted in <a href=https://course.fast.ai/ target=_blank rel=noopener>the introduction to the fast.ai course</a>: With a few lines of code (and the right data), it&rsquo;s easy to train reasonable models. It can also be cheap: I only used my laptop&rsquo;s GPU for most of the experiments, and relied on Kaggle&rsquo;s free notebook environment for experiments that I couldn&rsquo;t run locally. On my dataset, I found that training a bigger (ResNet50) model with Kaggle didn&rsquo;t improve accuracy in comparison to the smaller (ResNet18) model I could fit into my laptop&rsquo;s GPU memory. This would definitely vary by dataset, but the point is that reasonable performance doesn&rsquo;t necessarily require much human or computer work. In fact, much of the time I spent on modelling was for my own benefit, to better understand the material taught by fast.ai. 
Conceptually, I was pleased to discover that many things remained the same since my last foray into computer vision: Reasonable performance can be obtained by using established techniques and pre-trained architectures, while focusing on the data, the modelling pipeline, and augmentations. In my experience, this principle applies to many machine learning problems. This is summarised well by the directive from <a href=https://developers.google.com/machine-learning/guides/rules-of-ml/ target=_blank rel=noopener>Google&rsquo;s Rules of Machine Learning</a> to <em>&ldquo;do machine learning like the great engineer you are, not like the great machine learning expert you aren&rsquo;t.&rdquo;</em></p><p><strong>2. Building a Streamlit UI feels like magic.</strong> I&rsquo;ve heard about Streamlit years ago, but this was my first time using it. I was impressed with how quickly I could put together a useful app using only Python. I went from a vague idea to a pretty complete implementation in a day (with some additional tinkering in subsequent days). It really is a game changer for data scientists.</p><p><strong>3. Deploying a Streamlit app is a bit less magical.</strong> Streamlit Cloud seemed like a straightforward way to deploy Streamlit apps, but I ran into issues because I used a Conda environment. I managed to work around those issues, but it seems like the environment installed on Cloud isn&rsquo;t truly isolated: Judging by the logs, Streamlit Cloud reads the Conda file and installs the required packages into an existing environment. This results in weird error messages that are hard to debug. I also ran into memory issues, which seem to be un-debuggable with the information provided by Streamlit Cloud. Still, I decided to initially deploy the app to Streamlit Cloud&rsquo;s free tier and wrap it in an iframe for the RLS website. 
Given the steep increase in price from the free tier to the lowest paid tier (US$250 / month), it&rsquo;s likely I&rsquo;d switch to self-hosting if I run into more issues. This is a disappointing contrast to the magical experience of building the UI, but I hope that Streamlit Cloud would become easier to use with time.</p><p><strong>4. The fast.ai library is a great starting point, despite its quirks.</strong> Using fast.ai felt a bit like cheating, in the sense captured by xkcd&rsquo;s Real Programmers comic. Given the hype, it feels like it should be harder to build useful models – <em>real data scientists use PyTorch directly!</em> But no, in reality it makes sense to use the best tool for the job. And there&rsquo;s nothing wrong with something being easy or fast, as it lets you spend more time elsewhere. In the words of <a href=https://agilemanifesto.org/principles.html target=_blank rel=noopener>the principles behind the agile manifesto</a>: <em>&ldquo;Simplicity – the art of maximizing the amount of work not done – is essential.&rdquo;</em></p><figure class=white-bg><a href=real-programmers-xkcd.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,data science,deep learning,fast.ai,machine learning,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Building useful machine learning tools keeps getting easier: A fish ID case 
study"><meta property="og:description" content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"><meta property="og:image" content="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-03-20T04:30:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg"><meta name=twitter:title content="Building useful machine learning tools keeps getting easier: A fish ID case study"><meta name=twitter:description content="Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Building useful machine learning tools keeps getting easier: A fish ID case study","item":"https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Building useful machine learning tools keeps getting easier: A fish ID case study","name":"Building useful machine learning 
tools keeps getting easier: A fish ID case study","description":"Lessons learned building a fish ID web app with fast.ai and Streamlit, in an attempt to reduce my fear of missing out on the latest deep learning developments.","keywords":["artificial intelligence","data science","deep learning","fast.ai","machine learning","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"Being a data scientist is a constant struggle with FOMO (fear of missing out): While you spend your time and attention on one tool, technique, or domain, dozens of other areas keep advancing at breakneck speed. It is impossible to keep up with everything. Fortunately, some advancements make it easy for a single person to accomplish tasks that previously required a team of experts. I covered some aspects of this phenomenon in a previous post: Software commodities are eating interesting data science work. Today’s post covers a specific case study, of how I recently overcame some of my deep learning FOMO by building a fish ID web app.\nBackground Until October last year, I was working as a data scientist with Automattic. I was with the company for about 4.5 years in total. In my final two years, I was the tech lead for the company’s unified experimentation platform. In the two years prior to that, I co-led the implementation of the company’s machine learning pipeline. My interest in causal inference was one of the reasons I got involved with the unified experimentation platform, but this involvement meant I neglected my machine learning skills. Similarly, the machine learning pipeline I worked on was focused on marketing applications with tabular data. This meant that there was no need for me to do anything in computer vision or deep learning for many years. 
In fact, the last time I touched computer vision was due to deep learning FOMO, back in 2015.\nAround the middle of last year, I helped mentor a local edition of the fast.ai Practical Deep Learning for Coders course. I figured it’d help me catch up on some recent developments, while helping others in the community. Given my hobby of volunteering as a scuba diver with the Reef Life Survey (RLS) project, it seemed like a good opportunity to do a side project around automated fish ID. However, the reality of full-time remote work meant that I had little motivation to spend extra time in front of the computer, so that side project never got off the ground.\nFortunately, I decided to leave Automattic and pursue work that better aligns with my values and interests. Rather than jumping into another full-time role, I decided to spend some time exploring and learning – a great antidote to the data science FOMO. First on the agenda after migrating my site off WordPress.com was making progress on the automated fish ID project. While it is still experimental, it’s now live on the RLS website, with the code available in my deep-fish repo.\nThe fish ID tool As far as machine learning applications go, the tool I built isn’t groundbreaking – and that’s exactly the point. Many machine learning apps are boring and “uncool” (fulfilling fast.ai’s goal of making neural nets uncool again). But such apps are often useful. In my case, the tool scratches an itch felt by many RLS volunteers and other divers: Given a photo taken at a certain location, what fish is in the photo?\nThe tool relies on a classification model trained on images from the RLS website. In addition to the model, it lets users filter results based on previously-observed species at RLS sites. The following video demonstrates how it works:\nI built the computer vision model with fast.ai, and the web app with Streamlit. 
It only took a couple of weeks to put everything together, and it could have easily been faster if I hadn’t taken the time to understand the underlying modelling code and tinker with various things. I’m sure that the model can be improved – my initial modelling attempts yielded a top-10 accuracy of about 60%, which I subsequently improved to about 72%. The main challenge is that there are 6,628 images and 2,167 species in the dataset I used, so it’s likely that some species can’t be identified reliably from the available training images.\nYou can read through my modelling experiments in the project’s notebooks. Copyright for the images belongs to the photographers, so I can’t share the full dataset.\nLessons learned Rather than writing too much about the model and the code, which aren’t too unusual, I’d like to share a few lessons I learned while working on this project.\n1. Getting reasonable performance out of a deep learning model can be cheap and easy. This lesson is highlighted in the introduction to the fast.ai course: With a few lines of code (and the right data), it’s easy to train reasonable models. It can also be cheap: I only used my laptop’s GPU for most of the experiments, and relied on Kaggle’s free notebook environment for experiments that I couldn’t run locally. On my dataset, I found that training a bigger (ResNet50) model with Kaggle didn’t improve accuracy in comparison to the smaller (ResNet18) model I could fit into my laptop’s GPU memory. This would definitely vary by dataset, but the point is that reasonable performance doesn’t necessarily require much human or computer work. In fact, much of the time I spent on modelling was for my own benefit, to better understand the material taught by fast.ai. 
Conceptually, I was pleased to discover that many things remained the same since my last foray into computer vision: Reasonable performance can be obtained by using established techniques and pre-trained architectures, while focusing on the data, the modelling pipeline, and augmentations. In my experience, this principle applies to many machine learning problems. This is summarised well by the directive from Google’s Rules of Machine Learning to “do machine learning like the great engineer you are, not like the great machine learning expert you aren’t.”\n2. Building a Streamlit UI feels like magic. I heard about Streamlit years ago, but this was my first time using it. I was impressed with how quickly I could put together a useful app using only Python. I went from a vague idea to a pretty complete implementation in a day (with some additional tinkering in subsequent days). It really is a game changer for data scientists.\n3. Deploying a Streamlit app is a bit less magical. Streamlit Cloud seemed like a straightforward way to deploy Streamlit apps, but I ran into issues because I used a Conda environment. I managed to work around those issues, but it seems like the environment installed on Cloud isn’t truly isolated: Judging by the logs, Streamlit Cloud reads the Conda file and installs the required packages into an existing environment. This results in weird error messages that are hard to debug. I also ran into memory issues, which seem to be un-debuggable with the information provided by Streamlit Cloud. Still, I decided to initially deploy the app to Streamlit Cloud’s free tier and wrap it in an iframe for the RLS website. Given the steep increase in price from the free tier to the lowest paid tier (US$250 / month), it’s likely I’d switch to self-hosting if I run into more issues. This is a disappointing contrast to the magical experience of building the UI, but I hope that Streamlit Cloud will become easier to use with time.\n4. 
The fast.ai library is a great starting point, despite its quirks. Using fast.ai felt a bit like cheating, in the sense captured by xkcd’s Real Programmers comic. Given the hype, it feels like it should be harder to build useful models – real data scientists use PyTorch directly! But no, in reality it makes sense to use the best tool for the job. And there’s nothing wrong with something being easy or fast, as it lets you spend more time elsewhere. In the words of the principles behind the agile manifesto: “Simplicity – the art of maximizing the amount of work not done – is essential.”\nReal Programmers don’t do easy things. Source: xkcd. That said, the fast.ai library isn’t perfect. Debugging can be a bit frustrating, as it tries to do a lot of things automatically and mutates many objects in surprising ways. Its documentation is also somewhat lacking (perhaps due to the use of notebooks as the primary development environment), and its naming conventions can be a bit odd (especially the overuse of acronyms). But these are minor annoyances rather than blockers. It does work well for its main use cases, and it’s possible to go down to the PyTorch level when necessary.\n5. As always, it’s all about the data and how you use it. This is hardly a new lesson for me, but it’s worth reiterating. Given the maturity of computer vision and other machine learning packages, data scientists should focus on getting relevant data and understanding the problem well. As Andrej Karpathy noted in his 2019 recipe for training neural nets, and I said in my 2014 Kaggle tips, you should aim to become one with the data.\n6. FOMO will always be there, but it can be lessened. In general, I care more about making useful things than about using the latest techniques. This is why I prioritised working with RLS to get my tool deployed. Still, FOMO in data science is a well-documented phenomenon, and I suffer from it too. 
It’s encouraging that – given some free time and a clear head – it’s not that hard to catch up on recent developments. This is made especially easy by the availability of many free resources, like fast.ai. The main thing to remember is to focus on principles rather than worry about the million methods and tools that are out there – it was true in 1911, and it’s still true today.\n","wordCount":"1595","inLanguage":"en","image":"https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg","datePublished":"2022-03-20T04:30:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" 
stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Building useful machine learning tools keeps getting easier: A fish ID case study</h1><div class=post-meta><span title='2022-03-20 04:30:00 +0000 UTC'>March 20, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-03-20-building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_720x0_resize_q75_box.jpg 720w 
,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals_hu2278fbb0a04afae4f432aacc3e29a944_909882_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg 3066w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/cardinals.jpg alt width=3066 height=1214></figure><div class=post-content><p>Being a data scientist is a constant struggle with FOMO (<em>fear of missing out</em>): While you spend your time and attention on one tool, technique, or domain, dozens of other areas keep advancing at breakneck speed. It is impossible to keep up with everything. Fortunately, some advancements make it easy for a single person to accomplish tasks that previously required a team of experts. I covered some aspects of this phenomenon in a previous post: <a href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/>Software commodities are eating interesting data science work</a>. Today&rsquo;s post covers a specific case study, of how I recently overcame some of my deep learning FOMO by building a fish ID web app.</p><h2 id=background>Background<a hidden class=anchor aria-hidden=true href=#background>#</a></h2><p>Until October last year, I was working as a data scientist with Automattic. I was with the company for about 4.5 years in total. 
In my final two years, I was the tech lead for <a href=https://data.blog/2021/04/14/architecting-explat-automattics-new-experimentation-platform/ target=_blank rel=noopener>the company&rsquo;s unified experimentation platform</a>. In the two years prior to that, I co-led the implementation of <a href=https://data.blog/2018/11/15/introducing-pipe-the-automattic-machine-learning-pipeline/ target=_blank rel=noopener>the company&rsquo;s machine learning pipeline</a>. My interest in causal inference was one of the reasons I got involved with the unified experimentation platform, but this involvement meant I neglected my machine learning skills. Similarly, the machine learning pipeline I worked on was focused on marketing applications with tabular data. This meant that there was no need for me to do anything in computer vision or deep learning for many years. In fact, the last time I touched computer vision was <a href=https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/>due to deep learning FOMO, back in 2015</a>.</p><p>Around the middle of last year, <a href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/>I helped mentor a local edition of the fast.ai <em>Practical Deep Learning for Coders</em> course</a>. I figured it&rsquo;d help me catch up on some recent developments, while helping others in the community. Given my hobby of <a href=https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/>volunteering as a scuba diver with the Reef Life Survey (RLS) project</a>, it seemed like a good opportunity to do a side project around automated fish ID. However, the reality of full-time remote work meant that I had little motivation to spend extra time in front of the computer, so that side project never got off the ground.</p><p>Fortunately, I decided to leave Automattic and pursue work that better aligns with my values and interests. 
Rather than jumping into another full-time role, I decided to spend some time exploring and learning – a great antidote to the data science FOMO. First on the agenda after <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrating my site off WordPress.com</a> was making progress on the automated fish ID project. While it is still experimental, <a href=https://reeflifesurvey.com/fish-id/ target=_blank rel=noopener>it&rsquo;s now live on the RLS website</a>, with the code available in <a href=https://github.com/yanirs/deep-fish target=_blank rel=noopener>my deep-fish repo</a>.</p><h2 id=the-fish-id-tool>The fish ID tool<a hidden class=anchor aria-hidden=true href=#the-fish-id-tool>#</a></h2><p>As far as machine learning applications go, the tool I built isn&rsquo;t groundbreaking – and that&rsquo;s exactly the point. Many machine learning apps are boring and &ldquo;uncool&rdquo; (fulling <a href=https://www.fast.ai/about/#slogan target=_blank rel=noopener>fast.ai&rsquo;s goal</a> of <em>making neural nets uncool again</em>). But such apps are often useful. In my case, the tool scratches an itch felt by many RLS volunteers and other divers: <em>Given a photo taken at a certain location, what fish is in the photo?</em></p><p>The tool relies on a classification model trained on images from the RLS website. In addition to the model, it lets users filter results based on previously-observed species at RLS sites. The following video demonstrates how it works:</p><p style=text-align:center><iframe src=https://drive.google.com/file/d/1hpfRY26ZQXHzhYpAIP-3ShsqpuCSXfth/preview width=640 height=480 allow=autoplay></iframe></p><p>I built the computer vision model with fast.ai, and the web app with <a href=https://streamlit.io/ target=_blank rel=noopener>Streamlit</a>. 
It only took a couple of weeks to put everything together, and it could have easily been faster if I hadn&rsquo;t taken the time to understand the underlying modelling code and tinker with various things. I&rsquo;m sure that the model can be improved – my initial modelling attempts yielded a top-10 accuracy of about 60%, which I subsequently improved to about 72%. The main challenge is that there are 6,628 images and 2,167 species in the dataset I used, so it&rsquo;s likely that some species can&rsquo;t be identified reliably from the available training images.</p><p>You can read through my modelling experiments in <a href=https://github.com/yanirs/deep-fish/tree/master/notebooks target=_blank rel=noopener>the project&rsquo;s notebooks</a>. Copyright for the images belongs to the photographers, so I can&rsquo;t share the full dataset.</p><h2 id=lessons-learned>Lessons learned<a hidden class=anchor aria-hidden=true href=#lessons-learned>#</a></h2><p>Rather than writing too much about the model and the code, which aren&rsquo;t too unusual, I&rsquo;d like to share a few lessons I learned while working on this project.</p><p><strong>1. Getting reasonable performance out of a deep learning model can be cheap and easy.</strong> This lesson is highlighted in <a href=https://course.fast.ai/ target=_blank rel=noopener>the introduction to the fast.ai course</a>: With a few lines of code (and the right data), it&rsquo;s easy to train reasonable models. It can also be cheap: I only used my laptop&rsquo;s GPU for most of the experiments, and relied on Kaggle&rsquo;s free notebook environment for experiments that I couldn&rsquo;t run locally. On my dataset, I found that training a bigger (ResNet50) model with Kaggle didn&rsquo;t improve accuracy in comparison to the smaller (ResNet18) model I could fit into my laptop&rsquo;s GPU memory. 
This would definitely vary by dataset, but the point is that reasonable performance doesn&rsquo;t necessarily require much human or computer work. In fact, much of the time I spent on modelling was for my own benefit, to better understand the material taught by fast.ai. Conceptually, I was pleased to discover that many things remained the same since my last foray into computer vision: Reasonable performance can be obtained by using established techniques and pre-trained architectures, while focusing on the data, the modelling pipeline, and augmentations. In my experience, this principle applies to many machine learning problems. This is summarised well by the directive from <a href=https://developers.google.com/machine-learning/guides/rules-of-ml/ target=_blank rel=noopener>Google&rsquo;s Rules of Machine Learning</a> to <em>&ldquo;do machine learning like the great engineer you are, not like the great machine learning expert you aren&rsquo;t.&rdquo;</em></p><p><strong>2. Building a Streamlit UI feels like magic.</strong> I&rsquo;ve heard about Streamlit years ago, but this was my first time using it. I was impressed with how quickly I could put together a useful app using only Python. I went from a vague idea to a pretty complete implementation in a day (with some additional tinkering in subsequent days). It really is a game changer for data scientists.</p><p><strong>3. Deploying a Streamlit app is a bit less magical.</strong> Streamlit Cloud seemed like a straightforward way to deploy Streamlit apps, but I ran into issues because I used a Conda environment. I managed to work around those issues, but it seems like the environment installed on Cloud isn&rsquo;t truly isolated: Judging by the logs, Streamlit Cloud reads the Conda file and installs the required packages into an existing environment. This results in weird error messages that are hard to debug. 
I also ran into memory issues, which seem to be un-debuggable with the information provided by Streamlit Cloud. Still, I decided to initially deploy the app to Streamlit Cloud&rsquo;s free tier and wrap it in an iframe for the RLS website. Given the steep increase in price from the free tier to the lowest paid tier (US$250 / month), it&rsquo;s likely I&rsquo;d switch to self-hosting if I run into more issues. This is a disappointing contrast to the magical experience of building the UI, but I hope that Streamlit Cloud would become easier to use with time.</p><p><strong>4. The fast.ai library is a great starting point, despite its quirks.</strong> Using fast.ai felt a bit like cheating, in the sense captured by xkcd&rsquo;s Real Programmers comic. Given the hype, it feels like it should be harder to build useful models – <em>real data scientists use PyTorch directly!</em> But no, in reality it makes sense to use the best tool for the job. And there&rsquo;s nothing wrong with something being easy or fast, as it lets you spend more time elsewhere. In the words of <a href=https://agilemanifesto.org/principles.html target=_blank rel=noopener>the principles behind the agile manifesto</a>: <em>&ldquo;Simplicity – the art of maximizing the amount of work not done – is essential.&rdquo;</em></p><figure class=white-bg><a href=real-programmers-xkcd.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/real-programmers-xkcd_hu9dfb6fbb197bd50ff3498ba40a1f618e_84499_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/real-programmers-xkcd_hu9dfb6fbb197bd50ff3498ba40a1f618e_84499_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/real-programmers-xkcd_hu9dfb6fbb197bd50ff3498ba40a1f618e_84499_720x0_resize_box_3.png 720w,
diff --git a/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html b/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
index 330d0797b..9a148d7ae 100644
--- a/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
+++ b/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>The mission matters: Moving to climate tech as a data scientist | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="Automattic,career,climate change,data science,environment,Orkestra,personal,politics,remote work,sustainability"><meta name=description content="Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The mission matters: Moving to climate tech as a data scientist"><meta property="og:description" content="Discussing my recent career move into climate tech as a way of 
doing more to help mitigate dangerous climate change."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"><meta property="og:image" content="https://yanirseroussi.com/dolphin-on-a-mission.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-06-06T00:00:00+00:00"><meta property="article:modified_time" content="2022-06-06T10:07:53+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/dolphin-on-a-mission.jpg"><meta name=twitter:title content="The mission matters: Moving to climate tech as a data scientist"><meta name=twitter:description content="Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The mission matters: Moving to climate tech as a data scientist","item":"https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The mission matters: Moving to climate tech as a data scientist","name":"The mission matters: Moving to climate tech as a data scientist","description":"Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change.","keywords":["Automattic","career","climate change","data science","environment","Orkestra","personal","politics","remote work","sustainability"],"articleBody":" So we are facing the most consequential fork in the road. 
If we continue as now, we are going to be irreparably going down a course of constant destruction, with much human pain and biodiversity loss. Or we can choose to go in the other direction, a path of reconstruction and regeneration, and at least diminish the negative impacts of climate change to something that is manageable.\nBut we can only choose it this decade. Our parents did not have this choice, because they didn’t have the capital, technologies and understanding. And for our children, it will be too late. So this is the decade and we are the generation.\nChristiana Figueres Interview on The Future we Choose (2020) Multiple factors contributed to my decision to leave Automattic last year. One factor was that the company’s mission to “democratize publishing and eCommerce” doesn’t resonate with me:1 First, publishing and eCommerce are already widely accessible. Second, despite decades of increased access to a wide variety of publication tools, global democracy is declining. Third, a corollary of the mission is hosting publications by the likes of News Corp Australia, an organisation that is harming Australian democracy according to former prime ministers from both sides of politics. Fourth, I believe that there are more pressing problems I can spend my time on.2\nOne such problem is the climate crisis. I was fortunate to have spent a small amount of time on it at Automattic, where I co-founded a sustainability employee group and led the company’s first purchases of carbon offsets and removals. However, this was a side gig.3\nWhen I left Automattic, I was hoping to get involved more directly in climate and environmental action. 
Having recently joined Orkestra – a company whose mission is “to power the world’s energy decision-making” – I figured it’s time to share some thoughts on the climate tech space, along with resources that others may find useful.\nDoing more with my climate obsession I’ve always cared about the environment, but my levels of activity in the area have fluctuated over the years. While it’s no excuse, I suppose that environmental issues often appear too intractable, especially with the growth of human population and of the percentage of humans who live in societies that require continuous economic growth to prosper. Collectively, we still haven’t figured out how to obtain prosperity without growth. Given the magnitude of the problems, even the most influential individuals can only make a relatively small impact on driving solutions.\nStill, being unable to do everything doesn’t mean one should do nothing, as the too-small-to-matter excuse can even be applied at the country level. For example, some people claim that given Australia’s small share of global emissions, it can’t play a significant role in addressing climate change. This conveniently neglects the fact that Australia has one of the highest per-capita carbon footprints in the world, and that it is a major exporter of fossil fuels. Clearly, Australia can do more to help achieve the collective goal of keeping global heating below truly dangerous levels. And doing it intelligently would help prosperity, as Australia is uniquely positioned to become a green energy superpower. 
Given the results of the last federal election, most Australians fall on the “do more” side of the debate.\nAnyway, I am not a country, but similar logic applies: I can do more as an individual, even though my personal emissions are negligible when compared to the daunting amount emitted by humanity as a whole.\nWhen it comes to climate action, a couple of key milestones for me were in 2015, when I became more aware of how I could divest from fossil fuels, and in early 2020, when the massive fires in Australia made me want to do something. Among other things, this led me to push for climate action within Automattic, as noted above. It also led me to – somewhat obsessively – consume quite a few resources on the topic. Honourable mentions go to Outrage + Optimism, Volts, TIL Climate, and My Climate Journey – many others are sprinkled throughout this article.\nOne outcome of the obsession is that I’m more aware of the impacts of climate change, environmental degradation, and government inaction. Massive fires? Climate change increases their frequency and severity. Global pandemics? Habitat loss and greater human-animal interaction increase their probability, while air pollution increases risk from respiratory infections. Widespread floods? Climate change increases flood impact and over-development on floodplains leads to avoidable suffering. More frequent coral bleaching? Increased emissions lead to ocean heatwaves and acidification, while reduced water quality and overfishing certainly don’t help ocean ecosystems.\nSo yeah, big problems. And one can always more/better to help. But it’s usually possible to also do less or worse. Therefore, I believe in doing more while cutting people slack, as suggested by Sami Grover:\nSo by all means, skip that next beef burger, or take a pass on that cheap flight to Cancún. But then ask yourself how you can magnify the impact of what you do. Are there campaigns or advocacy groups you can join? 
Can you talk to friends or family about the shifts you are making? Can you influence policy or practices at your place of work or study? Can you identify barriers to action that are preventing others from joining in?\nIn so doing, remember to cut yourself, and those around you, some slack. We are not each on an individual journey to slash our footprint to zero. We are on a collective mission to shift the only true footprint that matters: that of society as a whole.\nClimate tech and its intersections with data science When it comes to doing more, one path that a growing number of people seem to take is getting into climate tech. What is climate tech? Good question. To me, defining it is somewhat reminiscent of attempts to define data science, which I’ve tackled in posts from 2014 to 2018.4 In the same way that data science encompassed things that some people have been doing for decades, climate tech is giving a new name to existing activities. Broadly, I’d say that it’s work on technology to reverse, mitigate, and adapt to anthropogenic climate change.\nAnother parallel I see between data science and climate tech is that many things with tenuous connections to the field get lumped into it, in an attempt to capitalise on its trendiness. I think we’re past the peak of the data science hype, but there was a time when people who had only taken cursory looks at data rebranded as data scientists. Similarly, there are “climate tech” companies out there that may have a negative or neutral impact on fighting climate change. Personally, I’m also skeptical of grouping adaptation efforts under climate tech. For example, dealing with extreme weather events is needed even in a world with a stable climate, so I don’t think such work captures the intention behind climate tech (though it can be valuable).\nMost importantly, no matter how you define climate tech and data science, there is a need for data skills to develop technologies that address climate change. 
And this is where data scientists who are concerned about the climate (like me) can help make difference. In the words of Saul Griffith:\nIf you are a tech worker, stop making social media and delivery apps and make software that helps people use less energy, balances the grid, automates the design of solar and wind plants, makes public transit work better, and does other useful things to accelerate our transition to renewables.\nA structured approach to making career decisions I’m fortunate to have skills that are in demand in the current market. I’m also fortunate to be in a financial position that allows me to take unpaid time off. Put together, this means that I have a high degree of freedom to choose how I spend my time.\nIn the past, I’ve advocated for asking why about every career step. And indeed, I can explain the reasoning behind every point in my resume. Sometimes, a step is due to dumb luck, e.g., I discovered that I was a data scientist in 2012, the year Harvard Business Review deemed it the sexiest job of the 21st century5 – I didn’t plan to become a data scientist when I started my PhD in 2009. And sometimes, a step is more planned – I specifically targeted Automattic as one of the few established fully-remote companies that was hiring data scientists in 2017, as my goals included living outside major cities and having a job that I can hold for more than a year without wanting to run away.\nGiven that my current position presents more options than I’ve had in the past, I decided to have a look through 80,000 Hours. I’ve been aware of their work for years, but my vague impression was that they’re overly utilitarian. However, digging through their resources, I found that they emphasise the importance of personal fit and well-being, both when it comes to career paths and to problem areas. 
For example, they aren’t too pushy about choosing the problems that they find most pressing if it doesn’t align with one’s beliefs and values.\nThe 80,000 Hours website contains a wealth of well-reasoned articles. I found the self-guided course on career planning useful to go through, as it helped me apply their main ideas to my situation. While I don’t feel like it led to a major shift in my views and plans, having more structure and a richer terminology to think through my career decisions is helpful.\nThat said, one area where I diverge from the 80,000 Hours philosophy is in concern about far-future human extinction. They conclude that climate change is less recommended than other problems as the odds of it leading to human extinction are low. However, working in the climate space should alleviate human suffering in this century and reduce the extinction risks of nonhuman animals. Both of these are important to me, especially given the rich cultural lives of animals like whales and dolphins.\nWhales have cultures and massive brains, but perhaps you don’t care. How I ended up with Orkestra In retrospect, my ~4.5 years at Automattic could be divided into the pre-pandemic and pandemic periods. Pre-pandemic, I got to travel a few times a year to meet my colleagues in person. From the time the pandemic hit, this wasn’t an option. While I was lucky to be with a company that had already figured out how to work remotely, I found the complete lack of in-person interaction with my colleagues to be too isolating and monotonous.6 Together with the pandemic-era stressors that affected pretty much everything, I felt that Automattic had become a less pleasant place to work.\nWhen reflecting on my decision to leave, I realised that I had experienced two of the three dimensions of occupational burnout: I was high on exhaustion and cynicism, but felt like I still had professional efficacy. 
As I take pride in doing good work, I was concerned about losing my sense of efficacy and burning out on all three fronts. It was definitely time to leave, especially since burnout is seen by researchers as “a sign of a major dysfunction within an organization [that] says more about the workplace than it does about the employees”.\nGiven my recent burnout experience, I was reluctant to jump into a full-time job. I took some time to relax, and worked on side projects like getting my website off WordPress.com and developing a web app for fish identification. Concurrently, I was also looking to learn more about the climate tech space. I was already a member of the Climate Action Tech community and a consumer of various other climate-related resources, but my search had also led me to places like the Climate People agency and the Work on Climate community. Looking through these resources became a part of my routine, and it was on the Climate Action Tech Slack that I saw a short message by Chris Cooper, advertising open positions at Orkestra (then called Vippy).\nFrom the time I decided to enquire, things moved quickly. By early February, we agreed to engage in a short-term contract where I would do data science work for three days per week. This was largely because I wanted to keep my options open and avoid over-committing myself, especially after the burnout I experienced at Automattic.\nWhile the original plan was to use the contract as a trial towards full-time employment, I found that I enjoyed working only three focused days on Orkestra. 
It was a refreshing change from the sort of work I was doing at Automattic – perhaps a similar feeling to that of a former Automattic employee who moved to DuckDuckGo: “the big shift was to an all-business-low-drama environment, meaning that my job was cognitively harder but emotionally easier”.\nTherefore, while Orkestra would have preferred for me to come on board as a full-timer, we recently agreed that I join as a 70%-time employee, which on most weeks means three long workdays. I think it’s a win-win, as human productivity isn’t a linear function of time spent working – with 70% I’m likely to produce more than I would in the same amount of time as a full-timer. And I have plenty of time off work, which reduces risk factors associated with excessive time dealing with rectangles.\nIn general, I see this sort of flexibility as the future of work in many professions. The forty-hour workweek isn’t sacred – Keynes predicted its demise almost a hundred years ago. With remote and hybrid work becoming the norm in jobs that don’t require in-person presence, employers calling themselves flexible should go beyond remote options.\nA question I get a lot is what I do in my days off work. I guess it’s pretty much the same stuff people do on shorter weekends, but with more time to spare. For example, one area that I’ve had more time to invest in is my involvement with the Reef Life Survey Foundation – I’m helping on several trips and with some technical work. In general, if the Orkestra arrangement sticks in the long term, it should also give me time for open source contributions and skill development. As I noted previously, there’s just so much interesting stuff happening in data science that no single job can cover it all – the FOMO is real! With extra time in the week, I can fight the FOMO more effectively, while still having enough off-rectangle time.\nFinally, what about the work I do with Orkestra? 
I can’t share much yet, but I can say that I’m learning a lot about the energy space. I hope to post more about it in the future, so please stay tuned.\nRectangles are useful, but we also need time without them. In addition to the mission, Automattic CEO Matt Mullenweg has shared his vision of making Automattic the Berkshire Hathaway of the internet, a goal that I find even less inspiring. ↩︎\nWhile I was aware of the mission when I joined Automattic in 2017, it wasn’t a critical criterion for me. Over the years, I’ve become more conscious of the role online platforms play in destabilising societies. I now believe that it’s important for platforms to acknowledge their responsibilities and delegate power to external regulators, e.g., as Facebook is doing with their Oversight Board (which is still an imperfect solution). ↩︎\nIt’s also an open question whether it’s possible to offset things like the harmful work of the Murdoch press. ↩︎\nI still like the 2018 definition, so hopefully I’m done with defining data science. ↩︎\nAccording to a recent study, data science is seen as an incredibly boring job. Not sexy at all. ↩︎\nDespite this, I wasn’t particularly looking forward to going back to frequent long-haul flights – it was an aspect of Automattic work I never liked. This made the prospect of post-pandemic work with Automattic less appealing, even without considering the climate impact of so much flying. 
↩︎\n","wordCount":"2677","inLanguage":"en","image":"https://yanirseroussi.com/dolphin-on-a-mission.jpg","datePublished":"2022-06-06T00:00:00Z","dateModified":"2022-06-06T10:07:53+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a 
href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The mission matters: Moving to climate tech as a data scientist</h1><div class=post-meta><span title='2022-06-06 00:00:00 +0000 UTC'>June 6, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-06-06-the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg 3238w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg alt width=3238 height=1821></figure><div class=post-content><blockquote><p><p>So we are facing the most consequential fork in the road. If we continue as now, we are going to be irreparably going down a course of constant destruction, with much human pain and biodiversity loss. Or we can choose to go in the other direction, a path of reconstruction and regeneration, and at least diminish the negative impacts of climate change to something that is manageable.</p><p>But we can only choose it this decade. Our parents did not have this choice, because they didn’t have the capital, technologies and understanding. And for our children, it will be too late. So this is the decade and we are the generation.</p></p><footer><strong>Christiana Figueres</strong>
+<meta name=keywords content="Automattic,career,climate change,data science,environment,Orkestra,personal,politics,remote work,sustainability"><meta name=description content="Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="The mission matters: Moving to climate tech as a data scientist"><meta property="og:description" content="Discussing my recent career move into climate tech as a way of 
doing more to help mitigate dangerous climate change."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"><meta property="og:image" content="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-06-06T00:00:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg"><meta name=twitter:title content="The mission matters: Moving to climate tech as a data scientist"><meta name=twitter:description content="Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"The mission matters: Moving to climate tech as a data scientist","item":"https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"The mission matters: Moving to climate tech as a data scientist","name":"The mission matters: Moving to climate tech as a data scientist","description":"Discussing my recent career move into climate tech as a way of doing more to help mitigate dangerous climate change.","keywords":["Automattic","career","climate change","data science","environment","Orkestra","personal","politics","remote 
work","sustainability"],"articleBody":" So we are facing the most consequential fork in the road. If we continue as now, we are going to be irreparably going down a course of constant destruction, with much human pain and biodiversity loss. Or we can choose to go in the other direction, a path of reconstruction and regeneration, and at least diminish the negative impacts of climate change to something that is manageable.\nBut we can only choose it this decade. Our parents did not have this choice, because they didn’t have the capital, technologies and understanding. And for our children, it will be too late. So this is the decade and we are the generation.\nChristiana Figueres Interview on The Future we Choose (2020) Multiple factors contributed to my decision to leave Automattic last year. One factor was that the company’s mission to “democratize publishing and eCommerce” doesn’t resonate with me:1 First, publishing and eCommerce are already widely accessible. Second, despite decades of increased access to a wide variety of publication tools, global democracy is declining. Third, a corollary of the mission is hosting publications by the likes of News Corp Australia, an organisation that is harming Australian democracy according to former prime ministers from both sides of politics. Fourth, I believe that there are more pressing problems I can spend my time on.2\nOne such problem is the climate crisis. I was fortunate to have spent a small amount of time on it at Automattic, where I co-founded a sustainability employee group and led the company’s first purchases of carbon offsets and removals. However, this was a side gig.3\nWhen I left Automattic, I was hoping to get involved more directly in climate and environmental action. 
Having recently joined Orkestra – a company whose mission is “to power the world’s energy decision-making” – I figured it’s time to share some thoughts on the climate tech space, along with resources that others may find useful.\nDoing more with my climate obsession I’ve always cared about the environment, but my levels of activity in the area have fluctuated over the years. While it’s no excuse, I suppose that environmental issues often appear too intractable, especially with the growth of human population and of the percentage of humans who live in societies that require continuous economic growth to prosper. Collectively, we still haven’t figured out how to obtain prosperity without growth. Given the magnitude of the problems, even the most influential individuals can only make a relatively small impact on driving solutions.\nStill, being unable to do everything doesn’t mean one should do nothing, as the too-small-to-matter excuse can even be applied at the country level. For example, some people claim that given Australia’s small share of global emissions, it can’t play a significant role in addressing climate change. This conveniently neglects the fact that Australia has one of the highest per-capita carbon footprints in the world, and that it is a major exporter of fossil fuels. Clearly, Australia can do more to help achieve the collective goal of keeping global heating below truly dangerous levels. And doing it intelligently would help prosperity, as Australia is uniquely positioned to become a green energy superpower. 
Given the results of the last federal election, most Australians fall on the “do more” side of the debate.\nAnyway, I am not a country, but similar logic applies: I can do more as an individual, even though my personal emissions are negligible when compared to the daunting amount emitted by humanity as a whole.\nWhen it comes to climate action, a couple of key milestones for me were in 2015, when I became more aware of how I could divest from fossil fuels, and in early 2020, when the massive fires in Australia made me want to do something. Among other things, this led me to push for climate action within Automattic, as noted above. It also led me to – somewhat obsessively – consume quite a few resources on the topic. Honourable mentions go to Outrage + Optimism, Volts, TIL Climate, and My Climate Journey – many others are sprinkled throughout this article.\nOne outcome of the obsession is that I’m more aware of the impacts of climate change, environmental degradation, and government inaction. Massive fires? Climate change increases their frequency and severity. Global pandemics? Habitat loss and greater human-animal interaction increase their probability, while air pollution increases risk from respiratory infections. Widespread floods? Climate change increases flood impact and over-development on floodplains leads to avoidable suffering. More frequent coral bleaching? Increased emissions lead to ocean heatwaves and acidification, while reduced water quality and overfishing certainly don’t help ocean ecosystems.\nSo yeah, big problems. And one can always do more/better to help. But it’s usually possible to also do less or worse. Therefore, I believe in doing more while cutting people slack, as suggested by Sami Grover:\nSo by all means, skip that next beef burger, or take a pass on that cheap flight to Cancún. But then ask yourself how you can magnify the impact of what you do. Are there campaigns or advocacy groups you can join? 
Can you talk to friends or family about the shifts you are making? Can you influence policy or practices at your place of work or study? Can you identify barriers to action that are preventing others from joining in?\nIn so doing, remember to cut yourself, and those around you, some slack. We are not each on an individual journey to slash our footprint to zero. We are on a collective mission to shift the only true footprint that matters: that of society as a whole.\nClimate tech and its intersections with data science When it comes to doing more, one path that a growing number of people seem to take is getting into climate tech. What is climate tech? Good question. To me, defining it is somewhat reminiscent of attempts to define data science, which I’ve tackled in posts from 2014 to 2018.4 In the same way that data science encompassed things that some people have been doing for decades, climate tech is giving a new name to existing activities. Broadly, I’d say that it’s work on technology to reverse, mitigate, and adapt to anthropogenic climate change.\nAnother parallel I see between data science and climate tech is that many things with tenuous connections to the field get lumped into it, in an attempt to capitalise on its trendiness. I think we’re past the peak of the data science hype, but there was a time when people who had only taken cursory looks at data rebranded as data scientists. Similarly, there are “climate tech” companies out there that may have a negative or neutral impact on fighting climate change. Personally, I’m also skeptical of grouping adaptation efforts under climate tech. For example, dealing with extreme weather events is needed even in a world with a stable climate, so I don’t think such work captures the intention behind climate tech (though it can be valuable).\nMost importantly, no matter how you define climate tech and data science, there is a need for data skills to develop technologies that address climate change. 
And this is where data scientists who are concerned about the climate (like me) can help make a difference. In the words of Saul Griffith:\nIf you are a tech worker, stop making social media and delivery apps and make software that helps people use less energy, balances the grid, automates the design of solar and wind plants, makes public transit work better, and does other useful things to accelerate our transition to renewables.\nA structured approach to making career decisions I’m fortunate to have skills that are in demand in the current market. I’m also fortunate to be in a financial position that allows me to take unpaid time off. Put together, this means that I have a high degree of freedom to choose how I spend my time.\nIn the past, I’ve advocated for asking why about every career step. And indeed, I can explain the reasoning behind every point in my resume. Sometimes, a step is due to dumb luck, e.g., I discovered that I was a data scientist in 2012, the year Harvard Business Review deemed it the sexiest job of the 21st century5 – I didn’t plan to become a data scientist when I started my PhD in 2009. And sometimes, a step is more planned – I specifically targeted Automattic as one of the few established fully-remote companies that was hiring data scientists in 2017, as my goals included living outside major cities and having a job that I can hold for more than a year without wanting to run away.\nGiven that my current position presents more options than I’ve had in the past, I decided to have a look through 80,000 Hours. I’ve been aware of their work for years, but my vague impression was that they’re overly utilitarian. However, digging through their resources, I found that they emphasise the importance of personal fit and well-being, both when it comes to career paths and to problem areas. 
For example, they aren’t too pushy about choosing the problems that they find most pressing if it doesn’t align with one’s beliefs and values.\nThe 80,000 Hours website contains a wealth of well-reasoned articles. I found the self-guided course on career planning useful to go through, as it helped me apply their main ideas to my situation. While I don’t feel like it led to a major shift in my views and plans, having more structure and a richer terminology to think through my career decisions is helpful.\nThat said, one area where I diverge from the 80,000 Hours philosophy is in concern about far-future human extinction. They conclude that climate change is less recommended than other problems as the odds of it leading to human extinction are low. However, working in the climate space should alleviate human suffering in this century and reduce the extinction risks of nonhuman animals. Both of these are important to me, especially given the rich cultural lives of animals like whales and dolphins.\nWhales have cultures and massive brains, but perhaps you don’t care. How I ended up with Orkestra In retrospect, my ~4.5 years at Automattic could be divided into the pre-pandemic and pandemic periods. Pre-pandemic, I got to travel a few times a year to meet my colleagues in person. From the time the pandemic hit, this wasn’t an option. While I was lucky to be with a company that had already figured out how to work remotely, I found the complete lack of in-person interaction with my colleagues to be too isolating and monotonous.6 Together with the pandemic-era stressors that affected pretty much everything, I felt that Automattic had become a less pleasant place to work.\nWhen reflecting on my decision to leave, I realised that I had experienced two of the three dimensions of occupational burnout: I was high on exhaustion and cynicism, but felt like I still had professional efficacy. 
As I take pride in doing good work, I was concerned about losing my sense of efficacy and burning out on all three fronts. It was definitely time to leave, especially since burnout is seen by researchers as “a sign of a major dysfunction within an organization [that] says more about the workplace than it does about the employees”.\nGiven my recent burnout experience, I was reluctant to jump into a full-time job. I took some time to relax, and worked on side projects like getting my website off WordPress.com and developing a web app for fish identification. Concurrently, I was also looking to learn more about the climate tech space. I was already a member of the Climate Action Tech community and a consumer of various other climate-related resources, but my search had also led me to places like the Climate People agency and the Work on Climate community. Looking through these resources became a part of my routine, and it was on the Climate Action Tech Slack that I saw a short message by Chris Cooper, advertising open positions at Orkestra (then called Vippy).\nFrom the time I decided to enquire, things moved quickly. By early February, we agreed to engage in a short-term contract where I would do data science work for three days per week. This was largely because I wanted to keep my options open and avoid over-committing myself, especially after the burnout I experienced at Automattic.\nWhile the original plan was to use the contract as a trial towards full-time employment, I found that I enjoyed working only three focused days on Orkestra. 
It was a refreshing change from the sort of work I was doing at Automattic – perhaps a similar feeling to that of a former Automattic employee who moved to DuckDuckGo: “the big shift was to an all-business-low-drama environment, meaning that my job was cognitively harder but emotionally easier”.\nTherefore, while Orkestra would have preferred for me to come on board as a full-timer, we recently agreed that I join as a 70%-time employee, which on most weeks means three long workdays. I think it’s a win-win, as human productivity isn’t a linear function of time spent working – with 70% I’m likely to produce more than I would in the same amount of time as a full-timer. And I have plenty of time off work, which reduces risk factors associated with excessive time dealing with rectangles.\nIn general, I see this sort of flexibility as the future of work in many professions. The forty-hour workweek isn’t sacred – Keynes predicted its demise almost a hundred years ago. With remote and hybrid work becoming the norm in jobs that don’t require in-person presence, employers calling themselves flexible should go beyond remote options.\nA question I get a lot is what I do in my days off work. I guess it’s pretty much the same stuff people do on shorter weekends, but with more time to spare. For example, one area that I’ve had more time to invest in is my involvement with the Reef Life Survey Foundation – I’m helping on several trips and with some technical work. In general, if the Orkestra arrangement sticks in the long term, it should also give me time for open source contributions and skill development. As I noted previously, there’s just so much interesting stuff happening in data science that no single job can cover it all – the FOMO is real! With extra time in the week, I can fight the FOMO more effectively, while still having enough off-rectangle time.\nFinally, what about the work I do with Orkestra? 
I can’t share much yet, but I can say that I’m learning a lot about the energy space. I hope to post more about it in the future, so please stay tuned.\nRectangles are useful, but we also need time without them. In addition to the mission, Automattic CEO Matt Mullenweg has shared his vision of making Automattic the Berkshire Hathaway of the internet, a goal that I find even less inspiring. ↩︎\nWhile I was aware of the mission when I joined Automattic in 2017, it wasn’t a critical criterion for me. Over the years, I’ve become more conscious of the role online platforms play in destabilising societies. I now believe that it’s important for platforms to acknowledge their responsibilities and delegate power to external regulators, e.g., as Facebook is doing with their Oversight Board (which is still an imperfect solution). ↩︎\nIt’s also an open question whether it’s possible to offset things like the harmful work of the Murdoch press. ↩︎\nI still like the 2018 definition, so hopefully I’m done with defining data science. ↩︎\nAccording to a recent study, data science is seen as an incredibly boring job. Not sexy at all. ↩︎\nDespite this, I wasn’t particularly looking forward to going back to frequent long-haul flights – it was an aspect of Automattic work I never liked. This made the prospect of post-pandemic work with Automattic less appealing, even without considering the climate impact of so much flying. 
↩︎\n","wordCount":"2677","inLanguage":"en","image":"https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg","datePublished":"2022-06-06T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" 
y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">The mission matters: Moving to climate tech as a data scientist</h1><div class=post-meta><span title='2022-06-06 00:00:00 +0000 UTC'>June 6, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-06-06-the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission_hu2278fbb0a04afae4f432aacc3e29a944_976086_1500x0_resize_q75_box.jpg 1500w 
,https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg 3238w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/dolphin-on-a-mission.jpg alt width=3238 height=1821></figure><div class=post-content><blockquote><p><p>So we are facing the most consequential fork in the road. If we continue as now, we are going to be irreparably going down a course of constant destruction, with much human pain and biodiversity loss. Or we can choose to go in the other direction, a path of reconstruction and regeneration, and at least diminish the negative impacts of climate change to something that is manageable.</p><p>But we can only choose it this decade. Our parents did not have this choice, because they didn’t have the capital, technologies and understanding. And for our children, it will be too late. So this is the decade and we are the generation.</p></p><footer><strong>Christiana Figueres</strong>
 <cite><a href=https://www.theguardian.com/environment/2020/feb/15/christiana-figueres-climate-emergency-this-is-the-decade-the-future-we-choose title=https://www.theguardian.com/environment/2020/feb/15/christiana-figueres-climate-emergency-this-is-the-decade-the-future-we-choose target=_blank rel=noopener>Interview on The Future we Choose (2020)</a></cite></footer></blockquote><p>Multiple factors contributed to my decision to leave Automattic last year. One factor was that <a href=https://transparency.automattic.com/ target=_blank rel=noopener>the company&rsquo;s mission to <em>&ldquo;democratize publishing and eCommerce&rdquo;</em></a> doesn&rsquo;t resonate with me:<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> First, publishing and eCommerce are already widely accessible. Second, despite decades of increased access to a wide variety of publication tools, <a href=https://www.economist.com/graphic-detail/2022/02/09/a-new-low-for-global-democracy target=_blank rel=noopener>global democracy is declining</a>. Third, a corollary of the mission is <a href=https://wpvip.com/case-studies/unlocking-power-and-efficiency-for-news-corp-australia/ target=_blank rel=noopener>hosting publications by the likes of News Corp Australia</a>, an organisation <a href=https://www.theguardian.com/media/commentisfree/2021/oct/29/news-corp-opponents-team-up-to-fight-cancer-on-democracy target=_blank rel=noopener>that is harming Australian democracy according to former prime ministers from both sides of politics</a>. Fourth, I believe that there are more pressing problems I can spend my time on.<sup id=fnref:2><a href=#fn:2 class=footnote-ref role=doc-noteref>2</a></sup></p><p>One such problem is the climate crisis. 
I was fortunate to have spent a small amount of time on it at Automattic, where <a href=https://wordpress.com/blog/2020/09/21/toward-zero-reducing-and-offsetting-our-data-center-power-emissions/ target=_blank rel=noopener>I co-founded a sustainability employee group and led the company&rsquo;s first purchases of carbon offsets and removals</a>. However, this was a side gig.<sup id=fnref:3><a href=#fn:3 class=footnote-ref role=doc-noteref>3</a></sup></p><p>When I left Automattic, I was hoping to get involved more directly in climate and environmental action. Having recently joined <a href=https://www.orkestra.energy/ target=_blank rel=noopener>Orkestra</a> – a company whose mission is <em>&ldquo;to power the world&rsquo;s energy decision-making&rdquo;</em> – I figured it&rsquo;s time to share some thoughts on the climate tech space, along with resources that others may find useful.</p><h2 id=doing-more-with-my-climate-obsession>Doing more with my climate obsession<a hidden class=anchor aria-hidden=true href=#doing-more-with-my-climate-obsession>#</a></h2><p>I&rsquo;ve always cared about the environment, but my levels of activity in the area have fluctuated over the years. While it&rsquo;s no excuse, I suppose that environmental issues often appear too intractable, especially with the growth of human population and of the percentage of humans who live in societies that require continuous economic growth to prosper. Collectively, we still haven&rsquo;t figured out how to obtain <a href=https://en.wikipedia.org/wiki/Prosperity_Without_Growth target=_blank rel=noopener>prosperity without growth</a>. Given the magnitude of the problems, even the most influential individuals can only make a relatively small impact on driving solutions.</p><p>Still, being unable to do <em>everything</em> doesn&rsquo;t mean one should do <em>nothing</em>, as the too-small-to-matter excuse can even be applied at the country level. 
For example, <a href=https://theconversation.com/how-to-answer-the-argument-that-australias-emissions-are-too-small-to-make-a-difference-118825 target=_blank rel=noopener>some people claim that given Australia&rsquo;s small share of global emissions, it can&rsquo;t play a significant role in addressing climate change</a>. This conveniently neglects the fact that Australia has one of the highest per-capita carbon footprints in the world, and that <a href=https://australiainstitute.org.au/post/new-analysis-australia-ranks-third-for-fossil-fuel-export/ target=_blank rel=noopener>it is a major exporter of fossil fuels</a>. Clearly, Australia can do more to help achieve the collective goal of keeping global heating below truly dangerous levels. And doing it intelligently would help prosperity, as <a href=https://www.blackincbooks.com.au/books/superpower target=_blank rel=noopener>Australia is uniquely positioned to become a green energy superpower</a>. Given <a href=https://en.wikipedia.org/wiki/2022_Australian_federal_election target=_blank rel=noopener>the results of the last federal election</a>, most Australians fall on the &ldquo;do more&rdquo; side of the debate.</p><p>Anyway, I am not a country, but similar logic applies: I can do more as an individual, even though my personal emissions are negligible when compared to the daunting amount emitted by humanity as a whole.</p><p>When it comes to climate action, a couple of key milestones for me were in 2015, when <a href=https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/>I became more aware of how I could divest from fossil fuels</a>, and in early 2020, when the massive fires in Australia made me want to do <em>something</em>. Among other things, this led me to push for climate action within Automattic, as noted above. It also led me to – somewhat obsessively – consume quite a few resources on the topic. 
Honourable mentions go to <a href=https://www.outrageandoptimism.org/ target=_blank rel=noopener>Outrage + Optimism</a>, <a href=https://www.volts.wtf/ target=_blank rel=noopener>Volts</a>, <a href=https://climate.mit.edu/tilclimate-podcast target=_blank rel=noopener>TIL Climate</a>, and <a href=https://www.mcjcollective.com/ target=_blank rel=noopener>My Climate Journey</a> – many others are sprinkled throughout this article.</p><p>One outcome of the obsession is that I&rsquo;m more aware of the impacts of climate change, environmental degradation, and government inaction. Massive fires? <a href=https://www.climatecouncil.org.au/not-normal-climate-change-bushfire-web/ target=_blank rel=noopener>Climate change increases their frequency and severity</a>. Global pandemics? <a href=https://www.pnas.org/doi/10.1073/pnas.2023540118 target=_blank rel=noopener>Habitat loss and greater human-animal interaction increase their probability</a>, while <a href=https://www.hsph.harvard.edu/c-change/subtopics/coronavirus-and-pollution/ target=_blank rel=noopener>air pollution increases risk from respiratory infections</a>. Widespread floods? <a href=https://www.ipcc.ch/2021/08/09/ar6-wg1-20210809-pr/ target=_blank rel=noopener>Climate change increases flood impact</a> and <a href=https://www.uqp.com.au/books/a-river-with-a-city-problem-a-history-of-brisbane-floods target=_blank rel=noopener>over-development on floodplains leads to avoidable suffering</a>. More frequent coral bleaching? <a href=https://en.wikipedia.org/wiki/Coral_bleaching#Triggers target=_blank rel=noopener>Increased emissions lead to ocean heatwaves and acidification</a>, while <a href=https://www.barrierreef.org/the-reef/threats/poor-water-quality target=_blank rel=noopener>reduced water quality and overfishing certainly don&rsquo;t help ocean ecosystems</a>.</p><p>So yeah, big problems. And one can always do more/better to help. But it&rsquo;s usually possible to also do less or worse. 
Therefore, I believe in doing more while cutting people slack, <a href=https://undark.org/2021/09/09/the-messy-truth-about-carbon-footprints/ target=_blank rel=noopener>as suggested by Sami Grover</a>:</p><blockquote><p>So by all means, skip that next beef burger, or take a pass on that cheap flight to Cancún. But then ask yourself how you can magnify the impact of what you do. Are there campaigns or advocacy groups you can join? Can you talk to friends or family about the shifts you are making? Can you influence policy or practices at your place of work or study? Can you identify barriers to action that are preventing others from joining in?</p><p>In so doing, remember to cut yourself, and those around you, some slack. We are not each on an individual journey to slash our footprint to zero. We are on a collective mission to shift the only true footprint that matters: that of society as a whole.</p></blockquote><h2 id=climate-tech-and-its-intersections-with-data-science>Climate tech and its intersections with data science<a hidden class=anchor aria-hidden=true href=#climate-tech-and-its-intersections-with-data-science>#</a></h2><p>When it comes to doing more, one path that a growing number of people seem to take is getting into climate tech. What is climate tech? <a href=https://workonclimate.org/2022/03/25/climate-workforce-insights/ target=_blank rel=noopener>Good question</a>. To me, defining it is somewhat reminiscent of attempts to define data science, which I&rsquo;ve tackled in posts from <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>2014</a> to <a href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/>2018</a>.<sup id=fnref:4><a href=#fn:4 class=footnote-ref role=doc-noteref>4</a></sup> In the same way that data science encompassed things that some people have been doing for decades, climate tech is giving a new name to existing activities. 
Broadly, I&rsquo;d say that it&rsquo;s work on technology to reverse, mitigate, and adapt to anthropogenic climate change.</p><p>Another parallel I see between data science and climate tech is that many things with tenuous connections to the field get lumped into it, in an attempt to capitalise on its trendiness. I think we&rsquo;re past the peak of the data science hype, but there was a time when people who had only taken cursory looks at data rebranded as data scientists. Similarly, there are &ldquo;climate tech&rdquo; companies out there that may have a negative or neutral impact on fighting climate change. Personally, I&rsquo;m also skeptical of grouping adaptation efforts under climate tech. For example, dealing with extreme weather events is needed even in a world with a stable climate, so I don&rsquo;t think such work captures the intention behind climate tech (though it can be valuable).</p><p>Most importantly, no matter how you define climate tech and data science, <strong>there is a need for data skills to develop technologies that address climate change.</strong> And this is where data scientists who are concerned about the climate (like me) can help make a difference. In <a href=https://mitpress.mit.edu/books/electrify target=_blank rel=noopener>the words of Saul Griffith</a>:</p><blockquote><p>If you are a tech worker, stop making social media and delivery apps and make software that helps people use less energy, balances the grid, automates the design of solar and wind plants, makes public transit work better, and does other useful things to accelerate our transition to renewables.</p></blockquote><h2 id=a-structured-approach-to-making-career-decisions>A structured approach to making career decisions<a hidden class=anchor aria-hidden=true href=#a-structured-approach-to-making-career-decisions>#</a></h2><p>I&rsquo;m fortunate to have skills that are in demand in the current market. 
I&rsquo;m also fortunate to be in a financial position that allows me to take unpaid time off. Put together, this means that I have a high degree of freedom to choose how I spend my time.</p><p>In the past, <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>I&rsquo;ve advocated for asking <em>why</em> about every career step</a>. And indeed, I can explain the reasoning behind every point in my resume. Sometimes, a step is due to dumb luck, e.g., I discovered that I was a data scientist in 2012, the year Harvard Business Review <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>deemed it</a> <em>the sexiest job of the 21st century</em><sup id=fnref:5><a href=#fn:5 class=footnote-ref role=doc-noteref>5</a></sup> – I didn&rsquo;t plan to become a data scientist when I started my PhD in 2009. And sometimes, a step is more planned – I specifically targeted Automattic as one of the few established fully-remote companies that was hiring data scientists in 2017, as my goals included living outside major cities and having a job that I can hold for more than a year without wanting to run away.</p><p>Given that my current position presents more options than I&rsquo;ve had in the past, I decided to <a href=https://80000hours.org/ target=_blank rel=noopener>have a look through 80,000 Hours</a>. I&rsquo;ve been aware of their work for years, but my vague impression was that they&rsquo;re overly utilitarian. However, digging through their resources, I found that they emphasise the importance of personal fit and well-being, both when it comes to career paths and to problem areas. For example, they aren&rsquo;t too pushy about choosing the problems that they find most pressing if it doesn&rsquo;t align with one&rsquo;s beliefs and values.</p><p>The 80,000 Hours website contains a wealth of well-reasoned articles. 
I found <a href=https://80000hours.org/career-planning/ target=_blank rel=noopener>the self-guided course on career planning</a> useful to go through, as it helped me apply their main ideas to my situation. While I don&rsquo;t feel like it led to a major shift in my views and plans, having more structure and a richer terminology to think through my career decisions is helpful.</p><p>That said, one area where I diverge from the 80,000 Hours philosophy is in concern about far-future human extinction. They conclude that <a href=https://80000hours.org/problem-profiles/climate-change/ target=_blank rel=noopener>climate change is less recommended than other problems</a> as the odds of it leading to human extinction are low. However, working in the climate space should alleviate human suffering in this century and reduce the extinction risks of nonhuman animals. Both of these are important to me, especially given <a href=https://press.uchicago.edu/ucp/books/book/chicago/C/bo12789830.html target=_blank rel=noopener>the rich cultural lives of animals like whales and dolphins</a>.</p><figure><a href=https://poorlydrawnlines.com/comic/the-whales/ target=_blank rel=noopener><img sizes="(min-width: 768px) 700px,
 100vw" srcset="https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/the-whales-poorlydrawnlines_hu266a6c9caa5760a80789194dfbd8f4db_167546_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/the-whales-poorlydrawnlines_hu266a6c9caa5760a80789194dfbd8f4db_167546_480x0_resize_box_3.png 480w,
diff --git a/2022/09/12/causal-machine-learning-book-draft-review/index.html b/2022/09/12/causal-machine-learning-book-draft-review/index.html
index e356e3a04..b01ca553b 100644
--- a/2022/09/12/causal-machine-learning-book-draft-review/index.html
+++ b/2022/09/12/causal-machine-learning-book-draft-review/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Causal Machine Learning is off to a good start, despite some issues | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,causal inference,data science,machine learning"><meta name=description content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Causal Machine Learning is off to a good start, despite some issues"><meta property="og:description" content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"><meta property="og:image" content="https://yanirseroussi.com/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-09-12T02:45:00+00:00"><meta property="article:modified_time" content="2022-09-12T12:56:22+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png"><meta name=twitter:title content="Causal Machine Learning is off to a good start, despite some issues"><meta name=twitter:description content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Causal Machine Learning is off to a good start, despite some issues","item":"https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Causal Machine Learning is off to a good start, despite some issues","name":"Causal Machine Learning is off to a good start, despite some issues","description":"Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness.","keywords":["artificial intelligence","causal inference","data science","machine learning"],"articleBody":"I was recently given a free eBook copy of the MEAP of Causal Machine Learning. MEAP stands for Manning Early Access Program, where books are published one chapter at a time. 
While the current version could use better copyediting and proofreading, I’m keen on reading more of the book as it becomes available.\nCausal Machine Learning addresses a gap in the causal inference literature: While much has been published on the topic, putting the theory to practice in the real world can be challenging. For example, even though I considered Causal Inference: What If to be the most practical book I’ve read on the topic, I haven’t used much of its content directly. This is partly due to my focus on other areas, e.g., online experimentation and the energy space. But it is also due to the availability of sample code and mature packages that can be quickly adapted to my needs. The book aims to address the latter through a code-first approach that utilises Python packages such as Pyro, pgmpy, and DoWhy.\nDespite the code-first promise, the book feels a bit slow at getting into the more exciting content. I couldn’t help but compare it to the fast.ai book, which first shows how to build and deploy a custom image classifier, and only then goes into unpacking how it all works. However, despite the verbosity of the first two chapters, by the third chapter things start to get more interesting. At the time of this writing, only chapters 1-3 are available, but upcoming chapters look promising based on the table of contents.\nWhile lacking a production-ready example early in the book is a minor concern, I found the many grammatical errors more distracting. Even though a MEAP is essentially a draft, I think its proofreading level should be higher than that of a blog post.1 This is especially the case for paid content published by an organisation that cares enough to have contacted me to promote the book. As Steven Pinker says in the intro to The Sense of Style:\nStyle earns trust. 
If readers can see that a writer cares about consistency and accuracy in her prose, they will be reassured that the writer cares about those virtues in conduct they cannot see as easily. Here is how one technology executive explains why he rejects job applications filled with errors of grammar and punctuation: “If it takes someone more than 20 years to notice how to properly use it’s, then that’s not a learning curve I’m comfortable with.” And if that isn’t enough to get you to brush up your prose, consider the discovery of the dating site OkCupid that sloppy grammar and spelling in a profile are “huge turn-offs.” As one client said, “If you’re trying to date a woman, I don’t expect flowery Jane Austen prose. But aren’t you trying to put your best foot forward?”\nAnother source of distraction is the choice of variables for some of the toy examples. For instance, one model of blood type inheritance confuses the phenotype and genotype, claiming that “knowing your grandfather’s [blood] type has no benefit in predicting your type once we know your father’s”. However, knowing the grandparents’ blood types can help predict the grandchild’s blood type even when the parent’s blood type is known. The toy example would work if it focused on genotypes, not on the common meaning of blood type as the phenotype (i.e., observable traits). See pages 58-60 in Probabilistic Graphical Models: Principles and Techniques for a less casual presentation of a similar example.\nWhen observing parent phenotypes (ABO blood types) without genotypes, grandparent phenotypes are informative.\nSource: Wikipedia – ABO blood group system (retrieved on 2022-09-11). I also struggle with overly-casual statements like this one:\nSuppose we were interested in modeling the relationship between altitude and temperature. The two are clearly correlated; the higher up you go, the colder it gets. 
However, you know temperature doesn’t cause altitude, otherwise heating the air within a city would cause the city to fly. Altitude is the cause, and temperature is the effect.\nIn fact, heating the air within a city would cause the heated air to rise. And extremely high heat can melt a city and the land it’s on, thereby causing a reduction in its altitude.\nWhile this may seem like nitpicking, ill-defined causal graphs are a serious problem. One of my favourite papers on the topic is Does water kill? A call for less casual causal inferences, which argues that \"[while] it is impossible to provide an absolutely precise definition of a version of treatment […] specification of versions of treatment is required only until no meaningful vagueness remains\". However, “declaring a version of treatment sufficiently well-defined is a matter of agreement among experts based on the available substantive knowledge” because we don’t have an objective way of determining that treatments are well-defined. In line with this thinking, the book may benefit from reducing the variety of examples in favour of a handful of small datasets that are more well-defined and defensible.\nDespite these shortcomings, I found chapters 1-3 of Causal Machine Learning pleasant enough to get through, and I look forward to reading more. Getting into DoWhy and other related packages has been on my list, and I’m sure I’ll learn a lot by following the MEAP. After tracking the field for almost a decade and complaining about the relative hype levels of deep learning and causal inference, it’s great to see a practical book that aims to marry the two. The Causal Revolution is truly upon us.\nIt is almost inevitable that when pointing out the mistakes of others I will make mistakes myself. I apologise for any mistakes and welcome feedback. 
↩︎\n","wordCount":"952","inLanguage":"en","image":"https://yanirseroussi.com/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png","datePublished":"2022-09-12T02:45:00Z","dateModified":"2022-09-12T12:56:22+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Causal Machine Learning is off to a good start, despite some issues</h1><div class=post-meta><span title='2022-09-12 02:45:00 +0000 UTC'>September 12, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-09-12-causal-machine-learning-book-draft-review/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/_hu29d7aa7a71e422617b3694161c10ba7c_416864_b46c591c1871236fdf504c5fb3243cd3.png 360w ,https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/_hu29d7aa7a71e422617b3694161c10ba7c_416864_3c78bc66e8a0ee5c04cfc469e70687b1.png 480w ,https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png 500w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png alt="[DALL·E](https://labs.openai.com/)'s _steampunk painting of a data scientist reading a book about causal machine learning_." 
width=500 height=492><p><a href=https://labs.openai.com/ target=_blank rel=noopener>DALL·E</a>&rsquo;s <em>steampunk painting of a data scientist reading a book about causal machine learning</em>.</p></figure><div class=post-content><p>I was recently given a free eBook copy of the MEAP of <a href=https://www.manning.com/books/causal-machine-learning target=_blank rel=noopener><em>Causal Machine Learning</em></a>. <a href=https://www.manning.com/meap-program target=_blank rel=noopener>MEAP stands for Manning Early Access Program</a>, where books are published one chapter at a time. While the current version could use better copyediting and proofreading, I&rsquo;m keen on reading more of the book as it becomes available.</p><p><em>Causal Machine Learning</em> addresses a gap in the causal inference literature: While <a href=https://yanirseroussi.com/causal-inference-reading-list/>much has been published on the topic</a>, putting the theory to practice in the real world can be challenging. For example, even though I considered <a href=https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ target=_blank rel=noopener><em>Causal Inference: What If</em></a> to be <a href=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/>the most practical book I&rsquo;ve read on the topic</a>, I haven&rsquo;t used much of its content directly. This is partly due to my focus on other areas, e.g., <a href=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/>online experimentation</a> and <a href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>the energy space</a>. But it is also due to the availability of sample code and mature packages that can be quickly adapted to my needs. 
The book aims to address the latter through a code-first approach that utilises Python packages such as <a href=https://pyro.ai/ target=_blank rel=noopener>Pyro</a>, <a href=https://pgmpy.org/ target=_blank rel=noopener>pgmpy</a>, and <a href=https://py-why.github.io/dowhy/ target=_blank rel=noopener>DoWhy</a>.</p><p>Despite the code-first promise, the book feels a bit slow at getting into the more exciting content. I couldn&rsquo;t help but compare it to <a href=https://github.com/fastai/fastbook target=_blank rel=noopener>the fast.ai book</a>, which first shows how to build and deploy a custom image classifier, and only then goes into unpacking how it all works. However, despite the verbosity of the first two chapters, by the third chapter things start to get more interesting. At the time of this writing, only chapters 1-3 are available, but upcoming chapters look promising based on the table of contents.</p><p>While lacking a production-ready example early in the book is a minor concern, I found the many grammatical errors more distracting. Even though a MEAP is essentially a draft, I think its proofreading level should be higher than that of a blog post.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> This is especially the case for paid content published by an organisation that cares enough to have contacted me to promote the book. As Steven Pinker says in the intro to <a href=https://stevenpinker.com/publications/sense-style-thinking-persons-guide-writing-21st-century target=_blank rel=noopener>The Sense of Style</a>:</p><blockquote><p>Style earns trust. If readers can see that a writer cares about consistency and accuracy in her prose, they will be reassured that the writer cares about those virtues in conduct they cannot see as easily. 
Here is how one technology executive explains why he rejects job applications filled with errors of grammar and punctuation: &ldquo;If it takes someone more than 20 years to notice how to properly use it&rsquo;s, then that&rsquo;s not a learning curve I&rsquo;m comfortable with.&rdquo; And if that isn&rsquo;t enough to get you to brush up your prose, consider the discovery of the dating site OkCupid that sloppy grammar and spelling in a profile are &ldquo;huge turn-offs.&rdquo; As one client said, &ldquo;If you&rsquo;re trying to date a woman, I don&rsquo;t expect flowery Jane Austen prose. But aren&rsquo;t you trying to put your best foot forward?&rdquo;</p></blockquote><p>Another source of distraction is the choice of variables for some of the toy examples. For instance, one model of blood type inheritance confuses the phenotype and genotype, claiming that <em>&ldquo;knowing your grandfather&rsquo;s [blood] type has no benefit in predicting your type once we know your father&rsquo;s&rdquo;</em>. However, <a href=https://en.wikipedia.org/wiki/ABO_blood_group_system#Genetics target=_blank rel=noopener>knowing the grandparents&rsquo; blood types <em>can</em> help predict the grandchild&rsquo;s blood type even when the parent&rsquo;s blood type is known</a>. The toy example would work if it focused on genotypes, not on the common meaning of <em>blood type</em> as the phenotype (i.e., observable traits). See <a href="https://books.google.com.au/books?id=7dzpHCHzNQ4C&amp;lpg=PA59&amp;ots=px4BFm4XAP&amp;pg=PA58#v=onepage&amp;q&amp;f=false" target=_blank rel=noopener>pages 58-60 in Probabilistic Graphical Models: Principles and Techniques</a> for a less casual presentation of a similar example.</p><figure><a href=wikipedia-blood-group-inheritance-table.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,causal inference,data science,machine learning"><meta name=description content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Causal Machine Learning is off to a good start, despite some issues"><meta property="og:description" content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"><meta property="og:image" content="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-09-12T02:45:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png"><meta name=twitter:title content="Causal Machine Learning is off to a good start, despite some issues"><meta name=twitter:description content="Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Causal Machine Learning is off to a good start, despite some issues","item":"https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Causal Machine Learning is off to a good start, despite some issues","name":"Causal Machine Learning is off to a good start, despite some issues","description":"Reviewing the first three chapters of the book Causal Machine Learning by Robert Osazuwa Ness.","keywords":["artificial intelligence","causal inference","data science","machine learning"],"articleBody":"I was recently given a free eBook copy of the MEAP of Causal Machine Learning. 
MEAP stands for Manning Early Access Program, where books are published one chapter at a time. While the current version could use better copyediting and proofreading, I’m keen on reading more of the book as it becomes available.\nCausal Machine Learning addresses a gap in the causal inference literature: While much has been published on the topic, putting the theory to practice in the real world can be challenging. For example, even though I considered Causal Inference: What If to be the most practical book I’ve read on the topic, I haven’t used much of its content directly. This is partly due to my focus on other areas, e.g., online experimentation and the energy space. But it is also due to the availability of sample code and mature packages that can be quickly adapted to my needs. The book aims to address the latter through a code-first approach that utilises Python packages such as Pyro, pgmpy, and DoWhy.\nDespite the code-first promise, the book feels a bit slow at getting into the more exciting content. I couldn’t help but compare it to the fast.ai book, which first shows how to build and deploy a custom image classifier, and only then goes into unpacking how it all works. However, despite the verbosity of the first two chapters, by the third chapter things start to get more interesting. At the time of this writing, only chapters 1-3 are available, but upcoming chapters look promising based on the table of contents.\nWhile lacking a production-ready example early in the book is a minor concern, I found the many grammatical errors more distracting. Even though a MEAP is essentially a draft, I think its proofreading level should be higher than that of a blog post.1 This is especially the case for paid content published by an organisation that cares enough to have contacted me to promote the book. As Steven Pinker says in the intro to The Sense of Style:\nStyle earns trust. 
If readers can see that a writer cares about consistency and accuracy in her prose, they will be reassured that the writer cares about those virtues in conduct they cannot see as easily. Here is how one technology executive explains why he rejects job applications filled with errors of grammar and punctuation: “If it takes someone more than 20 years to notice how to properly use it’s, then that’s not a learning curve I’m comfortable with.” And if that isn’t enough to get you to brush up your prose, consider the discovery of the dating site OkCupid that sloppy grammar and spelling in a profile are “huge turn-offs.” As one client said, “If you’re trying to date a woman, I don’t expect flowery Jane Austen prose. But aren’t you trying to put your best foot forward?”\nAnother source of distraction is the choice of variables for some of the toy examples. For instance, one model of blood type inheritance confuses the phenotype and genotype, claiming that “knowing your grandfather’s [blood] type has no benefit in predicting your type once we know your father’s”. However, knowing the grandparents’ blood types can help predict the grandchild’s blood type even when the parent’s blood type is known. The toy example would work if it focused on genotypes, not on the common meaning of blood type as the phenotype (i.e., observable traits). See pages 58-60 in Probabilistic Graphical Models: Principles and Techniques for a less casual presentation of a similar example.\nWhen observing parent phenotypes (ABO blood types) without genotypes, grandparent phenotypes are informative.\nSource: Wikipedia – ABO blood group system (retrieved on 2022-09-11). I also struggle with overly-casual statements like this one:\nSuppose we were interested in modeling the relationship between altitude and temperature. The two are clearly correlated; the higher up you go, the colder it gets. 
However, you know temperature doesn’t cause altitude, otherwise heating the air within a city would cause the city to fly. Altitude is the cause, and temperature is the effect.\nIn fact, heating the air within a city would cause the heated air to rise. And extremely high heat can melt a city and the land it’s on, thereby causing a reduction in its altitude.\nWhile this may seem like nitpicking, ill-defined causal graphs are a serious problem. One of my favourite papers on the topic is Does water kill? A call for less casual causal inferences, which argues that \"[while] it is impossible to provide an absolutely precise definition of a version of treatment […] specification of versions of treatment is required only until no meaningful vagueness remains\". However, “declaring a version of treatment sufficiently well-defined is a matter of agreement among experts based on the available substantive knowledge” because we don’t have an objective way of determining that treatments are well-defined. In line with this thinking, the book may benefit from reducing the variety of examples in favour of a handful of small datasets that are more well-defined and defensible.\nDespite these shortcomings, I found chapters 1-3 of Causal Machine Learning pleasant enough to get through, and I look forward to reading more. Getting into DoWhy and other related packages has been on my list, and I’m sure I’ll learn a lot by following the MEAP. After tracking the field for almost a decade and complaining about the relative hype levels of deep learning and causal inference, it’s great to see a practical book that aims to marry the two. The Causal Revolution is truly upon us.\nIt is almost inevitable that when pointing out the mistakes of others I will make mistakes myself. I apologise for any mistakes and welcome feedback. 
↩︎\n","wordCount":"952","inLanguage":"en","image":"https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png","datePublished":"2022-09-12T02:45:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" 
y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Causal Machine Learning is off to a good start, despite some issues</h1><div class=post-meta><span title='2022-09-12 02:45:00 +0000 UTC'>September 12, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-09-12-causal-machine-learning-book-draft-review/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/_hu29d7aa7a71e422617b3694161c10ba7c_416864_b46c591c1871236fdf504c5fb3243cd3.png 360w ,https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/_hu29d7aa7a71e422617b3694161c10ba7c_416864_3c78bc66e8a0ee5c04cfc469e70687b1.png 480w ,https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png 500w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/dall-e-a-steampunk-painting-of-a-data-scientist-reading-a-book-about-causal-machine-learning.png alt="[DALL·E](https://labs.openai.com/)'s _steampunk painting of a data scientist reading a book about causal machine learning_." 
width=500 height=492><p><a href=https://labs.openai.com/ target=_blank rel=noopener>DALL·E</a>&rsquo;s <em>steampunk painting of a data scientist reading a book about causal machine learning</em>.</p></figure><div class=post-content><p>I was recently given a free eBook copy of the MEAP of <a href=https://www.manning.com/books/causal-machine-learning target=_blank rel=noopener><em>Causal Machine Learning</em></a>. <a href=https://www.manning.com/meap-program target=_blank rel=noopener>MEAP stands for Manning Early Access Program</a>, where books are published one chapter at a time. While the current version could use better copyediting and proofreading, I&rsquo;m keen on reading more of the book as it becomes available.</p><p><em>Causal Machine Learning</em> addresses a gap in the causal inference literature: While <a href=https://yanirseroussi.com/causal-inference-reading-list/>much has been published on the topic</a>, putting the theory to practice in the real world can be challenging. For example, even though I considered <a href=https://www.hsph.harvard.edu/miguel-hernan/causal-inference-book/ target=_blank rel=noopener><em>Causal Inference: What If</em></a> to be <a href=https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/>the most practical book I&rsquo;ve read on the topic</a>, I haven&rsquo;t used much of its content directly. This is partly due to my focus on other areas, e.g., <a href=https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/>online experimentation</a> and <a href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>the energy space</a>. But it is also due to the availability of sample code and mature packages that can be quickly adapted to my needs. 
The book aims to address the latter through a code-first approach that utilises Python packages such as <a href=https://pyro.ai/ target=_blank rel=noopener>Pyro</a>, <a href=https://pgmpy.org/ target=_blank rel=noopener>pgmpy</a>, and <a href=https://py-why.github.io/dowhy/ target=_blank rel=noopener>DoWhy</a>.</p><p>Despite the code-first promise, the book feels a bit slow at getting into the more exciting content. I couldn&rsquo;t help but compare it to <a href=https://github.com/fastai/fastbook target=_blank rel=noopener>the fast.ai book</a>, which first shows how to build and deploy a custom image classifier, and only then goes into unpacking how it all works. However, despite the verbosity of the first two chapters, by the third chapter things start to get more interesting. At the time of this writing, only chapters 1-3 are available, but upcoming chapters look promising based on the table of contents.</p><p>While lacking a production-ready example early in the book is a minor concern, I found the many grammatical errors more distracting. Even though a MEAP is essentially a draft, I think its proofreading level should be higher than that of a blog post.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> This is especially the case for paid content published by an organisation that cares enough to have contacted me to promote the book. As Steven Pinker says in the intro to <a href=https://stevenpinker.com/publications/sense-style-thinking-persons-guide-writing-21st-century target=_blank rel=noopener>The Sense of Style</a>:</p><blockquote><p>Style earns trust. If readers can see that a writer cares about consistency and accuracy in her prose, they will be reassured that the writer cares about those virtues in conduct they cannot see as easily. 
Here is how one technology executive explains why he rejects job applications filled with errors of grammar and punctuation: &ldquo;If it takes someone more than 20 years to notice how to properly use it&rsquo;s, then that&rsquo;s not a learning curve I&rsquo;m comfortable with.&rdquo; And if that isn&rsquo;t enough to get you to brush up your prose, consider the discovery of the dating site OkCupid that sloppy grammar and spelling in a profile are &ldquo;huge turn-offs.&rdquo; As one client said, &ldquo;If you&rsquo;re trying to date a woman, I don&rsquo;t expect flowery Jane Austen prose. But aren&rsquo;t you trying to put your best foot forward?&rdquo;</p></blockquote><p>Another source of distraction is the choice of variables for some of the toy examples. For instance, one model of blood type inheritance confuses the phenotype and genotype, claiming that <em>&ldquo;knowing your grandfather&rsquo;s [blood] type has no benefit in predicting your type once we know your father&rsquo;s&rdquo;</em>. However, <a href=https://en.wikipedia.org/wiki/ABO_blood_group_system#Genetics target=_blank rel=noopener>knowing the grandparents&rsquo; blood types <em>can</em> help predict the grandchild&rsquo;s blood type even when the parent&rsquo;s blood type is known</a>. The toy example would work if it focused on genotypes, not on the common meaning of <em>blood type</em> as the phenotype (i.e., observable traits). See <a href="https://books.google.com.au/books?id=7dzpHCHzNQ4C&amp;lpg=PA59&amp;ots=px4BFm4XAP&amp;pg=PA58#v=onepage&amp;q&amp;f=false" target=_blank rel=noopener>pages 58-60 in Probabilistic Graphical Models: Principles and Techniques</a> for a less casual presentation of a similar example.</p><figure><a href=wikipedia-blood-group-inheritance-table.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/wikipedia-blood-group-inheritance-table_huf49d4a91fcaf351951cef8de23dd1e54_204843_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/wikipedia-blood-group-inheritance-table_huf49d4a91fcaf351951cef8de23dd1e54_204843_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/wikipedia-blood-group-inheritance-table_huf49d4a91fcaf351951cef8de23dd1e54_204843_720x0_resize_box_3.png 720w,
diff --git a/2022/12/11/chatgpt-is-transformative-ai/index.html b/2022/12/11/chatgpt-is-transformative-ai/index.html
index ccf48b9ac..47c417391 100644
--- a/2022/12/11/chatgpt-is-transformative-ai/index.html
+++ b/2022/12/11/chatgpt-is-transformative-ai/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>ChatGPT is transformative AI | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,futurism,machine intelligence,machine learning"><meta name=description content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="ChatGPT is transformative AI"><meta property="og:description" content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"><meta property="og:image" content="https://yanirseroussi.com/mage-space-prompt-human-brain-expanding.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-12-11T00:00:00+00:00"><meta property="article:modified_time" content="2022-12-11T10:07:24+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/mage-space-prompt-human-brain-expanding.webp"><meta name=twitter:title content="ChatGPT is transformative AI"><meta name=twitter:description content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"ChatGPT is transformative AI","item":"https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"ChatGPT is transformative AI","name":"ChatGPT is transformative AI","description":"My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it\u0026rsquo;s only the beginning.","keywords":["artificial intelligence","futurism","machine intelligence","machine learning"],"articleBody":"I remember the days before Google: Finding answers on the internet was tedious and clunky, I had to switch between search engines or run meta-searches to get workable results, and it still felt like I wasn’t finding all the information that was out there. 
Then Google came along and everything changed – I felt like I had gained new super-powers.\nUsing ChatGPT feels at least as transformative as switching from AltaVista to Google. After only a few days of working with ChatGPT, I feel like it has made me much more effective. It’d be hard to go back to pre-ChatGPT life.\nIt’s worth noting that I tend to be a mid-to-late adopter of shiny new consumer tech. I’m also a bit of an AI hype skeptic. Twitter? Seemed like a useless tool back in the day, and a mostly harmful one these days. Facebook? I resisted for a few years, and reluctantly ended up joining to avoid missing out on real-life social activity. Smartphones? Very useful, but also very distracting – I often have mine on airplane mode to avoid getting sucked in. Crypto? Still too volatile and speculative for me. Dall-E and Stable Diffusion? Fun toys, but not too useful in my everyday life.\nChatGPT is different because it distills information that is out there and makes it relevant to me. I feel like I’m still retaining agency, unlike with social media and other tools that are designed to suck me in. ChatGPT is more like a classic search engine that’s there to help when needed. I’m hooked, but not addicted.\nIn the past week, my work-related ChatGPT usage included questions about Nginx, Prefect, Python, AWS, React, MySQL, Google Sheets, and probably a few other tools. This makes it vastly more useful than GitHub Copilot, which I stopped using when it became paid. The problem with GitHub Copilot isn’t that it doesn’t provide useful output – some of its code completions feel like pure magic. The issue is more with the interface – it often distracts me from what I’m trying to do. In that sense, it’s less like a copilot and more like a backseat driver.\nChatGPT feels like a helpful copilot, personal assistant, coach, and much more – definitely worth paying for. 
In addition to technical advice, I asked it questions about the meaning of time, the Joel test for learning designers, rephrasing text, investment, and career-related issues. It wasn’t always correct, but it was often informative and thought-provoking. This is more than can be said for interactions with some humans.\nThe OpenAI team pretty much nailed the user experience and interface. With an ongoing chat, I can get more useful results by refining my queries. Unlike with a search engine, I don’t need to wade through sometimes-dodgy websites and discrepant interfaces to get what I’m looking for. ChatGPT makes information accessible and useful – like Google’s mission, but often better than Google (though it may catch up).\nThe exciting and terrifying thing is that the tech is still in its infancy. It’s going to get radically better and different, and disrupt many industries and people. The rise of machine intelligence continues – ChatGPT is a significant transformative AI step.\n","wordCount":"538","inLanguage":"en","image":"https://yanirseroussi.com/mage-space-prompt-human-brain-expanding.webp","datePublished":"2022-12-11T00:00:00Z","dateModified":"2022-12-11T10:07:24+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi 
| Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">ChatGPT is transformative AI</h1><div class=post-meta><span title='2022-12-11 00:00:00 +0000 UTC'>December 11, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-12-11-chatgpt-is-transformative-ai/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_360x0_resize_q75_h2_box_2.webp 360w 
,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_1500x0_resize_q75_h2_box_2.webp 1500w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp 3616w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp alt="[Mage](https://www.mage.space/)'s interpretation of _human brain expanding_" width=3616 height=2048><p><a href=https://www.mage.space/ target=_blank rel=noopener>Mage</a>&rsquo;s interpretation of <em>human brain expanding</em></p></figure><div class=post-content><p>I remember the days before Google: Finding answers on the internet was tedious and clunky, I had to switch between search engines or run meta-searches to get workable results, and it still felt like I wasn&rsquo;t finding all the information that was out there. Then Google came along and everything changed – I felt like I had gained new super-powers.</p><p><strong>Using <a href=https://openai.com/blog/chatgpt/ target=_blank rel=noopener>ChatGPT</a> feels at least as transformative as switching from <a href=https://en.wikipedia.org/wiki/AltaVista target=_blank rel=noopener>AltaVista</a> to Google.</strong> After only a few days of working with ChatGPT, I feel like it has made me much more effective. 
It&rsquo;d be hard to go back to pre-ChatGPT life.</p><p>It&rsquo;s worth noting that I tend to be a mid-to-late adopter of shiny new consumer tech. I&rsquo;m also <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>a bit of an AI hype skeptic</a>. <em>Twitter?</em> Seemed like a useless tool back in the day, and a mostly harmful one these days. <em>Facebook?</em> I resisted for a few years, and reluctantly ended up joining to avoid missing out on real-life social activity. <em>Smartphones?</em> Very useful, but also very distracting – I often have mine on airplane mode to avoid getting sucked in. <em>Crypto?</em> Still too volatile and speculative for me. <em>Dall-E and Stable Diffusion?</em> Fun toys, but not too useful in my everyday life.</p><p><strong>ChatGPT is different because it distills information that is out there and makes it relevant to me.</strong> I feel like I&rsquo;m still retaining agency, unlike with social media and other tools that are designed to suck me in. ChatGPT is more like a classic search engine that&rsquo;s there to help when needed. I&rsquo;m <a href=https://www.nirandfar.com/hooked/ target=_blank rel=noopener>hooked</a>, but not addicted.</p><p>In the past week, my work-related ChatGPT usage included questions about Nginx, Prefect, Python, AWS, React, MySQL, Google Sheets, and probably a few other tools. This makes it vastly more useful than GitHub Copilot, which I stopped using when it became paid. The problem with GitHub Copilot isn&rsquo;t that it doesn&rsquo;t provide useful output – some of its code completions feel like pure magic. The issue is more with the interface – it often distracts me from what I&rsquo;m trying to do. 
In that sense, it&rsquo;s less like a copilot and more like a <a href=https://en.wikipedia.org/wiki/Back-seat_driver target=_blank rel=noopener>backseat driver</a>.</p><p>ChatGPT feels like a helpful copilot, personal assistant, coach, and much more – definitely worth paying for. In addition to technical advice, I asked it questions about the meaning of time, <a href=https://www.linkedin.com/posts/anushka-fowler-27524038_chatgpt-learningdesign-activity-7005677354651971584-iMvT target=_blank rel=noopener>the Joel test for learning designers</a>, rephrasing text, investment, and career-related issues. It wasn&rsquo;t always correct, but it was often informative and thought-provoking. This is more than can be said for interactions with some humans.</p><p>The OpenAI team pretty much nailed the user experience and interface. With an ongoing chat, I can get more useful results by refining my queries. Unlike with a search engine, I don&rsquo;t need to wade through sometimes-dodgy websites and discrepant interfaces to get what I&rsquo;m looking for. ChatGPT makes information accessible and useful – like Google&rsquo;s mission, but often better than Google (though <a href=https://bigtechnology.substack.com/p/why-google-missed-chatgpt target=_blank rel=noopener>it may catch up</a>).</p><p>The exciting and terrifying thing is that the tech is still in its infancy. It&rsquo;s going to get radically better and <em>different</em>, and disrupt many industries and people. <a href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/>The rise of machine intelligence continues</a> – ChatGPT is a significant <a href=https://www.lesswrong.com/tag/transformative-ai target=_blank rel=noopener>transformative AI</a> step.</p><figure><a href=chat-gpt-is-transformative-ai.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,futurism,machine intelligence,machine learning"><meta name=description content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="ChatGPT is transformative AI"><meta property="og:description" content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"><meta property="og:image" content="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2022-12-11T00:00:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp"><meta name=twitter:title content="ChatGPT is transformative AI"><meta name=twitter:description content="My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it&rsquo;s only the beginning."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"ChatGPT is transformative AI","item":"https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"ChatGPT is transformative AI","name":"ChatGPT is transformative AI","description":"My perspective after a week of using ChatGPT: This is a step change in finding distilled information, and it\u0026rsquo;s only the beginning.","keywords":["artificial intelligence","futurism","machine intelligence","machine learning"],"articleBody":"I remember the days before Google: Finding answers on the internet was tedious and clunky, I had to switch between search engines or run meta-searches to get workable results, and it still felt like I wasn’t finding all the information that was out there. 
Then Google came along and everything changed – I felt like I had gained new super-powers.\nUsing ChatGPT feels at least as transformative as switching from AltaVista to Google. After only a few days of working with ChatGPT, I feel like it has made me much more effective. It’d be hard to go back to pre-ChatGPT life.\nIt’s worth noting that I tend to be a mid-to-late adopter of shiny new consumer tech. I’m also a bit of an AI hype skeptic. Twitter? Seemed like a useless tool back in the day, and a mostly harmful one these days. Facebook? I resisted for a few years, and reluctantly ended up joining to avoid missing out on real-life social activity. Smartphones? Very useful, but also very distracting – I often have mine on airplane mode to avoid getting sucked in. Crypto? Still too volatile and speculative for me. Dall-E and Stable Diffusion? Fun toys, but not too useful in my everyday life.\nChatGPT is different because it distills information that is out there and makes it relevant to me. I feel like I’m still retaining agency, unlike with social media and other tools that are designed to suck me in. ChatGPT is more like a classic search engine that’s there to help when needed. I’m hooked, but not addicted.\nIn the past week, my work-related ChatGPT usage included questions about Nginx, Prefect, Python, AWS, React, MySQL, Google Sheets, and probably a few other tools. This makes it vastly more useful than GitHub Copilot, which I stopped using when it became paid. The problem with GitHub Copilot isn’t that it doesn’t provide useful output – some of its code completions feel like pure magic. The issue is more with the interface – it often distracts me from what I’m trying to do. In that sense, it’s less like a copilot and more like a backseat driver.\nChatGPT feels like a helpful copilot, personal assistant, coach, and much more – definitely worth paying for. 
In addition to technical advice, I asked it questions about the meaning of time, the Joel test for learning designers, rephrasing text, investment, and career-related issues. It wasn’t always correct, but it was often informative and thought-provoking. This is more than can be said for interactions with some humans.\nThe OpenAI team pretty much nailed the user experience and interface. With an ongoing chat, I can get more useful results by refining my queries. Unlike with a search engine, I don’t need to wade through sometimes-dodgy websites and discrepant interfaces to get what I’m looking for. ChatGPT makes information accessible and useful – like Google’s mission, but often better than Google (though it may catch up).\nThe exciting and terrifying thing is that the tech is still in its infancy. It’s going to get radically better and different, and disrupt many industries and people. The rise of machine intelligence continues – ChatGPT is a significant transformative AI step.\n","wordCount":"538","inLanguage":"en","image":"https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp","datePublished":"2022-12-11T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & 
AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">ChatGPT is transformative AI</h1><div class=post-meta><span title='2022-12-11 00:00:00 +0000 UTC'>December 11, 2022</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2022-12-11-chatgpt-is-transformative-ai/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_360x0_resize_q75_h2_box_2.webp 360w 
,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding_hu9fea10895dd82f471654699883db12ed_221426_1500x0_resize_q75_h2_box_2.webp 1500w ,https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp 3616w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/mage-space-prompt-human-brain-expanding.webp alt="[Mage](https://www.mage.space/)'s interpretation of _human brain expanding_" width=3616 height=2048><p><a href=https://www.mage.space/ target=_blank rel=noopener>Mage</a>&rsquo;s interpretation of <em>human brain expanding</em></p></figure><div class=post-content><p>I remember the days before Google: Finding answers on the internet was tedious and clunky, I had to switch between search engines or run meta-searches to get workable results, and it still felt like I wasn&rsquo;t finding all the information that was out there. Then Google came along and everything changed – I felt like I had gained new super-powers.</p><p><strong>Using <a href=https://openai.com/blog/chatgpt/ target=_blank rel=noopener>ChatGPT</a> feels at least as transformative as switching from <a href=https://en.wikipedia.org/wiki/AltaVista target=_blank rel=noopener>AltaVista</a> to Google.</strong> After only a few days of working with ChatGPT, I feel like it has made me much more effective. 
It&rsquo;d be hard to go back to pre-ChatGPT life.</p><p>It&rsquo;s worth noting that I tend to be a mid-to-late adopter of shiny new consumer tech. I&rsquo;m also <a href=https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/>a bit of an AI hype skeptic</a>. <em>Twitter?</em> Seemed like a useless tool back in the day, and a mostly harmful one these days. <em>Facebook?</em> I resisted for a few years, and reluctantly ended up joining to avoid missing out on real-life social activity. <em>Smartphones?</em> Very useful, but also very distracting – I often have mine on airplane mode to avoid getting sucked in. <em>Crypto?</em> Still too volatile and speculative for me. <em>Dall-E and Stable Diffusion?</em> Fun toys, but not too useful in my everyday life.</p><p><strong>ChatGPT is different because it distills information that is out there and makes it relevant to me.</strong> I feel like I&rsquo;m still retaining agency, unlike with social media and other tools that are designed to suck me in. ChatGPT is more like a classic search engine that&rsquo;s there to help when needed. I&rsquo;m <a href=https://www.nirandfar.com/hooked/ target=_blank rel=noopener>hooked</a>, but not addicted.</p><p>In the past week, my work-related ChatGPT usage included questions about Nginx, Prefect, Python, AWS, React, MySQL, Google Sheets, and probably a few other tools. This makes it vastly more useful than GitHub Copilot, which I stopped using when it became paid. The problem with GitHub Copilot isn&rsquo;t that it doesn&rsquo;t provide useful output – some of its code completions feel like pure magic. The issue is more with the interface – it often distracts me from what I&rsquo;m trying to do. 
In that sense, it&rsquo;s less like a copilot and more like a <a href=https://en.wikipedia.org/wiki/Back-seat_driver target=_blank rel=noopener>backseat driver</a>.</p><p>ChatGPT feels like a helpful copilot, personal assistant, coach, and much more – definitely worth paying for. In addition to technical advice, I asked it questions about the meaning of time, <a href=https://www.linkedin.com/posts/anushka-fowler-27524038_chatgpt-learningdesign-activity-7005677354651971584-iMvT target=_blank rel=noopener>the Joel test for learning designers</a>, rephrasing text, investment, and career-related issues. It wasn&rsquo;t always correct, but it was often informative and thought-provoking. This is more than can be said for interactions with some humans.</p><p>The OpenAI team pretty much nailed the user experience and interface. With an ongoing chat, I can get more useful results by refining my queries. Unlike with a search engine, I don&rsquo;t need to wade through sometimes-dodgy websites and discrepant interfaces to get what I&rsquo;m looking for. ChatGPT makes information accessible and useful – like Google&rsquo;s mission, but often better than Google (though <a href=https://bigtechnology.substack.com/p/why-google-missed-chatgpt target=_blank rel=noopener>it may catch up</a>).</p><p>The exciting and terrifying thing is that the tech is still in its infancy. It&rsquo;s going to get radically better and <em>different</em>, and disrupt many industries and people. <a href=https://yanirseroussi.com/2016/03/20/the-rise-of-greedy-robots/>The rise of machine intelligence continues</a> – ChatGPT is a significant <a href=https://www.lesswrong.com/tag/transformative-ai target=_blank rel=noopener>transformative AI</a> step.</p><figure><a href=chat-gpt-is-transformative-ai.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/chat-gpt-is-transformative-ai_hu2a6554fee21e5adc8b4e5b4193592a03_183217_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/chat-gpt-is-transformative-ai_hu2a6554fee21e5adc8b4e5b4193592a03_183217_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/chat-gpt-is-transformative-ai_hu2a6554fee21e5adc8b4e5b4193592a03_183217_720x0_resize_box_3.png 720w,
diff --git a/2023/04/21/remaining-relevant-as-a-small-language-model/index.html b/2023/04/21/remaining-relevant-as-a-small-language-model/index.html
index ceb607f53..ddc79c93c 100644
--- a/2023/04/21/remaining-relevant-as-a-small-language-model/index.html
+++ b/2023/04/21/remaining-relevant-as-a-small-language-model/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Remaining relevant as a small language model | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,career,futurism,machine intelligence"><meta name=description content="Bing Chat recently quipped that humans are small language models. Here are some of my thoughts on how we small language models can remain relevant (for now)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Remaining relevant as a small language model"><meta property="og:description" content="Bing Chat recently quipped that humans are small language models. 
Here are some of my thoughts on how we small language models can remain relevant (for now)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"><meta property="og:image" content="https://yanirseroussi.com/mage-horse-versus-car-minimalistic.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-04-21T00:06:30+00:00"><meta property="article:modified_time" content="2023-04-21T16:32:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/mage-horse-versus-car-minimalistic.webp"><meta name=twitter:title content="Remaining relevant as a small language model"><meta name=twitter:description content="Bing Chat recently quipped that humans are small language models. Here are some of my thoughts on how we small language models can remain relevant (for now)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Remaining relevant as a small language model","item":"https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Remaining relevant as a small language model","name":"Remaining relevant as a small language model","description":"Bing Chat recently quipped that humans are small language models. Here are some of my thoughts on how we small language models can remain relevant (for now).","keywords":["artificial intelligence","career","futurism","machine intelligence"],"articleBody":" Computer science as a field is in for a pretty major upheaval few of us are really prepared for. 
Programming will be obsolete.\nMatt Welsh The End of Programming (2023) Many of us feel both despair and awe when contemplating recent AI developments: Despair due to the rapid pace of automation that threatens personal and social stability. Awe due to the seemingly-magical ability of computers to outperform most humans on an ever-expanding range of tasks. But there is nothing magical about human intelligence, just as there is no magic formula that makes horses gallop and birds fly. That is, it can all be replicated with the right machinery.\nIn its wild early days, Bing Chat referred to a user as “a late version of a small language model”. While there’s more to humans than language, there’s no denying that our language processing abilities are limited by our biology. Meanwhile, computers don’t face the same constraints. This raises the question: As small language models, what can we do that is still of value?\nBing Chat actually made some good points on humans Given how rapidly things are evolving, it’s hard for me to say anything definitive, but here are some things I believe to be true:\nDespite the hype and inevitability of bullshit applications, the current wave of AI innovation has concrete everyday uses that are already transforming our world – it’s not another crypto. This is reflected by the excitement of normally level-headed people who have seen tech trends come and go, such as Bill Gates, Matt Welsh, and Steve Yegge (among many others). Human society has a seemingly-insatiable appetite for inventing bullshit jobs. If one could wave a magic wand and reorganise the world, we could all work less and have more. Given that such a wand does not exist, I wouldn’t bet on AI displacing human labour in an orderly or reasonable manner. At best, it’s going to be messy. Current-generation AI models have limited real-world understanding. 
They don’t have the curiosity, rigour, truthfulness, and real-world grounding that some humans have, i.e., these models don’t exhibit a deep capacity for critical thinking. Some humans who work in language-driven domains exhibit a low capacity for critical thinking (e.g., some programmers). Such humans are prime targets for displacement by AI. Therefore, for us small language models to remain economically relevant in a world where large language models are becoming more pervasive, we need to keep developing our critical thinking skills. We definitely can’t beat computers on breadth of knowledge, cost, reliability, or work capacity. In my experience, deep critical thinking is also what distinguishes mediocre from excellent employees.\nIn the past, many organisations had to choose between employing mediocre workers and simply not getting some tasks done. Now, a new option is evolving: Hand over such tasks to AI agents. And this isn’t a hypothetical scenario. Personally, given the choice of reviewing flawed code produced by an AI or the same code produced by a human, I much prefer the former. This is mostly because AIs like ChatGPT respond better to feedback. Similarly, Simon Willison recently observed that working with ChatGPT Code Interpreter is like having a free intern that responds incredibly well to feedback. He also noted that AI-enhanced development makes him more ambitious with his projects.\nThere’s an often-repeated claim that “you won’t be replaced by AI, you’ll be replaced by a person using AI”. I’m not too sure about that – horses were almost fully replaced by motor vehicles, for example. That claim is likely true for now, though – mastering new tools is an important skill, which is where human curiosity and rigour come in. But in the long term, I’d much rather see a world where humans become as economically irrelevant as horses. 
I’d rather we all flourish and have more time for play – let the machines do what we today call work.\nWhat might happen once you’ve finally mastered the latest AI tools. Source: Reddit ","wordCount":"662","inLanguage":"en","image":"https://yanirseroussi.com/mage-horse-versus-car-minimalistic.webp","datePublished":"2023-04-21T00:06:30Z","dateModified":"2023-04-21T16:32:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line 
x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Remaining relevant as a small language model</h1><div class=post-meta><span title='2023-04-21 00:06:30 +0000 UTC'>April 21, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-04-21-remaining-relevant-as-a-small-language-model/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_1500x0_resize_q75_h2_box_2.webp 1500w 
,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp 2048w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp alt="[Mage](https://www.mage.space/)'s interpretation of _horse versus car minimalistic_" width=2048 height=1024><p><a href=https://www.mage.space/ target=_blank rel=noopener>Mage</a>&rsquo;s interpretation of <em>horse versus car minimalistic</em></p></figure><div class=post-content><blockquote><p>Computer science as a field is in for a pretty major upheaval few of us are really prepared for. Programming will be obsolete.</p><footer><strong>Matt Welsh</strong>
+<meta name=keywords content="artificial intelligence,career,futurism,machine intelligence"><meta name=description content="Bing Chat recently quipped that humans are small language models. Here are some of my thoughts on how we small language models can remain relevant (for now)."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Remaining relevant as a small language model"><meta property="og:description" content="Bing Chat recently quipped that humans are small language models. 
Here are some of my thoughts on how we small language models can remain relevant (for now)."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"><meta property="og:image" content="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-04-21T00:06:30+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp"><meta name=twitter:title content="Remaining relevant as a small language model"><meta name=twitter:description content="Bing Chat recently quipped that humans are small language models. Here are some of my thoughts on how we small language models can remain relevant (for now)."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Remaining relevant as a small language model","item":"https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Remaining relevant as a small language model","name":"Remaining relevant as a small language model","description":"Bing Chat recently quipped that humans are small language models. 
Here are some of my thoughts on how we small language models can remain relevant (for now).","keywords":["artificial intelligence","career","futurism","machine intelligence"],"articleBody":" Computer science as a field is in for a pretty major upheaval few of us are really prepared for. Programming will be obsolete.\nMatt Welsh The End of Programming (2023) Many of us feel both despair and awe when contemplating recent AI developments: Despair due to the rapid pace of automation that threatens personal and social stability. Awe due to the seemingly-magical ability of computers to outperform most humans on an ever-expanding range of tasks. But there is nothing magical about human intelligence, just as there is no magic formula that makes horses gallop and birds fly. That is, it can all be replicated with the right machinery.\nIn its wild early days, Bing Chat referred to a user as “a late version of a small language model”. While there’s more to humans than language, there’s no denying that our language processing abilities are limited by our biology. Meanwhile, computers don’t face the same constraints. This raises the question: As small language models, what can we do that is still of value?\nBing Chat actually made some good points on humans Given how rapidly things are evolving, it’s hard for me to say anything definitive, but here are some things I believe to be true:\nDespite the hype and inevitability of bullshit applications, the current wave of AI innovation has concrete everyday uses that are already transforming our world – it’s not another crypto. This is reflected by the excitement of normally level-headed people who have seen tech trends come and go, such as Bill Gates, Matt Welsh, and Steve Yegge (among many others). Human society has a seemingly-insatiable appetite for inventing bullshit jobs. If one could wave a magic wand and reorganise the world, we could all work less and have more. 
Given that such a wand does not exist, I wouldn’t bet on AI displacing human labour in an orderly or reasonable manner. At best, it’s going to be messy. Current-generation AI models have limited real-world understanding. They don’t have the curiosity, rigour, truthfulness, and real-world grounding that some humans have, i.e., these models don’t exhibit a deep capacity for critical thinking. Some humans who work in language-driven domains exhibit a low capacity for critical thinking (e.g., some programmers). Such humans are prime targets for displacement by AI. Therefore, for us small language models to remain economically relevant in a world where large language models are becoming more pervasive, we need to keep developing our critical thinking skills. We definitely can’t beat computers on breadth of knowledge, cost, reliability, or work capacity. In my experience, deep critical thinking is also what distinguishes mediocre from excellent employees.\nIn the past, many organisations had to choose between employing mediocre workers and simply not getting some tasks done. Now, a new option is evolving: Hand over such tasks to AI agents. And this isn’t a hypothetical scenario. Personally, given the choice of reviewing flawed code produced by an AI or the same code produced by a human, I much prefer the former. This is mostly because AIs like ChatGPT respond better to feedback. Similarly, Simon Willison recently observed that working with ChatGPT Code Interpreter is like having a free intern that responds incredibly well to feedback. He also noted that AI-enhanced development makes him more ambitious with his projects.\nThere’s an often-repeated claim that “you won’t be replaced by AI, you’ll be replaced by a person using AI”. I’m not too sure about that – horses were almost fully replaced by motor vehicles, for example. That claim is likely true for now, though – mastering new tools is an important skill, which is where human curiosity and rigour come in. 
But in the long term, I’d much rather see a world where humans become as economically irrelevant as horses. I’d rather we all flourish and have more time for play – let the machines do what we today call work.\nWhat might happen once you’ve finally mastered the latest AI tools. Source: Reddit ","wordCount":"662","inLanguage":"en","image":"https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp","datePublished":"2023-04-21T00:06:30Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" 
y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Remaining relevant as a small language model</h1><div class=post-meta><span title='2023-04-21 00:06:30 +0000 UTC'>April 21, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-04-21-remaining-relevant-as-a-small-language-model/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_1080x0_resize_q75_h2_box_2.webp 1080w 
,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic_huea5d140e2f65b46659ed7ae4a95a035f_308502_1500x0_resize_q75_h2_box_2.webp 1500w ,https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp 2048w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/mage-horse-versus-car-minimalistic.webp alt="[Mage](https://www.mage.space/)'s interpretation of _horse versus car minimalistic_" width=2048 height=1024><p><a href=https://www.mage.space/ target=_blank rel=noopener>Mage</a>&rsquo;s interpretation of <em>horse versus car minimalistic</em></p></figure><div class=post-content><blockquote><p>Computer science as a field is in for a pretty major upheaval few of us are really prepared for. Programming will be obsolete.</p><footer><strong>Matt Welsh</strong>
 <cite><a href=https://cacm.acm.org/magazines/2023/1/267976-the-end-of-programming/fulltext title=https://cacm.acm.org/magazines/2023/1/267976-the-end-of-programming/fulltext target=_blank rel=noopener>The End of Programming (2023)</a></cite></footer></blockquote><p>Many of us feel both despair and awe when contemplating recent AI developments: <em>Despair</em> due to the rapid pace of automation that threatens personal and social stability. <em>Awe</em> due to the seemingly-magical ability of computers to outperform most humans on an ever-expanding range of tasks. But there is nothing magical about human intelligence, just as there is no magic formula that makes horses gallop and birds fly. That is, it can all be replicated with the right machinery.</p><p>In its wild early days, Bing Chat referred to a user as <em>&ldquo;a late version of a small language model&rdquo;.</em> While there&rsquo;s more to humans than language, there&rsquo;s no denying that our language processing abilities are limited by our biology. Meanwhile, <a href=https://www.cold-takes.com/forecasting-transformative-ai-the-biological-anchors-method-in-a-nutshell/ target=_blank rel=noopener>computers don&rsquo;t face the same constraints</a>. This raises the question: <em>As small language models, what can we do that is still of value?</em></p><figure><a href=bing-chat-small-language-model.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/bing-chat-small-language-model_huafde71da5be9f6a07d947e542269a2bf_696853_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/bing-chat-small-language-model_huafde71da5be9f6a07d947e542269a2bf_696853_480x0_resize_box_3.png 480w,
diff --git a/2023/05/26/how-hackable-are-automated-coding-assessments/index.html b/2023/05/26/how-hackable-are-automated-coding-assessments/index.html
index efa4b6ef3..bdee063e5 100644
--- a/2023/05/26/how-hackable-are-automated-coding-assessments/index.html
+++ b/2023/05/26/how-hackable-are-automated-coding-assessments/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>How hackable are automated coding assessments? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,career,hackers,software engineering"><meta name=description content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="How hackable are automated coding assessments?"><meta property="og:description" content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"><meta property="og:image" content="https://yanirseroussi.com/otter-coding-furiously.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-05-26T00:03:00+00:00"><meta property="article:modified_time" content="2023-05-26T13:08:24+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/otter-coding-furiously.jpg"><meta name=twitter:title content="How hackable are automated coding assessments?"><meta name=twitter:description content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"How hackable are automated coding assessments?","item":"https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"How hackable are automated coding assessments?","name":"How hackable are automated coding assessments?","description":"Exploring the hackability of speed-based coding tests, using CodeSignal\u0026rsquo;s Industry Coding Framework as a case study.","keywords":["artificial intelligence","career","hackers","software engineering"],"articleBody":"In the essay The Lesson to Unlearn, Paul Graham makes the claim that students are trained to win by hacking bad tests. That is, to get good grades, one has to avoid spending too much time on material that won’t be turned into test questions. Instead, one’s focus has to be on test-specific study. 
Students are taught that actual learning is less important than maximising grades. That is the lesson to unlearn.1\nEven though the essay is a few years old, it’s been on my mind recently for two reasons. The first reason is that large language models are excelling in standardised tests: I’m impressed by this progress, but it’s also a reminder of the hackability of such tests and the need to employ critical thinking to stay ahead of the AI automation wave. The second reason is that I did a CodeSignal test myself, which led me to think more deeply on the hackability of automated and timed coding assessments. This post discusses my thoughts on the topic, using CodeSignal’s Industry Coding Framework as a case study. However, most of my observations should apply to similar tests.\nWhat are hackable tests? Hacking a test is different from cheating. Hacking entails following the test’s rules, but optimising your work to exploit its weaknesses and increase your score. It doesn’t necessarily entail changing the underlying properties that the test purports to measure. By contrast, cheating entails behaviours that are prohibited by the test’s rules, such as letting someone else do the test for you, or consulting resources that are defined as off limits.\nA test’s hackability isn’t a binary property. Hackability lies on a scale from unhackable to fully hackable, as demonstrated by the following examples and plot.\nSay we take an adult and measure their height every day around the same time, over a period of a month. We can expect the measurements to have low variance. There’s little the test taker can do to significantly increase their height without cheating. The test is a good representation of the property it aims to measure – an unhackable test.\nOn the other end of the scale, say we take the same person and ask them the same set of questions over the course of a month. Our aim is to assess their skills in a subject area such as programming. 
Given that we’re repeating the same questions, they can find the answers and try to memorise them after each attempt. Assuming they’re sufficiently motivated, we can expect their scores to increase even if they know nothing about programming. This test is highly hackable. It’s hard to say that it accurately reflects the property it purports to measure, i.e., programming skills. This is because scores are strongly influenced by motivation to succeed in the test, as well as short-term memorisation and retrieval abilities.\nAn improvement over the unchanged test is generating variations from a set of possible questions.2 While our test taker would benefit from deeper skills in the subject area, they can also improve their scores by learning to recognise patterns in test questions, managing their time well, and memorising recurring elements. Again, we can expect their scores to improve over time and fail to accurately reflect the skills we care about. This gets us into the familiar territory of standardised testing, a category that I believe CodeSignal’s Industry Coding Assessments fall under. That is, tests that are not fully hackable, but still fall short of reflecting the properties they claim to measure.\nVisualising hackable test scores as a function of time: f(t) = b + h * sqrt(t) + N(0, σ2). Starting from the same baseline b, scores increase with time t due to the hackability factor h that is multiplied by sqrt(t) (ability to improve decays with time). Each test attempt is affected by measurement noise, which comes from a normal distribution with mean zero and variance σ2. I assume that variance and hackability are positively correlated. While this function is made up and missing an upper bound, the shape of the curves should be about right. See notebook for source code. 
Confessions of a test hacker Before diving into the hackability of CodeSignal’s Industry Coding Framework, here’s a bit of background on my history as a test hacker.\nBack in the day, I got pretty good at hacking tests. I enjoyed learning, but I also enjoyed getting high grades. This goes back to primary and high school and to my undergraduate degree in computer science – I graduated summa cum laude from a well-regarded university. My undergraduate days included hacks such as spending nearly all my waking hours solving past test questions during exam periods, as well as avoiding electives that had a reputation for being excessively time-consuming.\nSimilar test hacking skills were useful when interviewing with big tech companies. Early in my career, I worked with Intel, Qualcomm, and Google, and successfully interviewed with a few other tech companies. On a conceptual level, tech company tests weren’t that different from university tests, except that they were mostly oral (the dreaded whiteboard coding test), and could cover a wider breadth of topics. But even in 2005-2010, many questions leaked online, so I could follow the tried-and-tested hack of preparing by solving old test questions.\nWhile I can do well in standardised timed tests, I never liked them. Despite being hackable, they are stressful, and maximising one’s score requires adequate preparation that is different from learning deeply about the subject matter. Perhaps the most absurd example of this was when I had to take an IELTS exam (a standardised English test) for the second time after completing my PhD, as part of my Australian permanent residency application.3 This was four years after taking the IELTS exam for the first time (in Israel). I spent the intervening years in Australia, authored peer-reviewed papers and a thesis, and gave multiple conference talks. 
There’s no doubt that my English skills improved over those years, and yet my second IELTS scores were lower.\nWhy were my second IELTS scores lower? Partly because I didn’t prepare for the speaking part of the exam, so I didn’t have much to say when the examiner asked me about my favourite colours and the favourite colours of my friends (yes, for real). I ended up paying the fee to contest the result, and it got bumped up to be closer to my pre-PhD scores. Still, this serves as a salient example of a hackable test. You can improve your IELTS score by getting better at doing IELTS exams, and without any change to your underlying English skills.\nOnce I became an Australian permanent resident, I had to do a driving test to convert my Israeli licence. This was also silly, as I was legally allowed to drive in Australia while I was on a student visa. Those years of driving weren’t enough to automatically convert my licence to the Australian system, so I was subjected to the driving test. While a bit stressful, it wasn’t too bad because driving tests are a close simulation of the skill they aim to measure – driving on streets and highways. As such, they’re not too hackable, though I was careful to signal my intent to the tester in a way that’s somewhat unnatural (e.g., braking and indicating earlier than necessary to avoid getting penalised). I had no issues passing the test.\nFortunately, I managed to avoid convoluted tests in the decade or so since that second IELTS exam. For job applications, I’ve mostly had my skills assessed through custom take-home assignments and paid trial work, e.g., in my long application process with Automattic and my last position with Orkestra, which started as a short-term contract. 
Those evaluations were less hackable than the whiteboard engineering questions of my early career, and therefore felt like a better reflection of the skills they were assessing.\nOn the hackability of CodeSignal’s Industry Coding Framework Last week, I went through CodeSignal’s Industry Coding Assessment as part of a job application. While I agreed not to share the content of the assessment, there’s plenty I can discuss based on public information from CodeSignal’s website.\nThe whole experience felt like an unpleasant throwback to my old test hacking days in the noughties, but with a shinier user interface. While I’m rusty at standardised code tests, I did what any good test hacker would do: I started my preparation by searching for “Industry Coding Framework” on the web and on Blind, and reading through CodeSignal’s blog and resources. My initial search didn’t yield any unusual hacks, so I followed CodeSignal’s advice and did some of their practice questions. These turned out to be similar to the sort of questions I solved on whiteboards back in the day, except that these days, solutions are automatically scored in a web-based IDE.\nGetting familiar with CodeSignal’s environment and refreshing my speed-solving abilities was definitely helpful when I took the real assessment, and that is a prime indicator of hackability. CodeSignal states that their Industry Coding Framework is designed to evaluate the programming skills of mid-to-senior engineers. These are skills that accrue over years and decades, much like English language skills. The ideal test for such skills shouldn’t be hackable, i.e., scores should be unaffected by repetition of similar tests over a short period. However, on the morning of the test I discovered that CodeSignal’s Industry Coding Assessments are hackable by design.\nWhat I discovered was in a technical brief I initially overlooked, titled Industry Coding Skills Evaluation Framework (a longer version is stored in the Internet Archive). 
In the brief, they give the following breakdown of questions in their Industry Coding Assessments:\nLevel Expected time in minutes 1 10-15 2 20-30 3 30-60 4 30-60 Adding up the time ranges gives us an estimate of 90-165 minutes to complete the assessment. But the time they give candidates to complete the test is… 90 minutes! In their own words:\nThe maximum allowed completion time for the assessment is 90 minutes; however, candidates are not necessarily expected to complete all tasks within this time limit. While longer assessments allow more accurate measurement of candidate skills, the willingness to complete assessments decreases dramatically for tests longer than 2 hours. Moreover, a major factor in assessing candidates’ skill levels is to see how far they can progress within the given time frame.\nIt makes sense that candidates don’t want to spend too much time on artificial tests. But a better approach would be to design a test that can be completed within the allotted time by skilled candidates who don’t engage in test hacking. Alternatively, they could allow more time and penalise candidates for going over the minimum of 90 minutes. This would make it easier to tell the difference between people who are slightly slower than the cut-off and those who are significantly slower. Implementation speed does matter in the real world, but it’s rarely measured on the order of minutes.\nAs it stands, my opinion is that making speed a key factor in test success makes it hackable because test-specific practice can lead to dramatically better results. CodeSignal’s decision to emphasise speed turns the test into a game like Speedcubing, and a game can be defined as the overcoming of unnecessary obstacles. Gamification may be in vogue, but I believe it’s better to keep it out of the job application process.\nFurther evidence for hackability comes from the fact that CodeSignal limits test attempts over varying time windows. 
If CodeSignal’s assessments were more like measuring one’s height or basic driving skills, this wouldn’t be needed. Further, this somewhat favours people who are in better assessment shape, e.g., because they’re applying to many jobs and are highly motivated to get them. Sadly, I found a thread on CodeSignal’s General Coding Assessment that says that the same CodeSignal results can be used by multiple companies, which means that people get locked out of opportunities for the time window that’s determined by CodeSignal. Anecdotally, while researching this post, I also discovered that many people dislike CodeSignal and have made similar observations to mine about the validity of their evaluations. Further, when it comes to General Coding Assessments, one can find many tips on test hacking (e.g., on Reddit and GitHub).\nAnother key issue is that the effective time given for the test isn’t 90 minutes. It’s typically two weeks from the time of notification, where one can’t see the test, plus 90 minutes to do the test. The two weeks can be used for extensive test hacking, depending on the test taker’s available time and motivation.\nAs both my available time and motivation were lacking, I didn’t use the full two weeks. I quickly lost interest in solving the same kind of questions I solved around 2005. I also suspected that some of the practice questions provided by CodeSignal had little relevance to the Industry Coding Framework. In addition, I read on Glassdoor and Blind that the company that asked me to take the test had ghosted some candidates after they had passed it, so I figured that maximising my test preparation time wasn’t worth it. With more than a week left before the deadline, I decided to take the test and move on.\nBeyond hackability: Other issues with CodeSignal and automated assessments To my surprise, when I clicked the Take Test button, I was given an option to do a demo test. Hiding the demo behind that button feels a bit unfair. 
I assume that candidates would click the button when they’re ready to take the test, not when they want to do further preparation. But I finished the demo test in 15 out of the allotted 60 minutes, so I felt good enough about it and moved on to the real thing.\nUnfortunately, I ran out of time on the real test and scored 800 / 1000. According to the distribution in the archived version of CodeSignal’s Industry Coding Framework brief, this would have put me at the top 5% of test takers. But I’m not pleased with the result. The code I wrote was horrible and followed practices I’d never follow if I wasn’t trying to optimise for speed. There were also technical issues with the platform that got in my way: The IDE refreshed multiple times and claimed that I had lost connection, and having to use their IDE rather than a notebook environment is also a bit of a pain given the strict time constraints.\nIt’s likely I could have scored higher if I had maximised my test hacking efforts. Spending another week on preparation would have probably made a difference given that the hackability of the test is similar to that of an IELTS exam: Getting from zero to a perfect score is probably impossible over a short time span, but it is possible to nudge the score up by optimising the test-taking strategy and refreshing one’s bag of tricks (the sort of tricks that you don’t have to worry about retrieving quickly from memory under normal circumstances). For example, one relevant preparation step I could have followed was to attempt the sample questions from the Industry Coding Framework brief. I could have even taken it further and used ChatGPT (or another chatbot) to generate variations on the same theme. But as noted, I didn’t feel like it was worth maximising my hacking efforts given the circumstances.\nRegardless of hackability, I believe that the test fails to capture many of the skills it purports to measure. 
Specifically:\nNo points are given for good design without code that passes the automated tests. This is unlike more manual testing with a human assessor, where partial credit is given for having good ideas but running out of time. Not having to write any tests encourages lazy coding. Normally I’d think through edge cases, but optimising for the test score means that the only edge cases that matter are those that get caught by automated tests. It’s easier to deal with such issues if they get caught rather than spend precious time thinking about them. In real work, you need to spend time testing your code, which often requires more thinking than implementing the core logic. While CodeSignal claims that they test refactoring skills, the test design doesn’t even offer a caricature of real refactoring. In reality, new requirements are added over the course of days, weeks, months, and years – not minutes. And you need to refactor legacy code that runs in production and was written by many people of varying levels of proficiency and time pressures. This is nothing like tweaking throwaway code that you’ve written minutes ago. Putting a high emphasis on implementation speed when aiming to test mid-to-senior developers disadvantages those who have gotten into the habit of avoiding software engineering classic mistakes such as shortchanged quality assurance and code-like-hell programming.4 As noted by Martin Fowler, ignoring internal quality increases the pace of feature delivery early in a project’s life, but slows it down in the longer term (within weeks). Setting a 90-minute time limit on a test that’s supposed to take a minimum of 90 minutes may filter out experienced engineers who have developed good habits and didn’t bother unlearning them for test hacking purposes. This is an instance of the McNamara fallacy – time is easy to measure, but deep skills and good habits aren’t. 
Unfortunately, CodeSignal is heavily biased towards that which is easy to measure, but rather than admitting these flaws, they make unsupported claims about the effectiveness of their measurement approach (just read the archived version of the brief for a bit of a laugh). Hacking timed tests can be at odds with habits that are needed to develop high-quality feature-rich software. Source: Martin Fowler’s Is High Quality Software Worth the Cost? Closing thoughts: Partial hackability doesn’t imply complete uselessness Hackability is a non-binary measurement. Even hackable tests can be reflective of the properties they’re supposed to measure. As CodeSignal says in their marketing materials, they offer a cost-effective approach to filtering out candidates, at least when compared to manual in-house recruitment. From a hiring perspective, cheap filters are valuable when a company is flooded with qualified candidates, even if such filters have a high false negative rate. The goal is achieved as long as the filter also decreases the false positive rate. Favouring test hackers is a small price to pay for an initial filter – even if you get candidates to optimise for the wrong metrics, this can be corrected with more thoughtful testing down the track. However, turning the application process into a series of games risks alienating some candidates, who won’t bother applying even if they can do the job well.\nAmong other factors, test scores are a function of the test taker’s skills, test design/hackability, and the test taker’s preparation for the specific test.5 I believe that take-home coding assessments and real-work simulations offer a better candidate experience and provide a better signal to companies than artificial time-limited tests like CodeSignal’s Industry Coding Assessments. 
This is supported by statements from CodeSignal: The brief discussed above explicitly says that “longer assessments allow more accurate measurement of candidate skills”, and they found in their 2023 survey that candidates prefer take-home coding challenges to CodeSignal assessments.6\nMy hope is that this post would help future users of automated coding assessments in general, and CodeSignal’s Industry Coding Framework in particular. Perhaps it’d also nudge CodeSignal to improve their platform. They can do better. I won’t be holding my breath, though – standardised assessments like CodeSignal and IELTS are a part of a massive industry. There’s little incentive for incumbents to change their ways, but it is possible that large language models excelling in test hacking would force their hand.\nSome comments from a Blind thread on coding assessments. Seeing it all as a somewhat-useful game is probably the way to go. Note: I reached out to CodeSignal for a comment on this post, but haven’t heard back after more than a week.\nAs with many Paul Graham essays, I find myself in agreement with some of his ideas and disagreement with others. But hackable tests are definitely a thing, e.g., see teaching to the test and Campbell’s law. ↩︎\nTaking a machine learning analogy, asking the same questions repeatedly is likely to lead to overfitting. Drawing new questions from the same distribution is akin to adding a validation set, while dealing with the sort of problems encountered outside standardised tests is indicative of the generalisation error of the test taker. ↩︎\nIt says a lot about the hackability of higher education that the Australian government requires a PhD graduate from a top Australian university to prove that their English skills haven’t deteriorated after four years in Australia. Similarly, companies that look at educational pedigree but put recent graduates through their own set of tests implicitly distrust the grades given by universities. 
↩︎\nThe only time you’re likely to face ridiculous time pressures that are measured in minutes is when something breaks in production. Production issues can be minimised through investment in solid processes and quality over a project’s lifetime. That is, you go slow to go fast and avoid fire-fighting. Take-home exams and real-work simulations are more reflective of the sort of thinking that’s required from senior engineers because good ideas often manifest when you take the time to design a system and avoid jumping into code-like-hell mode. Going with the first thing that comes to mind is a habit that’s better left to chatbots. ↩︎\nPreparation is partly a function of motivation to pass the test, which is a positive indicator despite being unrelated to possessing the skills the test purports to measure. In my case, motivation to maximise the score was lacking, so the company got useful information out of my imperfect score. Why was my motivation lacking? Because the role seemed interesting enough to apply to, but not worth working too hard to get. The opportunity cost of neglecting my other endeavours in favour of test hacking seemed too high. ↩︎\nSee page 11 of the linked survey. Like other materials from CodeSignal, it’s somewhat comical. They state that “candidates view CodeSignal assessments more favorably than timed coding assessments in general (p = 0.034)”, but looking at the table, the mean score given to CodeSignal assessments is 3.41 / 5, while general timed coding assessments were given a mean score of 3.37. That is, a difference of 0.04 – it’s hard to call this practically significant, despite the p-value. Could it be that CodeSignal’s IO psychologists missed the many memos on p-value pitfalls, such as the one by the American Statistical Association? 
In any case, if they consider the 0.04 difference to be notable, why do they say nothing about the 0.06 difference in favour of take-home coding assignments or the 0.17 difference in favour of coding interviews? Personally, I’d also report the full distribution rather than just the means. It’s easy enough to visualise a five-point scale. ↩︎\n","wordCount":"3832","inLanguage":"en","image":"https://yanirseroussi.com/otter-coding-furiously.jpg","datePublished":"2023-05-26T00:03:00Z","dateModified":"2023-05-26T13:08:24+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">How hackable are automated coding assessments?</h1><div class=post-meta><span title='2023-05-26 00:03:00 +0000 UTC'>May 26, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-05-26-how-hackable-are-automated-coding-assessments/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg 1023w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg alt="Bing's interpretation of _an otter coding furiously in an 
attempt to pass a coding test_" width=1023 height=914><p>Bing&rsquo;s interpretation of <em>an otter coding furiously in an attempt to pass a coding test</em></p></figure><div class=post-content><p>In the essay <a href=http://www.paulgraham.com/lesson.html target=_blank rel=noopener>The Lesson to Unlearn</a>, Paul Graham makes the claim that students are trained to win by hacking bad tests. That is, to get good grades, one has to avoid spending too much time on material that won&rsquo;t be turned into test questions. Instead, one&rsquo;s focus has to be on test-specific study. Students are taught that actual learning is less important than maximising grades. That is the lesson to unlearn.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup></p><p>Even though the essay is a few years old, it&rsquo;s been on my mind recently for two reasons. The first reason is that <a href=https://openai.com/product/gpt-4 target=_blank rel=noopener>large language models are excelling in standardised tests</a>: <a href=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/>I&rsquo;m impressed by this progress</a>, but it&rsquo;s also a reminder of the hackability of such tests and <a href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/>the need to employ critical thinking to stay ahead of the AI automation wave</a>. The second reason is that I did a <a href=https://codesignal.com/ target=_blank rel=noopener>CodeSignal</a> test myself, which led me to think more deeply on the hackability of automated and timed coding assessments. This post discusses my thoughts on the topic, using CodeSignal&rsquo;s Industry Coding Framework as a case study. However, most of my observations should apply to similar tests.</p><h2 id=what-are-hackable-tests>What are hackable tests?<a hidden class=anchor aria-hidden=true href=#what-are-hackable-tests>#</a></h2><p>Hacking a test is different from cheating. 
Hacking entails following the test&rsquo;s rules, but optimising your work to exploit its weaknesses and increase your score. It doesn&rsquo;t necessarily entail changing the underlying properties that the test purports to measure. By contrast, cheating entails behaviours that are prohibited by the test&rsquo;s rules, such as letting someone else do the test for you, or consulting resources that are defined as off limits.</p><p>A test&rsquo;s hackability isn&rsquo;t a binary property. Hackability lies on a scale from unhackable to fully hackable, as demonstrated by the following examples and plot.</p><p>Say we take an adult and measure their height every day around the same time, over a period of a month. We can expect the measurements to have low variance. There&rsquo;s little the test taker can do to significantly increase their height without cheating. The test is a good representation of the property it aims to measure – <strong>an unhackable test.</strong></p><p>On the other end of the scale, say we take the same person and ask them the same set of questions over the course of a month. Our aim is to assess their skills in a subject area such as programming. Given that we&rsquo;re repeating the same questions, they can find the answers and try to memorise them after each attempt. Assuming they&rsquo;re sufficiently motivated, we can expect their scores to increase even if they know nothing about programming. <strong>This test is highly hackable.</strong> It&rsquo;s hard to say that it accurately reflects the property it purports to measure, i.e., programming skills. 
This is because scores are strongly influenced by motivation to succeed in the test, as well as short-term memorisation and retrieval abilities.</p><p>An improvement over the unchanged test is generating variations from a set of possible questions.<sup id=fnref:2><a href=#fn:2 class=footnote-ref role=doc-noteref>2</a></sup> While our test taker would benefit from deeper skills in the subject area, they can also improve their scores by learning to recognise patterns in test questions, managing their time well, and memorising recurring elements. Again, we can expect their scores to improve over time and fail to accurately reflect the skills we care about. This gets us into the familiar territory of standardised testing, a category that I believe <a href=https://codesignal.com/resource/industry-coding-data-sheet/ target=_blank rel=noopener>CodeSignal&rsquo;s Industry Coding Assessments</a> fall under. That is, tests that are not fully hackable, but still fall short of reflecting the properties they claim to measure.</p><figure><a href=test-hackability-plot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,career,hackers,software engineering"><meta name=description content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="How hackable are automated coding assessments?"><meta property="og:description" content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><meta 
property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"><meta property="og:image" content="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-05-26T00:03:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg"><meta name=twitter:title content="How hackable are automated coding assessments?"><meta name=twitter:description content="Exploring the hackability of speed-based coding tests, using CodeSignal&rsquo;s Industry Coding Framework as a case study."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"How hackable are automated coding assessments?","item":"https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"How hackable are automated coding assessments?","name":"How hackable are automated coding assessments?","description":"Exploring the hackability of speed-based coding tests, using CodeSignal’s Industry Coding Framework as a case study.","keywords":["artificial intelligence","career","hackers","software engineering"],"articleBody":"In the essay The Lesson to Unlearn, Paul Graham makes the claim that students are trained to win by hacking bad tests. 
That is, to get good grades, one has to avoid spending too much time on material that won’t be turned into test questions. Instead, one’s focus has to be on test-specific study. Students are taught that actual learning is less important than maximising grades. That is the lesson to unlearn.1\nEven though the essay is a few years old, it’s been on my mind recently for two reasons. The first reason is that large language models are excelling in standardised tests: I’m impressed by this progress, but it’s also a reminder of the hackability of such tests and the need to employ critical thinking to stay ahead of the AI automation wave. The second reason is that I did a CodeSignal test myself, which led me to think more deeply on the hackability of automated and timed coding assessments. This post discusses my thoughts on the topic, using CodeSignal’s Industry Coding Framework as a case study. However, most of my observations should apply to similar tests.\nWhat are hackable tests? Hacking a test is different from cheating. Hacking entails following the test’s rules, but optimising your work to exploit its weaknesses and increase your score. It doesn’t necessarily entail changing the underlying properties that the test purports to measure. By contrast, cheating entails behaviours that are prohibited by the test’s rules, such as letting someone else do the test for you, or consulting resources that are defined as off limits.\nA test’s hackability isn’t a binary property. Hackability lies on a scale from unhackable to fully hackable, as demonstrated by the following examples and plot.\nSay we take an adult and measure their height every day around the same time, over a period of a month. We can expect the measurements to have low variance. There’s little the test taker can do to significantly increase their height without cheating. 
The test is a good representation of the property it aims to measure – an unhackable test.\nOn the other end of the scale, say we take the same person and ask them the same set of questions over the course of a month. Our aim is to assess their skills in a subject area such as programming. Given that we’re repeating the same questions, they can find the answers and try to memorise them after each attempt. Assuming they’re sufficiently motivated, we can expect their scores to increase even if they know nothing about programming. This test is highly hackable. It’s hard to say that it accurately reflects the property it purports to measure, i.e., programming skills. This is because scores are strongly influenced by motivation to succeed in the test, as well as short-term memorisation and retrieval abilities.\nAn improvement over the unchanged test is generating variations from a set of possible questions.2 While our test taker would benefit from deeper skills in the subject area, they can also improve their scores by learning to recognise patterns in test questions, managing their time well, and memorising recurring elements. Again, we can expect their scores to improve over time and fail to accurately reflect the skills we care about. This gets us into the familiar territory of standardised testing, a category that I believe CodeSignal’s Industry Coding Assessments fall under. That is, tests that are not fully hackable, but still fall short of reflecting the properties they claim to measure.\nVisualising hackable test scores as a function of time: f(t) = b + h * sqrt(t) + N(0, σ²). Starting from the same baseline b, scores increase with time t due to the hackability factor h that is multiplied by sqrt(t) (ability to improve decays with time). Each test attempt is affected by measurement noise, which comes from a normal distribution with mean zero and variance σ². I assume that variance and hackability are positively correlated. 
While this function is made up and missing an upper bound, the shape of the curves should be about right. See notebook for source code. Confessions of a test hacker Before diving into the hackability of CodeSignal’s Industry Coding Framework, here’s a bit of background on my history as a test hacker.\nBack in the day, I got pretty good at hacking tests. I enjoyed learning, but I also enjoyed getting high grades. This goes back to primary and high school and to my undergraduate degree in computer science – I graduated summa cum laude from a well-regarded university. My undergraduate days included hacks such as spending nearly all my waking hours solving past test questions during exam periods, as well as avoiding electives that had a reputation for being excessively time-consuming.\nSimilar test hacking skills were useful when interviewing with big tech companies. Early in my career, I worked with Intel, Qualcomm, and Google, and successfully interviewed with a few other tech companies. On a conceptual level, tech company tests weren’t that different from university tests, except that they were mostly oral (the dreaded whiteboard coding test), and could cover a wider breadth of topics. But even in 2005-2010, many questions leaked online, so I could follow the tried-and-tested hack of preparing by solving old test questions.\nWhile I can do well in standardised timed tests, I never liked them. Despite being hackable, they are stressful, and maximising one’s score requires adequate preparation that is different from learning deeply about the subject matter. Perhaps the most absurd example of this was when I had to take an IELTS exam (a standardised English test) for the second time after completing my PhD, as part of my Australian permanent residency application.3 This was four years after taking the IELTS exam for the first time (in Israel). I spent the intervening years in Australia, authored peer-reviewed papers and a thesis, and gave multiple conference talks. 
There’s no doubt that my English skills improved over those years, and yet my second IELTS scores were lower.\nWhy were my second IELTS scores lower? Partly because I didn’t prepare for the speaking part of the exam, so I didn’t have much to say when the examiner asked me about my favourite colours and the favourite colours of my friends (yes, for real). I ended up paying the fee to contest the result, and it got bumped up to be closer to my pre-PhD scores. Still, this serves as a salient example of a hackable test. You can improve your IELTS score by getting better at doing IELTS exams, and without any change to your underlying English skills.\nOnce I became an Australian permanent resident, I had to do a driving test to convert my Israeli licence. This was also silly, as I was legally allowed to drive in Australia while I was on a student visa. Those years of driving weren’t enough to automatically convert my licence to the Australian system, so I was subjected to the driving test. While a bit stressful, it wasn’t too bad because driving tests are a close simulation of the skill they aim to measure – driving on streets and highways. As such, they’re not too hackable, though I was careful to signal my intent to the tester in a way that’s somewhat unnatural (e.g., braking and indicating earlier than necessary to avoid getting penalised). I had no issues passing the test.\nFortunately, I managed to avoid convoluted tests in the decade or so since that second IELTS exam. For job applications, I’ve mostly had my skills assessed through custom take-home assignments and paid trial work, e.g., in my long application process with Automattic and my last position with Orkestra, which started as a short-term contract. 
Those evaluations were less hackable than the whiteboard engineering questions of my early career, and therefore felt like a better reflection of the skills they were assessing.\nOn the hackability of CodeSignal’s Industry Coding Framework Last week, I went through CodeSignal’s Industry Coding Assessment as part of a job application. While I agreed not to share the content of the assessment, there’s plenty I can discuss based on public information from CodeSignal’s website.\nThe whole experience felt like an unpleasant throwback to my old test hacking days in the noughties, but with a shinier user interface. While I’m rusty at standardised code tests, I did what any good test hacker would do: I started my preparation by searching for “Industry Coding Framework” on the web and on Blind, and reading through CodeSignal’s blog and resources. My initial search didn’t yield any unusual hacks, so I followed CodeSignal’s advice and did some of their practice questions. These turned out to be similar to the sort of questions I solved on whiteboards back in the day, except that these days, solutions are automatically scored in a web-based IDE.\nGetting familiar with CodeSignal’s environment and refreshing my speed-solving abilities was definitely helpful when I took the real assessment, and that is a prime indicator of hackability. CodeSignal states that their Industry Coding Framework is designed to evaluate the programming skills of mid-to-senior engineers. These are skills that accrue over years and decades, much like English language skills. The ideal test for such skills shouldn’t be hackable, i.e., scores should be unaffected by repetition of similar tests over a short period. However, on the morning of the test I discovered that CodeSignal’s Industry Coding Assessments are hackable by design.\nWhat I discovered was in a technical brief I initially overlooked, titled Industry Coding Skills Evaluation Framework (a longer version is stored in the Internet Archive). 
In the brief, they give the following breakdown of questions in their Industry Coding Assessments:\nLevel | Expected time in minutes\n1 | 10-15\n2 | 20-30\n3 | 30-60\n4 | 30-60\nAdding up the time ranges gives us an estimate of 90-165 minutes to complete the assessment. But the time they give candidates to complete the test is… 90 minutes! In their own words:\nThe maximum allowed completion time for the assessment is 90 minutes; however, candidates are not necessarily expected to complete all tasks within this time limit. While longer assessments allow more accurate measurement of candidate skills, the willingness to complete assessments decreases dramatically for tests longer than 2 hours. Moreover, a major factor in assessing candidates’ skill levels is to see how far they can progress within the given time frame.\nIt makes sense that candidates don’t want to spend too much time on artificial tests. But a better approach would be to design a test that can be completed within the allotted time by skilled candidates who don’t engage in test hacking. Alternatively, they could allow more time and penalise candidates for going over the minimum of 90 minutes. This would make it easier to tell the difference between people who are slightly slower than the cut-off and those who are significantly slower. Implementation speed does matter in the real world, but it’s rarely measured on the order of minutes.\nAs it stands, my opinion is that making speed a key factor in test success makes it hackable because test-specific practice can lead to dramatically better results. CodeSignal’s decision to emphasise speed turns the test into a game like Speedcubing, and a game can be defined as the overcoming of unnecessary obstacles. Gamification may be in vogue, but I believe it’s better to keep it out of the job application process.\nFurther evidence for hackability comes from the fact that CodeSignal limits test attempts over varying time windows. 
If CodeSignal’s assessments were more like measuring one’s height or basic driving skills, this wouldn’t be needed. Further, this somewhat favours people who are in better assessment shape, e.g., because they’re applying to many jobs and are highly motivated to get them. Sadly, I found a thread on CodeSignal’s General Coding Assessment that says that the same CodeSignal results can be used by multiple companies, which means that people get locked out of opportunities for the time window that’s determined by CodeSignal. Anecdotally, while researching this post, I also discovered that many people dislike CodeSignal and have made similar observations to mine about the validity of their evaluations. Further, when it comes to General Coding Assessments, one can find many tips on test hacking (e.g., on Reddit and GitHub).\nAnother key issue is that the effective time given for the test isn’t 90 minutes. It’s typically two weeks from the time of notification, where one can’t see the test, plus 90 minutes to do the test. The two weeks can be used for extensive test hacking, depending on the test taker’s available time and motivation.\nAs both my available time and motivation were lacking, I didn’t use the full two weeks. I quickly lost interest in solving the same kind of questions I solved around 2005. I also suspected that some of the practice questions provided by CodeSignal had little relevance to the Industry Coding Framework. In addition, I read on Glassdoor and Blind that the company that asked me to take the test had ghosted some candidates after they had passed it, so I figured that maximising my test preparation time wasn’t worth it. With more than a week left before the deadline, I decided to take the test and move on.\nBeyond hackability: Other issues with CodeSignal and automated assessments To my surprise, when I clicked the Take Test button, I was given an option to do a demo test. Hiding the demo behind that button feels a bit unfair. 
I assume that candidates would click the button when they’re ready to take the test, not when they want to do further preparation. But I finished the demo test in 15 out of the allotted 60 minutes, so I felt good enough about it and moved on to the real thing.\nUnfortunately, I ran out of time on the real test and scored 800 / 1000. According to the distribution in the archived version of CodeSignal’s Industry Coding Framework brief, this would have put me at the top 5% of test takers. But I’m not pleased with the result. The code I wrote was horrible and followed practices I’d never follow if I wasn’t trying to optimise for speed. There were also technical issues with the platform that got in my way: The IDE refreshed multiple times and claimed that I had lost connection, and having to use their IDE rather than a notebook environment is also a bit of a pain given the strict time constraints.\nIt’s likely I could have scored higher if I had maximised my test hacking efforts. Spending another week on preparation would have probably made a difference given that the hackability of the test is similar to that of an IELTS exam: Getting from zero to a perfect score is probably impossible over a short time span, but it is possible to nudge the score up by optimising the test-taking strategy and refreshing one’s bag of tricks (the sort of tricks that you don’t have to worry about retrieving quickly from memory under normal circumstances). For example, one relevant preparation step I could have followed was to attempt the sample questions from the Industry Coding Framework brief. I could have even taken it further and used ChatGPT (or another chatbot) to generate variations on the same theme. But as noted, I didn’t feel like it was worth maximising my hacking efforts given the circumstances.\nRegardless of hackability, I believe that the test fails to capture many of the skills it purports to measure. 
Specifically:\nNo points are given for good design without code that passes the automated tests. This is unlike more manual testing with a human assessor, where partial credit is given for having good ideas but running out of time. Not having to write any tests encourages lazy coding. Normally I’d think through edge cases, but optimising for the test score means that the only edge cases that matter are those that get caught by automated tests. It’s easier to deal with such issues if they get caught rather than spend precious time thinking about them. In real work, you need to spend time testing your code, which often requires more thinking than implementing the core logic. While CodeSignal claims that they test refactoring skills, the test design doesn’t even offer a caricature of real refactoring. In reality, new requirements are added over the course of days, weeks, months, and years – not minutes. And you need to refactor legacy code that runs in production and was written by many people of varying levels of proficiency and time pressures. This is nothing like tweaking throwaway code that you’ve written minutes ago. Putting a high emphasis on implementation speed when aiming to test mid-to-senior developers disadvantages those who have gotten into the habit of avoiding software engineering classic mistakes such as shortchanged quality assurance and code-like-hell programming.4 As noted by Martin Fowler, ignoring internal quality increases the pace of feature delivery early in a project’s life, but slows it down in the longer term (within weeks). Setting a 90-minute time limit on a test that’s supposed to take a minimum of 90 minutes may filter out experienced engineers who have developed good habits and didn’t bother unlearning them for test hacking purposes. This is an instance of the McNamara fallacy – time is easy to measure, but deep skills and good habits aren’t. 
Unfortunately, CodeSignal is heavily biased towards that which is easy to measure, but rather than admitting these flaws, they make unsupported claims about the effectiveness of their measurement approach (just read the archived version of the brief for a bit of a laugh). Hacking timed tests can be at odds with habits that are needed to develop high-quality feature-rich software. Source: Martin Fowler’s Is High Quality Software Worth the Cost? Closing thoughts: Partial hackability doesn’t imply complete uselessness Hackability is a non-binary measurement. Even hackable tests can be reflective of the properties they’re supposed to measure. As CodeSignal says in their marketing materials, they offer a cost-effective approach to filtering out candidates, at least when compared to manual in-house recruitment. From a hiring perspective, cheap filters are valuable when a company is flooded with qualified candidates, even if such filters have a high false negative rate. The goal is achieved as long as the filter also decreases the false positive rate. Favouring test hackers is a small price to pay for an initial filter – even if you get candidates to optimise for the wrong metrics, this can be corrected with more thoughtful testing down the track. However, turning the application process into a series of games risks alienating some candidates, who won’t bother applying even if they can do the job well.\nAmong other factors, test scores are a function of the test taker’s skills, test design/hackability, and the test taker’s preparation for the specific test.5 I believe that take-home coding assessments and real-work simulations offer a better candidate experience and provide a better signal to companies than artificial time-limited tests like CodeSignal’s Industry Coding Assessments. 
This is supported by statements from CodeSignal: The brief discussed above explicitly says that “longer assessments allow more accurate measurement of candidate skills”, and they found in their 2023 survey that candidates prefer take-home coding challenges to CodeSignal assessments.6\nMy hope is that this post would help future users of automated coding assessments in general, and CodeSignal’s Industry Coding Framework in particular. Perhaps it’d also nudge CodeSignal to improve their platform. They can do better. I won’t be holding my breath, though – standardised assessments like CodeSignal and IELTS are a part of a massive industry. There’s little incentive for incumbents to change their ways, but it is possible that large language models excelling in test hacking would force their hand.\nSome comments from a Blind thread on coding assessments. Seeing it all as a somewhat-useful game is probably the way to go. Note: I reached out to CodeSignal for a comment on this post, but haven’t heard back after more than a week.\nAs with many Paul Graham essays, I find myself in agreement with some of his ideas and disagreement with others. But hackable tests are definitely a thing, e.g., see teaching to the test and Campbell’s law. ↩︎\nTaking a machine learning analogy, asking the same questions repeatedly is likely to lead to overfitting. Drawing new questions from the same distribution is akin to adding a validation set, while dealing with the sort of problems encountered outside standardised tests is indicative of the generalisation error of the test taker. ↩︎\nIt says a lot about the hackability of higher education that the Australian government requires a PhD graduate from a top Australian university to prove that their English skills haven’t deteriorated after four years in Australia. Similarly, companies that look at educational pedigree but put recent graduates through their own set of tests implicitly distrust the grades given by universities. 
↩︎\nThe only time you’re likely to face ridiculous time pressures that are measured in minutes is when something breaks in production. Production issues can be minimised through investment in solid processes and quality over a project’s lifetime. That is, you go slow to go fast and avoid fire-fighting. Take-home exams and real-work simulations are more reflective of the sort of thinking that’s required from senior engineers because good ideas often manifest when you take the time to design a system and avoid jumping into code-like-hell mode. Going with the first thing that comes to mind is a habit that’s better left to chatbots. ↩︎\nPreparation is partly a function of motivation to pass the test, which is a positive indicator despite being unrelated to possessing the skills the test purports to measure. In my case, motivation to maximise the score was lacking, so the company got useful information out of my imperfect score. Why was my motivation lacking? Because the role seemed interesting enough to apply to, but not worth working too hard to get. The opportunity cost of neglecting my other endeavours in favour of test hacking seemed too high. ↩︎\nSee page 11 of the linked survey. Like other materials from CodeSignal, it’s somewhat comical. They state that “candidates view CodeSignal assessments more favorably than timed coding assessments in general (p = 0.034)”, but looking at the table, the mean score given to CodeSignal assessments is 3.41 / 5, while general timed coding assessments were given a mean score of 3.37. That is, a difference of 0.04 – it’s hard to call this practically significant, despite the p-value. Could it be that CodeSignal’s IO psychologists missed the many memos on p-value pitfalls, such as the one by the American Statistical Association? 
In any case, if they consider the 0.04 difference to be notable, why do they say nothing about the 0.06 difference in favour of take-home coding assignments or the 0.17 difference in favour of coding interviews? Personally, I’d also report the full distribution rather than just the means. It’s easy enough to visualise a five-point scale. ↩︎\n","wordCount":"3832","inLanguage":"en","image":"https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg","datePublished":"2023-05-26T00:03:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" 
y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">How hackable are automated coding assessments?</h1><div class=post-meta><span title='2023-05-26 00:03:00 +0000 UTC'>May 26, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-05-26-how-hackable-are-automated-coding-assessments/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously_hu6b7664f523075193f9f11d79c1c9dcfa_195397_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg 1023w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/otter-coding-furiously.jpg 
alt="Bing's interpretation of _an otter coding furiously in an attempt to pass a coding test_" width=1023 height=914><p>Bing&rsquo;s interpretation of <em>an otter coding furiously in an attempt to pass a coding test</em></p></figure><div class=post-content><p>In the essay <a href=http://www.paulgraham.com/lesson.html target=_blank rel=noopener>The Lesson to Unlearn</a>, Paul Graham makes the claim that students are trained to win by hacking bad tests. That is, to get good grades, one has to avoid spending too much time on material that won&rsquo;t be turned into test questions. Instead, one&rsquo;s focus has to be on test-specific study. Students are taught that actual learning is less important than maximising grades. That is the lesson to unlearn.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup></p><p>Even though the essay is a few years old, it&rsquo;s been on my mind recently for two reasons. The first reason is that <a href=https://openai.com/product/gpt-4 target=_blank rel=noopener>large language models are excelling in standardised tests</a>: <a href=https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/>I&rsquo;m impressed by this progress</a>, but it&rsquo;s also a reminder of the hackability of such tests and <a href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/>the need to employ critical thinking to stay ahead of the AI automation wave</a>. The second reason is that I did a <a href=https://codesignal.com/ target=_blank rel=noopener>CodeSignal</a> test myself, which led me to think more deeply on the hackability of automated and timed coding assessments. This post discusses my thoughts on the topic, using CodeSignal&rsquo;s Industry Coding Framework as a case study. 
However, most of my observations should apply to similar tests.</p><h2 id=what-are-hackable-tests>What are hackable tests?<a hidden class=anchor aria-hidden=true href=#what-are-hackable-tests>#</a></h2><p>Hacking a test is different from cheating. Hacking entails following the test&rsquo;s rules, but optimising your work to exploit its weaknesses and increase your score. It doesn&rsquo;t necessarily entail changing the underlying properties that the test purports to measure. By contrast, cheating entails behaviours that are prohibited by the test&rsquo;s rules, such as letting someone else do the test for you, or consulting resources that are defined as off limits.</p><p>A test&rsquo;s hackability isn&rsquo;t a binary property. Hackability lies on a scale from unhackable to fully hackable, as demonstrated by the following examples and plot.</p><p>Say we take an adult and measure their height every day around the same time, over a period of a month. We can expect the measurements to have low variance. There&rsquo;s little the test taker can do to significantly increase their height without cheating. The test is a good representation of the property it aims to measure – <strong>an unhackable test.</strong></p><p>On the other end of the scale, say we take the same person and ask them the same set of questions over the course of a month. Our aim is to assess their skills in a subject area such as programming. Given that we&rsquo;re repeating the same questions, they can find the answers and try to memorise them after each attempt. Assuming they&rsquo;re sufficiently motivated, we can expect their scores to increase even if they know nothing about programming. <strong>This test is highly hackable.</strong> It&rsquo;s hard to say that it accurately reflects the property it purports to measure, i.e., programming skills. 
This is because scores are strongly influenced by motivation to succeed in the test, as well as short-term memorisation and retrieval abilities.</p><p>An improvement over the unchanged test is generating variations from a set of possible questions.<sup id=fnref:2><a href=#fn:2 class=footnote-ref role=doc-noteref>2</a></sup> While our test taker would benefit from deeper skills in the subject area, they can also improve their scores by learning to recognise patterns in test questions, managing their time well, and memorising recurring elements. Again, we can expect their scores to improve over time and fail to accurately reflect the skills we care about. This gets us into the familiar territory of standardised testing, a category that I believe <a href=https://codesignal.com/resource/industry-coding-data-sheet/ target=_blank rel=noopener>CodeSignal&rsquo;s Industry Coding Assessments</a> fall under. That is, tests that are not fully hackable, but still fall short of reflecting the properties they claim to measure.</p><figure><a href=test-hackability-plot.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/test-hackability-plot_huc6cd7ee0ede95da30b0428b769a2cee3_186486_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/test-hackability-plot_huc6cd7ee0ede95da30b0428b769a2cee3_186486_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/test-hackability-plot_huc6cd7ee0ede95da30b0428b769a2cee3_186486_720x0_resize_box_3.png 720w,
diff --git a/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html b/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html
index 4f263e9e6..72422d104 100644
--- a/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html
+++ b/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Was data science a failure mode of software engineering? | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,career,data science,software engineering"><meta name=description content="Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Was data science a failure mode of software engineering?"><meta property="og:description" content="Yes, data science projects have suffered from classic software engineering mistakes, but the 
field is maturing with the rise of new engineering roles."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"><meta property="og:image" content="https://yanirseroussi.com/data-science-software-engineering-failure.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-06-30T00:06:30+00:00"><meta property="article:modified_time" content="2023-06-30T16:33:40+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/data-science-software-engineering-failure.jpg"><meta name=twitter:title content="Was data science a failure mode of software engineering?"><meta name=twitter:description content="Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Was data science a failure mode of software engineering?","item":"https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Was data science a failure mode of software engineering?","name":"Was data science a failure mode of software engineering?","description":"Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles.","keywords":["artificial intelligence","career","data science","software engineering"],"articleBody":"The world was a different place in 2012. 
I had just finished my PhD, and I wasn’t sure what my title should be. My formal academic transcript said that I specialised in artificial intelligence, but back then it seemed silly to call myself an AI Expert. As I was interested in startups, I became the first employee of Giveable, and my title was Data Scientist. This was the year Harvard Business Review declared data scientist to be the sexiest job of the 21st century, so it suited me just fine. I got to do work I found interesting while reaping the benefits of being in an over-hyped profession.\nAs data science was a new term, I attempted to decipher its evolving meaning. In 2014, I liked the definition by Josh Wills, who saw it as the intersection of software engineering and statistics. By 2018, I came to see it as the union of many fields, with practitioners who support and drive decisions by employing descriptive analytics, predictive models, and causal inference. In 2020, I reflected on the trend of software commodities displacing interesting data science work. Now, I look back and wonder: Was data science a failure mode of software engineering? That is, did many data science projects repeat classic software engineering mistakes (especially in the early days)?\nBreaking Betteridge’s law of headlines, my answer to these questions is yes. I believe that many instances of data science projects exhibited classic software engineering mistakes, especially in the 2010s. Things appear to be getting better, though. The emergence of professions like data engineering, machine learning engineering, and analytics engineering represents a move away from getting data scientists to fail at software engineering – simply because they need to do less of it. 
But this isn’t the case everywhere, as data maturity varies across organisations.\nFailure mode examples For a data science project to exhibit a failure mode of software engineering, it needs to: (1) have working software as one of its outcomes; and (2) fail in a way that software engineering projects are known to fail.\nNot all data science projects meet my first criterion. Some projects end with a one-off report as their outcome, which is fine if that’s the project’s goal. However, many data science projects aim to deliver software systems that need to operate continuously and reliably. Quoting one of the principles behind the agile manifesto, for projects of the latter type, working software is the primary measure of progress. My sense is that such projects were driving the data science hype, e.g., a personalisation system that automatically increases revenue is both more exciting and more valuable than a one-off report.\nFor my second criterion, I’ll discuss some classic software engineering mistakes and how they manifest in data science projects. These come from a list compiled by Steve McConnell in 1996 and updated in 2008. While some mistakes have become less common, many are still repeated to this day. As Jeff Atwood noted in 2007, “classic mistakes are classic because they’re so seductive.” The updated list contains 42 mistakes, so I’ll highlight five I find especially pertinent: unrealistic expectations, heroics, research-oriented development, silver-bullet syndrome, and lack of automated source-code control.\n(M1) Unrealistic expectations. This mistake had the highest exposure index in McConnell’s 2008 report, meaning it’s both frequent and severe. I don’t have solid data on the occurrence of this mistake in data science projects, but unrealistic expectations go hand in hand with an over-hyped field. This is exemplified by the Gartner hype cycle, where technologies hit a peak of inflated expectations followed by a trough of disillusionment. 
While the general validity of the hype cycle model is questionable, I’ve experienced enough instances of unrealistic expectations and heard enough stories to believe that many data science projects have not escaped this classic mistake.\nGartner hype cycle. Source: Olga Tarkovskiy, CC BY-SA 3.0, via Wikimedia Commons. (M2) Heroics. This classic mistake is probably best exemplified by the labelling of data scientist as the sexiest job of the 21st century. Yes, it was just a Harvard Business Review article, but with almost 2,000 scholarly citations and numerous other mentions, it’s beyond doubt that it had helped paint a picture of data scientists as heroes who “understand how to fish out answers to important business questions from today’s tsunami of unstructured information”. More careful reading of the original article reveals that the authors referred to data scientist as “the hot job of the decade” (emphasis mine). Indeed, the same authors published a follow-up article in 2022 that implicitly follows Betteridge’s law of headlines: Is Data Scientist Still the Sexiest Job of the 21st Century? The 2022 article notes that “businesses now need to create and oversee diverse data science teams rather than searching for data scientist unicorns”, or in more detail:\nThe data science role is also now supplemented with a variety of other jobs. The assumption in 2012 was that data scientists could do all required tasks in a data science application — from conceptualizing the use case, to interfacing with business and technology stakeholders, to developing the algorithm and deploying it into production. Now, however, there has been a proliferation of related jobs to handle many of those tasks, including machine learning engineer, data engineer, AI specialist, analytics and AI translators, and data oriented product managers. 
LinkedIn reported some of these jobs as being more popular than data scientists in its “Jobs on the Rise” reports for 2021 and 2022 for the U.S.\nWhile I have my doubts about AI specialist ever becoming a well-defined profession, it seems like the days of “sexy” data science heroes are thankfully behind us.\n(M3) Research-oriented development. This classic mistake is possibly one of the top reasons so many data science projects had failed to make it to production. Leaning towards research was probably due to many early data scientists coming from academia (or having an academic fetish, as noted by one Reddit commenter). However, there’s a fine distinction to draw between research and experimentation. Research aims to expand the frontiers of knowledge, which is an expensive, high-risk activity that should be avoided by most organisations. By contrast, experimentation aims to uncover truths and opportunities within a limited area, e.g., optimising landing pages for a specific product. In many business domains, rigorous experimentation requires a robust software platform. Having worked on such a platform myself, I consider this type of experimentation to be a data science success story, even though some people see A/B testing and causal inference as less “sexy” than machine learning.\n(M4) Silver-bullet syndrome. McConnell’s description of this mistake should be familiar to anyone who’s been in tech for long enough:\nOn some projects, there is an over reliance on the advertised benefits of previously unused technologies, tools, or 3rd party applications and too little information about how well they would do in the current development environment. When project teams latch onto a single new methodology or new technology and expect it to solve their cost, schedule, or quality problems, they are inevitably disappointed.\nThe silver bullet may be data science, or it may be AI, large language models, big data, or blockchain. 
As long as humans are running things, it’s unlikely we’d run out of silver bullets. While many have warned that data science isn’t a silver bullet (e.g., see my early posts on data’s hierarchy of needs and avoiding premature hiring of data scientists), people still fell for it. And people still fall for the latest shiny thing. I’m not immune either, e.g., I still find ChatGPT to be transformative and believe that AI will keep changing our world. But I can’t put a timeline on global transformations, and I doubt that a single technology would solve all the world’s problems.\nWill big data ever make a resurgence as a silver bullet? (M5) Lack of automated source-code control. While McConnell’s 2008 report found this to be a low-frequency mistake, data science may have helped resuscitate it. This is due to multiple factors:\nUsing notebooks for development and experimentation. I use notebooks myself – they are popular for a reason. However, they don’t play well with source control systems without additional tooling. For example, it’s hard to collaborate on notebooks as people do on plain code – just try merging notebook changes from multiple authors for a bit of fun. Many data scientists came from fields where source control wasn’t common. This is probably decreasing now with Git being the standard source control system, but it wasn’t the case about a decade ago. Data transformations don’t always live under source control. For example, analytics flows might be buried in stored procedures or database views, or worse – copied around by analysts. Again, this is changing thanks to a growing awareness and the rise of tools like dbt, but not everyone has adopted the modern data stack yet. Related to a lack of source-code control is a lack of control over model versioning and data lineage, as it takes some experience to develop an appreciation of the need for versioning and reproducibility. 
Still, it’s easy to end up with a big mess even with an awareness of these issues and the best intentions. It’s hard to control everything.\nLearning from history while moving forward Data science is maturing. We’ve gone from a “sexy” field to people calling themselves “recovering data scientists”, saying goodbye to the field, and declaring that “there will be no data science job titles by 2029”. Anecdotally, it seems to me like data science is becoming less of a failure mode of software engineering – perhaps because data scientists are no longer expected to single-handedly ship complex software systems.\nPersonally, I still struggle with giving a concise title to what I do, just like in 2012. Data scientist has become a loaded term – some see data science as a cost centre that fails to deliver tangible results. As I’ve never stopped doing software engineering, I try to emphasise it by saying that I’m a full-stack data scientist and software engineer. This is a bit of a mouthful, and full-stack is also a loaded term, but it seems apt because I’ve shipped production code that ranges from old-school C on pre-Android phones through data pipelines to web applications. My main concern these days is putting my skills to good use, especially within climate tech and related areas. 
But as I’m freelancing, I went with Data \u0026 AI Consultant as my LinkedIn job title – maybe my PhD specialisation in AI wasn’t so silly after all…?\n","wordCount":"1758","inLanguage":"en","image":"https://yanirseroussi.com/data-science-software-engineering-failure.jpg","datePublished":"2023-06-30T00:06:30Z","dateModified":"2023-06-30T16:33:40+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" 
x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Was data science a failure mode of software engineering?</h1><div class=post-meta><span title='2023-06-30 00:06:30 +0000 UTC'>June 30, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-06-30-was-data-science-a-failure-mode-of-software-engineering/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg alt="Not sure what's going on here, but it came from an odd 
conversation on the topic with Bing. Seems apt." width=1024 height=678><p>Not sure what&rsquo;s going on here, but it came from an odd conversation on the topic with Bing. Seems apt.</p></figure><div class=post-content><p>The world was a different place in 2012. I had just finished <a href=https://yanirseroussi.com/phd-work/>my PhD</a>, and I wasn&rsquo;t sure what my title should be. My formal academic transcript said that I specialised in artificial intelligence, but back then it seemed silly to call myself an AI Expert. As I was interested in startups, I became the first employee of <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>Giveable</a>, and my title was Data Scientist. This was the year <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>Harvard Business Review declared data scientist to be the sexiest job of the 21st century</a>, so it suited me just fine. I got to do work I found interesting while reaping the benefits of being in an over-hyped profession.</p><p>As data science was a new term, I attempted to decipher its evolving meaning. In 2014, <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>I liked the definition by Josh Wills</a>, who saw it as the intersection of software engineering and statistics. By 2018, I came to see it as <a href=https://data.blog/2018/03/20/engineering-data-science-at-automattic/ target=_blank rel=noopener>the union of many fields</a>, with <a href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/>practitioners who support and drive decisions by employing descriptive analytics, predictive models, and causal inference</a>. In 2020, I <a href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/>reflected on the trend of software commodities displacing interesting data science work</a>. 
Now, I look back and wonder: Was data science a failure mode of software engineering? That is, did many data science projects repeat classic software engineering mistakes (especially in the early days)?</p><p>Breaking <a href=https://en.wikipedia.org/wiki/Betteridge%27s_law_of_headlines target=_blank rel=noopener>Betteridge&rsquo;s law of headlines</a>, my answer to these questions is <em>yes</em>. I believe that many instances of data science projects exhibited classic software engineering mistakes, especially in the 2010s. Things appear to be getting better, though. The emergence of professions like data engineering, machine learning engineering, and analytics engineering represents a move away from getting data scientists to fail at software engineering – simply because they need to do less of it. But this isn&rsquo;t the case everywhere, as data maturity varies across organisations.</p><h2 id=failure-mode-examples>Failure mode examples<a hidden class=anchor aria-hidden=true href=#failure-mode-examples>#</a></h2><p>For a data science project to exhibit a failure mode of software engineering, it needs to: (1) have working software as one of its outcomes; and (2) fail in a way that software engineering projects are known to fail.</p><p>Not all data science projects meet my first criterion. Some projects end with a one-off report as their outcome, which is fine if that&rsquo;s the project&rsquo;s goal. However, many data science projects aim to deliver software systems that need to operate continuously and reliably. Quoting one of <a href=https://agilemanifesto.org/principles.html target=_blank rel=noopener>the principles behind the agile manifesto</a>, for projects of the latter type, <em>working software is the primary measure of progress</em>. 
My sense is that such projects were driving the data science hype, e.g., a personalisation system that automatically increases revenue is both more exciting and more valuable than a one-off report.</p><p>For my second criterion, I&rsquo;ll discuss some classic software engineering mistakes and how they manifest in data science projects. These come from <a href=https://www.construx.com/wp-content/uploads/2020/04/CxWhitePaper_ClassicMistakes.pdf target=_blank rel=noopener>a list compiled by Steve McConnell in 1996 and updated in 2008</a>. While some mistakes have become less common, many are still repeated to this day. As <a href=https://blog.codinghorror.com/escaping-from-gilligans-island/ target=_blank rel=noopener>Jeff Atwood noted in 2007</a>, <em>&ldquo;classic mistakes are classic because they&rsquo;re so seductive.&rdquo;</em> The updated list contains 42 mistakes, so I&rsquo;ll highlight five I find especially pertinent: <em>unrealistic expectations</em>, <em>heroics</em>, <em>research-oriented development</em>, <em>silver-bullet syndrome</em>, and <em>lack of automated source-code control</em>.</p><p><strong>(M1) Unrealistic expectations.</strong> This mistake had the highest exposure index in McConnell&rsquo;s 2008 report, meaning it&rsquo;s both frequent and severe. I don&rsquo;t have solid data on the occurrence of this mistake in data science projects, but unrealistic expectations go hand in hand with an over-hyped field. This is exemplified by the <a href=https://en.wikipedia.org/wiki/Gartner_hype_cycle target=_blank rel=noopener>Gartner hype cycle</a>, where technologies hit a <em>peak of inflated expectations</em> followed by a <em>trough of disillusionment</em>. 
While the general validity of the hype cycle model is questionable, I&rsquo;ve experienced enough instances of unrealistic expectations and heard enough stories to believe that many data science projects have not escaped this classic mistake.</p><figure><a href=gartner-hype-cycle.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="artificial intelligence,career,data science,software engineering"><meta name=description content="Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Was data science a failure mode of software engineering?"><meta property="og:description" content="Yes, data science projects have suffered from classic software engineering mistakes, but the 
field is maturing with the rise of new engineering roles."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"><meta property="og:image" content="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-06-30T00:06:30+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg"><meta name=twitter:title content="Was data science a failure mode of software engineering?"><meta name=twitter:description content="Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Was data science a failure mode of software engineering?","item":"https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Was data science a failure mode of software engineering?","name":"Was data science a failure mode of software engineering?","description":"Yes, data science projects have suffered from classic software engineering mistakes, but the field is maturing with the rise of new engineering roles.","keywords":["artificial intelligence","career","data science","software 
engineering"],"articleBody":"The world was a different place in 2012. I had just finished my PhD, and I wasn’t sure what my title should be. My formal academic transcript said that I specialised in artificial intelligence, but back then it seemed silly to call myself an AI Expert. As I was interested in startups, I became the first employee of Giveable, and my title was Data Scientist. This was the year Harvard Business Review declared data scientist to be the sexiest job of the 21st century, so it suited me just fine. I got to do work I found interesting while reaping the benefits of being in an over-hyped profession.\nAs data science was a new term, I attempted to decipher its evolving meaning. In 2014, I liked the definition by Josh Wills, who saw it as the intersection of software engineering and statistics. By 2018, I came to see it as the union of many fields, with practitioners who support and drive decisions by employing descriptive analytics, predictive models, and causal inference. In 2020, I reflected on the trend of software commodities displacing interesting data science work. Now, I look back and wonder: Was data science a failure mode of software engineering? That is, did many data science projects repeat classic software engineering mistakes (especially in the early days)?\nBreaking Betteridge’s law of headlines, my answer to these questions is yes. I believe that many instances of data science projects exhibited classic software engineering mistakes, especially in the 2010s. Things appear to be getting better, though. The emergence of professions like data engineering, machine learning engineering, and analytics engineering represents a move away from getting data scientists to fail at software engineering – simply because they need to do less of it. 
But this isn’t the case everywhere, as data maturity varies across organisations.\nFailure mode examples For a data science project to exhibit a failure mode of software engineering, it needs to: (1) have working software as one of its outcomes; and (2) fail in a way that software engineering projects are known to fail.\nNot all data science projects meet my first criterion. Some projects end with a one-off report as their outcome, which is fine if that’s the project’s goal. However, many data science projects aim to deliver software systems that need to operate continuously and reliably. Quoting one of the principles behind the agile manifesto, for projects of the latter type, working software is the primary measure of progress. My sense is that such projects were driving the data science hype, e.g., a personalisation system that automatically increases revenue is both more exciting and more valuable than a one-off report.\nFor my second criterion, I’ll discuss some classic software engineering mistakes and how they manifest in data science projects. These come from a list compiled by Steve McConnell in 1996 and updated in 2008. While some mistakes have become less common, many are still repeated to this day. As Jeff Atwood noted in 2007, “classic mistakes are classic because they’re so seductive.” The updated list contains 42 mistakes, so I’ll highlight five I find especially pertinent: unrealistic expectations, heroics, research-oriented development, silver-bullet syndrome, and lack of automated source-code control.\n(M1) Unrealistic expectations. This mistake had the highest exposure index in McConnell’s 2008 report, meaning it’s both frequent and severe. I don’t have solid data on the occurrence of this mistake in data science projects, but unrealistic expectations go hand in hand with an over-hyped field. This is exemplified by the Gartner hype cycle, where technologies hit a peak of inflated expectations followed by a trough of disillusionment. 
While the general validity of the hype cycle model is questionable, I’ve experienced enough instances of unrealistic expectations and heard enough stories to believe that many data science projects have not escaped this classic mistake.\nGartner hype cycle. Source: Olga Tarkovskiy, CC BY-SA 3.0, via Wikimedia Commons. (M2) Heroics. This classic mistake is probably best exemplified by the labelling of data scientist as the sexiest job of the 21st century. Yes, it was just a Harvard Business Review article, but with almost 2,000 scholarly citations and numerous other mentions, it’s beyond doubt that it had helped paint a picture of data scientists as heroes who “understand how to fish out answers to important business questions from today’s tsunami of unstructured information”. More careful reading of the original article reveals that the authors referred to data scientist as “the hot job of the decade” (emphasis mine). Indeed, the same authors published a follow-up article in 2022 that implicitly follows Betteridge’s law of headlines: Is Data Scientist Still the Sexiest Job of the 21st Century? The 2022 article notes that “businesses now need to create and oversee diverse data science teams rather than searching for data scientist unicorns”, or in more detail:\nThe data science role is also now supplemented with a variety of other jobs. The assumption in 2012 was that data scientists could do all required tasks in a data science application — from conceptualizing the use case, to interfacing with business and technology stakeholders, to developing the algorithm and deploying it into production. Now, however, there has been a proliferation of related jobs to handle many of those tasks, including machine learning engineer, data engineer, AI specialist, analytics and AI translators, and data oriented product managers. 
LinkedIn reported some of these jobs as being more popular than data scientists in its “Jobs on the Rise” reports for 2021 and 2022 for the U.S.\nWhile I have my doubts about AI specialist ever becoming a well-defined profession, it seems like the days of “sexy” data science heroes are thankfully behind us.\n(M3) Research-oriented development. This classic mistake is possibly one of the top reasons so many data science projects had failed to make it to production. Leaning towards research was probably due to many early data scientists coming from academia (or having an academic fetish, as noted by one Reddit commenter). However, there’s a fine distinction to draw between research and experimentation. Research aims to expand the frontiers of knowledge, which is an expensive, high-risk activity that should be avoided by most organisations. By contrast, experimentation aims to uncover truths and opportunities within a limited area, e.g., optimising landing pages for a specific product. In many business domains, rigorous experimentation requires a robust software platform. Having worked on such a platform myself, I consider this type of experimentation to be a data science success story, even though some people see A/B testing and causal inference as less “sexy” than machine learning.\n(M4) Silver-bullet syndrome. McConnell’s description of this mistake should be familiar to anyone who’s been in tech for long enough:\nOn some projects, there is an over reliance on the advertised benefits of previously unused technologies, tools, or 3rd party applications and too little information about how well they would do in the current development environment. When project teams latch onto a single new methodology or new technology and expect it to solve their cost, schedule, or quality problems, they are inevitably disappointed.\nThe silver bullet may be data science, or it may be AI, large language models, big data, or blockchain. 
As long as humans are running things, it’s unlikely we’d run out of silver bullets. While many have warned that data science isn’t a silver bullet (e.g., see my early posts on data’s hierarchy of needs and avoiding premature hiring of data scientists), people still fell for it. And people still fall for the latest shiny thing. I’m not immune either, e.g., I still find ChatGPT to be transformative and believe that AI will keep changing our world. But I can’t put a timeline on global transformations, and I doubt that a single technology would solve all the world’s problems.\nWill big data ever make a resurgence as a silver bullet? (M5) Lack of automated source-code control. While McConnell’s 2008 report found this to be a low-frequency mistake, data science may have helped resuscitate it. This is due to multiple factors:\nUsing notebooks for development and experimentation. I use notebooks myself – they are popular for a reason. However, they don’t play well with source control systems without additional tooling. For example, it’s hard to collaborate on notebooks as people do on plain code – just try merging notebook changes from multiple authors for a bit of fun. Many data scientists came from fields where source control wasn’t common. This is probably decreasing now with Git being the standard source control system, but it wasn’t the case about a decade ago. Data transformations don’t always live under source control. For example, analytics flows might be buried in stored procedures or database views, or worse – copied around by analysts. Again, this is changing thanks to a growing awareness and the rise of tools like dbt, but not everyone has adopted the modern data stack yet. Related to a lack of source-code control is a lack of control over model versioning and data lineage, as it takes some experience to develop an appreciation of the need for versioning and reproducibility. 
Still, it’s easy to end up with a big mess even with an awareness of these issues and the best intentions. It’s hard to control everything.\nLearning from history while moving forward Data science is maturing. We’ve gone from a “sexy” field to people calling themselves “recovering data scientists”, saying goodbye to the field, and declaring that “there will be no data science job titles by 2029”. Anecdotally, it seems to me like data science is becoming less of a failure mode of software engineering – perhaps because data scientists are no longer expected to single-handedly ship complex software systems.\nPersonally, I still struggle with giving a concise title to what I do, just like in 2012. Data scientist has become a loaded term – some see data science as a cost centre that fails to deliver tangible results. As I’ve never stopped doing software engineering, I try to emphasise it by saying that I’m a full-stack data scientist and software engineer. This is a bit of a mouthful, and full-stack is also a loaded term, but it seems apt because I’ve shipped production code that ranges from old-school C on pre-Android phones through data pipelines to web applications. My main concern these days is putting my skills to good use, especially within climate tech and related areas. 
But as I’m freelancing, I went with Data \u0026 AI Consultant as my LinkedIn job title – maybe my PhD specialisation in AI wasn’t so silly after all…?\n","wordCount":"1758","inLanguage":"en","image":"https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg","datePublished":"2023-06-30T00:06:30Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" 
y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Was data science a failure mode of software engineering?</h1><div class=post-meta><span title='2023-06-30 00:06:30 +0000 UTC'>June 30, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-06-30-was-data-science-a-failure-mode-of-software-engineering/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure_hu6b7664f523075193f9f11d79c1c9dcfa_143432_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" 
src=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/data-science-software-engineering-failure.jpg alt="Not sure what's going on here, but it came from an odd conversation on the topic with Bing. Seems apt." width=1024 height=678><p>Not sure what&rsquo;s going on here, but it came from an odd conversation on the topic with Bing. Seems apt.</p></figure><div class=post-content><p>The world was a different place in 2012. I had just finished <a href=https://yanirseroussi.com/phd-work/>my PhD</a>, and I wasn&rsquo;t sure what my title should be. My formal academic transcript said that I specialised in artificial intelligence, but back then it seemed silly to call myself an AI Expert. As I was interested in startups, I became the first employee of <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>Giveable</a>, and my title was Data Scientist. This was the year <a href=https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century target=_blank rel=noopener>Harvard Business Review declared data scientist to be the sexiest job of the 21st century</a>, so it suited me just fine. I got to do work I found interesting while reaping the benefits of being in an over-hyped profession.</p><p>As data science was a new term, I attempted to decipher its evolving meaning. In 2014, <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>I liked the definition by Josh Wills</a>, who saw it as the intersection of software engineering and statistics. By 2018, I came to see it as <a href=https://data.blog/2018/03/20/engineering-data-science-at-automattic/ target=_blank rel=noopener>the union of many fields</a>, with <a href=https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/>practitioners who support and drive decisions by employing descriptive analytics, predictive models, and causal inference</a>. 
In 2020, I <a href=https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/>reflected on the trend of software commodities displacing interesting data science work</a>. Now, I look back and wonder: Was data science a failure mode of software engineering? That is, did many data science projects repeat classic software engineering mistakes (especially in the early days)?</p><p>Breaking <a href=https://en.wikipedia.org/wiki/Betteridge%27s_law_of_headlines target=_blank rel=noopener>Betteridge&rsquo;s law of headlines</a>, my answer to these questions is <em>yes</em>. I believe that many instances of data science projects exhibited classic software engineering mistakes, especially in the 2010s. Things appear to be getting better, though. The emergence of professions like data engineering, machine learning engineering, and analytics engineering represents a move away from getting data scientists to fail at software engineering – simply because they need to do less of it. But this isn&rsquo;t the case everywhere, as data maturity varies across organisations.</p><h2 id=failure-mode-examples>Failure mode examples<a hidden class=anchor aria-hidden=true href=#failure-mode-examples>#</a></h2><p>For a data science project to exhibit a failure mode of software engineering, it needs to: (1) have working software as one of its outcomes; and (2) fail in a way that software engineering projects are known to fail.</p><p>Not all data science projects meet my first criterion. Some projects end with a one-off report as their outcome, which is fine if that&rsquo;s the project&rsquo;s goal. However, many data science projects aim to deliver software systems that need to operate continuously and reliably. Quoting one of <a href=https://agilemanifesto.org/principles.html target=_blank rel=noopener>the principles behind the agile manifesto</a>, for projects of the latter type, <em>working software is the primary measure of progress</em>. 
My sense is that such projects were driving the data science hype, e.g., a personalisation system that automatically increases revenue is both more exciting and more valuable than a one-off report.</p><p>For my second criterion, I&rsquo;ll discuss some classic software engineering mistakes and how they manifest in data science projects. These come from <a href=https://www.construx.com/wp-content/uploads/2020/04/CxWhitePaper_ClassicMistakes.pdf target=_blank rel=noopener>a list compiled by Steve McConnell in 1996 and updated in 2008</a>. While some mistakes have become less common, many are still repeated to this day. As <a href=https://blog.codinghorror.com/escaping-from-gilligans-island/ target=_blank rel=noopener>Jeff Atwood noted in 2007</a>, <em>&ldquo;classic mistakes are classic because they&rsquo;re so seductive.&rdquo;</em> The updated list contains 42 mistakes, so I&rsquo;ll highlight five I find especially pertinent: <em>unrealistic expectations</em>, <em>heroics</em>, <em>research-oriented development</em>, <em>silver-bullet syndrome</em>, and <em>lack of automated source-code control</em>.</p><p><strong>(M1) Unrealistic expectations.</strong> This mistake had the highest exposure index in McConnell&rsquo;s 2008 report, meaning it&rsquo;s both frequent and severe. I don&rsquo;t have solid data on the occurrence of this mistake in data science projects, but unrealistic expectations go hand in hand with an over-hyped field. This is exemplified by the <a href=https://en.wikipedia.org/wiki/Gartner_hype_cycle target=_blank rel=noopener>Gartner hype cycle</a>, where technologies hit a <em>peak of inflated expectations</em> followed by a <em>trough of disillusionment</em>. 
While the general validity of the hype cycle model is questionable, I&rsquo;ve experienced enough instances of unrealistic expectations and heard enough stories to believe that many data science projects have not escaped this classic mistake.</p><figure><a href=gartner-hype-cycle.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/gartner-hype-cycle_hu3069bca74bd842e837c8187d5549a703_102150_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/gartner-hype-cycle_hu3069bca74bd842e837c8187d5549a703_102150_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/gartner-hype-cycle_hu3069bca74bd842e837c8187d5549a703_102150_720x0_resize_box_3.png 720w,
diff --git a/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html b/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html
index cd26cfa6f..b3108004d 100644
--- a/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html
+++ b/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>My rediscovery of quiet writing on the open web | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="blogging,personal"><meta name=description content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My rediscovery of quiet writing on the open web"><meta property="og:description" content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"><meta property="og:image" content="https://yanirseroussi.com/rikoriko-cave.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-08-28T05:30:00+00:00"><meta property="article:modified_time" content="2023-09-23T08:52:24+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/rikoriko-cave.jpg"><meta name=twitter:title content="My rediscovery of quiet writing on the open web"><meta name=twitter:description content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My rediscovery of quiet writing on the open web","item":"https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My rediscovery of quiet writing on the open web","name":"My rediscovery of quiet writing on the open web","description":"Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes.","keywords":["blogging","personal"],"articleBody":"I published my first post on this website almost ten years ago. My motivation was modest: Publicly link to useful stuff I wrote. Recently, I tidied up my homepage and added short descriptions to old posts, which made me reflect on how this website has changed over the years: From a quiet place for sharing some tips and progress reports, through B-list data science “influencing”, and back to a quiet place. 
This post summarises some of my reflections, in no particular order.\nViews are addictive and never enough. The first time a post I published had over a thousand views, I was excited. Then some of my posts had over ten thousand views, so a mere thousand became a disappointment. I didn’t intentionally optimise for views – it happened because I wrote on popular topics, and distributed my posts through channels that worked at the time. If I had chosen to optimise for popularity, I probably would have grown dissatisfied with view counts in the tens of thousands, and then with hundreds of thousands, and then with millions. Instead, I went backwards in terms of popularity and audience size: I deleted my Twitter account a few years ago because I found the platform unpleasant, and I lost half my followers when I migrated my site from WordPress.com to Hugo two years ago (I couldn’t port non-email followers who were using the proprietary WordPress.com Reader). More importantly, I often write about topics that may be of low appeal to current followers, and don’t invest much effort in getting the word out. I don’t even bother with accurate long-term tracking of views and interactions – I only use Cloudflare Analytics to validate that the website is still working.\nMeaningful engagement is more satisfying than views and likes. I removed the Like functionality when I got my website off WordPress.com. I don’t miss it, as likes have the same addictive “never enough” qualities as views. That said, views and likes are correlated with the amount of more satisfying engagement, which comes in the form of thoughtful comments and private messages. By reducing my distribution efforts, I have also reduced the amount of meaningful engagement, but such is life. I still have other motivators.\nWriting publicly helps me think. Even with a low number of views, the fact that practically anyone in the world can read something I wrote makes me take it more seriously. 
I put more effort into making myself clear and checking references than if I were to write for myself. In addition, the process of writing often becomes a process of discovery – as I write things down and add structure to a post, my subject becomes clearer to me.\nWriting publicly creates valuable documentation. Even if no one is reading right now, posts on this website remain accessible for years. I often link to my own writing – not (always) out of vanity, but because it’s relevant in a specific context. Recently, I started experimenting with easier-to-produce posts that I share under a today I learned (TIL) section – a format I learned about from following Simon Willison. So far, my TIL section is pretty much documentation for myself, as I put no effort into telling people about specific TIL posts. We’ll see how it goes in the long run.\nPlatform independence is awesome (if you have the right skills). Getting my website off WordPress.com a couple of years ago was a bit of a pain, but I love the extra control it gives me. On a platform like WordPress.com, I would have had to pay extra to do something like give all my posts short meta-descriptions and organise them on a single page, as I did recently. The same goes for setting up the TIL section, which was a breeze with Hugo. Being able to have fine-grained control over the rendered content and its structure works well for me, but it’s not for everyone (there’s a reason why a large portion of the web uses WordPress). Still, it has never been easier and cheaper to self-host a static site like mine.\nDurable tech works well for quiet writing. Legacy technologies tend to get a bad rap. Many people prefer building with shiny new tech on shiny new platforms. Publishing on the web is no exception, as trendy ways of sharing content come and go. Twenty years ago, most of today’s social media didn’t exist. How much of it will exist in twenty years? 
Making any prediction is hard, but I’m willing to bet that twenty years from now, there will still be tools that can serve and render my website (the HTML / CSS / JS output of Hugo), as it exists right now. I wouldn’t make the same bet on nascent social media platforms or on writing-centric platforms such as Substack. This is in line with the Lindy effect, which states that “the future life expectancy of some non-perishable things, like a technology or an idea, is proportional to their current age”. It often makes perfect sense to go for new tech, as it comes with new capabilities. I prefer to be cautious, as I want to focus on what I get out of writing rather than on bouncing between platforms and tools.\nSource: Ditcherville: I love your email list! CC BY-NC 4.0 by Jonathan Stark ","wordCount":"889","inLanguage":"en","image":"https://yanirseroussi.com/rikoriko-cave.jpg","datePublished":"2023-08-28T05:30:00Z","dateModified":"2023-09-23T08:52:24+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" 
fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My rediscovery of quiet writing on the open web</h1><div class=post-meta><span title='2023-08-28 05:30:00 +0000 UTC'>August 28, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-08-28-my-rediscovery-of-quiet-writing-on-the-open-web/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_480x0_resize_q75_box.jpg 480w 
,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg alt="Rikoriko cave" width=1024 height=576></figure><div class=post-content><p>I published <a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/>my first post on this website</a> almost ten years ago. My motivation was modest: Publicly link to useful stuff I wrote. Recently, I tidied up my homepage and added short descriptions to old posts, which made me reflect on how this website has changed over the years: From a quiet place for sharing some tips and progress reports, through B-list data science &ldquo;influencing&rdquo;, and back to a quiet place. This post summarises some of my reflections, in no particular order.</p><p><strong>Views are addictive and never enough.</strong> The first time a post I published had over a thousand views, <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>I was excited</a>. Then some of my posts had over ten thousand views, so a mere thousand became a disappointment. I didn&rsquo;t intentionally optimise for views – it happened because I wrote on popular topics, and distributed my posts through channels that worked at the time. If I had chosen to optimise for popularity, I probably would have grown dissatisfied with view counts in the tens of thousands, and then with hundreds of thousands, and then with millions. 
Instead, I went backwards in terms of popularity and audience size: I deleted my Twitter account a few years ago because I found the platform unpleasant, and I lost half my followers when I <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrated my site from WordPress.com to Hugo</a> two years ago (I couldn&rsquo;t port non-email followers who were using the proprietary WordPress.com Reader). More importantly, I often write about topics that may be of low appeal to current followers, and don&rsquo;t invest much effort in getting the word out. I don&rsquo;t even bother with accurate long-term tracking of views and interactions – I only use Cloudflare Analytics to validate that the website is still working.</p><p><strong>Meaningful engagement is more satisfying than views and likes.</strong> I removed the <em>Like</em> functionality when I got my website off WordPress.com. I don&rsquo;t miss it, as likes have the same addictive &ldquo;never enough&rdquo; qualities as views. That said, views and likes are correlated with the amount of more satisfying engagement, which comes in the form of thoughtful comments and private messages. By reducing my distribution efforts, I have also reduced the amount of meaningful engagement, but such is life. I still have other motivators.</p><p><strong>Writing publicly helps me think.</strong> Even with a low number of views, the fact that practically anyone in the world can read something I wrote makes me take it more seriously. I put more effort into making myself clear and checking references than if I were to write for myself. In addition, the process of writing often becomes a process of discovery – as I write things down and add structure to a post, my subject becomes clearer to me.</p><p><strong>Writing publicly creates valuable documentation.</strong> Even if no one is reading <em>right now</em>, posts on this website remain accessible for years. 
I often link to my own writing – not (always) out of vanity, but because it&rsquo;s relevant in a specific context. Recently, I started experimenting with easier-to-produce posts that I share under <a href=https://yanirseroussi.com/til/>a <em>today I learned</em> (TIL) section</a> – a format I learned about from following <a href=https://til.simonwillison.net/ target=_blank rel=noopener>Simon Willison</a>. So far, my TIL section is pretty much documentation for myself, as I put no effort into telling people about specific TIL posts. We&rsquo;ll see how it goes in the long run.</p><p><strong>Platform independence is awesome (if you have the right skills).</strong> Getting my website off WordPress.com a couple of years ago was a bit of a pain, but I love the extra control it gives me. On a platform like WordPress.com, I would have had to pay extra to do something like give all my posts short meta-descriptions and organise them on a single page, as I did recently. The same goes for setting up the TIL section, which <a href=https://yanirseroussi.com/til/2023/07/17/making-a-til-section-with-hugo-and-papermod/>was a breeze with Hugo</a>. Being able to have fine-grained control over the rendered content and its structure works well for me, but it&rsquo;s not for everyone (there&rsquo;s a reason why a large portion of the web uses WordPress). Still, it has never been easier and cheaper to self-host a static site like mine.</p><p><strong>Durable tech works well for quiet writing.</strong> Legacy technologies tend to get a bad rap. Many people prefer building with shiny new tech on shiny new platforms. Publishing on the web is no exception, as trendy ways of sharing content come and go. Twenty years ago, most of today&rsquo;s social media didn&rsquo;t exist. How much of it will exist in twenty years? 
Making any prediction is hard, but I&rsquo;m willing to bet that twenty years from now, there will still be tools that can serve and render my website (the HTML / CSS / JS output of Hugo), as it exists right now. I wouldn&rsquo;t make the same bet on nascent social media platforms or on writing-centric platforms such as Substack. This is in line with <a href=https://en.wikipedia.org/wiki/Lindy_effect target=_blank rel=noopener>the Lindy effect</a>, which states that <em>&ldquo;the future life expectancy of some non-perishable things, like a technology or an idea, is proportional to their current age&rdquo;</em>. It often makes perfect sense to go for new tech, as it comes with new capabilities. I prefer to be cautious, as I want to focus on what I get out of writing rather than on bouncing between platforms and tools.</p><figure><a href=ditcherville-39-i-love-your-email-list.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content="blogging,personal"><meta name=description content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My rediscovery of quiet writing on the open web"><meta property="og:description" content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><meta property="og:type" content="article"><meta 
property="og:url" content="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"><meta property="og:image" content="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-08-28T05:30:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg"><meta name=twitter:title content="My rediscovery of quiet writing on the open web"><meta name=twitter:description content="Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My rediscovery of quiet writing on the open web","item":"https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My rediscovery of quiet writing on the open web","name":"My rediscovery of quiet writing on the open web","description":"Reflections on publishing on this website: Writing publicly to share thoughts and documentation beats chasing views and likes.","keywords":["blogging","personal"],"articleBody":"I published my first post on this website almost ten years ago. My motivation was modest: Publicly link to useful stuff I wrote. 
Recently, I tidied up my homepage and added short descriptions to old posts, which made me reflect on how this website has changed over the years: From a quiet place for sharing some tips and progress reports, through B-list data science “influencing”, and back to a quiet place. This post summarises some of my reflections, in no particular order.\nViews are addictive and never enough. The first time a post I published had over a thousand views, I was excited. Then some of my posts had over ten thousand views, so a mere thousand became a disappointment. I didn’t intentionally optimise for views – it happened because I wrote on popular topics, and distributed my posts through channels that worked at the time. If I had chosen to optimise for popularity, I probably would have grown dissatisfied with view counts in the tens of thousands, and then with hundreds of thousands, and then with millions. Instead, I went backwards in terms of popularity and audience size: I deleted my Twitter account a few years ago because I found the platform unpleasant, and I lost half my followers when I migrated my site from WordPress.com to Hugo two years ago (I couldn’t port non-email followers who were using the proprietary WordPress.com Reader). More importantly, I often write about topics that may be of low appeal to current followers, and don’t invest much effort in getting the word out. I don’t even bother with accurate long-term tracking of views and interactions – I only use Cloudflare Analytics to validate that the website is still working.\nMeaningful engagement is more satisfying than views and likes. I removed the Like functionality when I got my website off WordPress.com. I don’t miss it, as likes have the same addictive “never enough” qualities as views. That said, views and likes are correlated with the amount of more satisfying engagement, which comes in the form of thoughtful comments and private messages. 
By reducing my distribution efforts, I have also reduced the amount of meaningful engagement, but such is life. I still have other motivators.\nWriting publicly helps me think. Even with a low number of views, the fact that practically anyone in the world can read something I wrote makes me take it more seriously. I put more effort into making myself clear and checking references than if I were to write for myself. In addition, the process of writing often becomes a process of discovery – as I write things down and add structure to a post, my subject becomes clearer to me.\nWriting publicly creates valuable documentation. Even if no one is reading right now, posts on this website remain accessible for years. I often link to my own writing – not (always) out of vanity, but because it’s relevant in a specific context. Recently, I started experimenting with easier-to-produce posts that I share under a today I learned (TIL) section – a format I learned about from following Simon Willison. So far, my TIL section is pretty much documentation for myself, as I put no effort into telling people about specific TIL posts. We’ll see how it goes in the long run.\nPlatform independence is awesome (if you have the right skills). Getting my website off WordPress.com a couple of years ago was a bit of a pain, but I love the extra control it gives me. On a platform like WordPress.com, I would have had to pay extra to do something like give all my posts short meta-descriptions and organise them on a single page, as I did recently. The same goes for setting up the TIL section, which was a breeze with Hugo. Being able to have fine-grained control over the rendered content and its structure works well for me, but it’s not for everyone (there’s a reason why a large portion of the web uses WordPress). Still, it has never been easier and cheaper to self-host a static site like mine.\nDurable tech works well for quiet writing. Legacy technologies tend to get a bad rap. 
Many people prefer building with shiny new tech on shiny new platforms. Publishing on the web is no exception, as trendy ways of sharing content come and go. Twenty years ago, most of today’s social media didn’t exist. How much of it will exist in twenty years? Making any prediction is hard, but I’m willing to bet that twenty years from now, there will still be tools that can serve and render my website (the HTML / CSS / JS output of Hugo), as it exists right now. I wouldn’t make the same bet on nascent social media platforms or on writing-centric platforms such as Substack. This is in line with the Lindy effect, which states that “the future life expectancy of some non-perishable things, like a technology or an idea, is proportional to their current age”. It often makes perfect sense to go for new tech, as it comes with new capabilities. I prefer to be cautious, as I want to focus on what I get out of writing rather than on bouncing between platforms and tools.\nSource: Ditcherville: I love your email list! 
CC BY-NC 4.0 by Jonathan Stark ","wordCount":"889","inLanguage":"en","image":"https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg","datePublished":"2023-08-28T05:30:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" 
x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My rediscovery of quiet writing on the open web</h1><div class=post-meta><span title='2023-08-28 05:30:00 +0000 UTC'>August 28, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-08-28-my-rediscovery-of-quiet-writing-on-the-open-web/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave_hu6dc18de4dca0b72c45f8983b19275919_772988_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg 1024w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/rikoriko-cave.jpg alt="Rikoriko cave" width=1024 height=576></figure><div class=post-content><p>I published <a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/>my first post on this website</a> almost ten years ago. My motivation was modest: Publicly link to useful stuff I wrote. 
Recently, I tidied up my homepage and added short descriptions to old posts, which made me reflect on how this website has changed over the years: From a quiet place for sharing some tips and progress reports, through B-list data science &ldquo;influencing&rdquo;, and back to a quiet place. This post summarises some of my reflections, in no particular order.</p><p><strong>Views are addictive and never enough.</strong> The first time a post I published had over a thousand views, <a href=https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/>I was excited</a>. Then some of my posts had over ten thousand views, so a mere thousand became a disappointment. I didn&rsquo;t intentionally optimise for views – it happened because I wrote on popular topics, and distributed my posts through channels that worked at the time. If I had chosen to optimise for popularity, I probably would have grown dissatisfied with view counts in the tens of thousands, and then with hundreds of thousands, and then with millions. Instead, I went backwards in terms of popularity and audience size: I deleted my Twitter account a few years ago because I found the platform unpleasant, and I lost half my followers when I <a href=https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/>migrated my site from WordPress.com to Hugo</a> two years ago (I couldn&rsquo;t port non-email followers who were using the proprietary WordPress.com Reader). More importantly, I often write about topics that may be of low appeal to current followers, and don&rsquo;t invest much effort in getting the word out. I don&rsquo;t even bother with accurate long-term tracking of views and interactions – I only use Cloudflare Analytics to validate that the website is still working.</p><p><strong>Meaningful engagement is more satisfying than views and likes.</strong> I removed the <em>Like</em> functionality when I got my website off WordPress.com. 
I don&rsquo;t miss it, as likes have the same addictive &ldquo;never enough&rdquo; qualities as views. That said, views and likes are correlated with the amount of more satisfying engagement, which comes in the form of thoughtful comments and private messages. By reducing my distribution efforts, I have also reduced the amount of meaningful engagement, but such is life. I still have other motivators.</p><p><strong>Writing publicly helps me think.</strong> Even with a low number of views, the fact that practically anyone in the world can read something I wrote makes me take it more seriously. I put more effort into making myself clear and checking references than if I were to write for myself. In addition, the process of writing often becomes a process of discovery – as I write things down and add structure to a post, my subject becomes clearer to me.</p><p><strong>Writing publicly creates valuable documentation.</strong> Even if no one is reading <em>right now</em>, posts on this website remain accessible for years. I often link to my own writing – not (always) out of vanity, but because it&rsquo;s relevant in a specific context. Recently, I started experimenting with easier-to-produce posts that I share under <a href=https://yanirseroussi.com/til/>a <em>today I learned</em> (TIL) section</a> – a format I learned about from following <a href=https://til.simonwillison.net/ target=_blank rel=noopener>Simon Willison</a>. So far, my TIL section is pretty much documentation for myself, as I put no effort into telling people about specific TIL posts. We&rsquo;ll see how it goes in the long run.</p><p><strong>Platform independence is awesome (if you have the right skills).</strong> Getting my website off WordPress.com a couple of years ago was a bit of a pain, but I love the extra control it gives me. 
On a platform like WordPress.com, I would have had to pay extra to do something like give all my posts short meta-descriptions and organise them on a single page, as I did recently. The same goes for setting up the TIL section, which <a href=https://yanirseroussi.com/til/2023/07/17/making-a-til-section-with-hugo-and-papermod/>was a breeze with Hugo</a>. Being able to have fine-grained control over the rendered content and its structure works well for me, but it&rsquo;s not for everyone (there&rsquo;s a reason why a large portion of the web uses WordPress). Still, it has never been easier and cheaper to self-host a static site like mine.</p><p><strong>Durable tech works well for quiet writing.</strong> Legacy technologies tend to get a bad rap. Many people prefer building with shiny new tech on shiny new platforms. Publishing on the web is no exception, as trendy ways of sharing content come and go. Twenty years ago, most of today&rsquo;s social media didn&rsquo;t exist. How much of it will exist in twenty years? Making any prediction is hard, but I&rsquo;m willing to bet that twenty years from now, there will still be tools that can serve and render my website (the HTML / CSS / JS output of Hugo), as it exists right now. I wouldn&rsquo;t make the same bet on nascent social media platforms or on writing-centric platforms such as Substack. This is in line with <a href=https://en.wikipedia.org/wiki/Lindy_effect target=_blank rel=noopener>the Lindy effect</a>, which states that <em>&ldquo;the future life expectancy of some non-perishable things, like a technology or an idea, is proportional to their current age&rdquo;</em>. It often makes perfect sense to go for new tech, as it comes with new capabilities. I prefer to be cautious, as I want to focus on what I get out of writing rather than on bouncing between platforms and tools.</p><figure><a href=ditcherville-39-i-love-your-email-list.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/ditcherville-39-i-love-your-email-list_hue4c7fc22fce0574ea830c25d91fbdb2a_253127_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/ditcherville-39-i-love-your-email-list_hue4c7fc22fce0574ea830c25d91fbdb2a_253127_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/ditcherville-39-i-love-your-email-list_hue4c7fc22fce0574ea830c25d91fbdb2a_253127_720x0_resize_box_3.png 720w,
diff --git a/2023/10/25/lessons-from-reluctant-data-engineering/index.html b/2023/10/25/lessons-from-reluctant-data-engineering/index.html
index 4ed47a3db..d3f554b90 100644
--- a/2023/10/25/lessons-from-reluctant-data-engineering/index.html
+++ b/2023/10/25/lessons-from-reluctant-data-engineering/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Lessons from reluctant data engineering | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="career,data engineering,data science,software engineering"><meta name=description content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Lessons from reluctant data engineering"><meta property="og:description" content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science 
role I had."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"><meta property="og:image" content="https://yanirseroussi.com/yanir-seroussi-dataengbytes-brisbane-2023.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-10-25T04:45:00+00:00"><meta property="article:modified_time" content="2023-10-25T15:00:21+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/yanir-seroussi-dataengbytes-brisbane-2023.webp"><meta name=twitter:title content="Lessons from reluctant data engineering"><meta name=twitter:description content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Lessons from reluctant data engineering","item":"https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Lessons from reluctant data engineering","name":"Lessons from reluctant data engineering","description":"Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had.","keywords":["career","data engineering","data science","software engineering"],"articleBody":"In May 2023, I submitted the following talk abstract to the Brisbane DataEngBytes conference.\nAs we all know, solid data engineering is essential to the success of data science and AI applications. 
And yet, people often get excited about fancy machine learning models and neglect the data engineering layer. This is totally understandable: playing with data in a throwaway notebook is more relaxing than dealing with a data pipeline that keeps finding ways to break in production.\nIn this talk, I’ll share lessons on data engineering from a data science perspective. Everywhere I’ve worked, from small start-ups to established companies, I’ve found that I had to do some data engineering if I wanted my work to ever get to production. While I’ve always been reluctant to do too much of it, my engineering background has placed me in a better position to do it than colleagues who started off as analysts and academics.\nYou could call my work full-stack data science, reluctant data engineering, or some other data \u0026 AI thing. Whatever it is, I hope that my talk will help us all play better with each other, across all layers of the data stack.\nAs I don’t identify as a data engineer and have never attended a DataEngBytes conference, I didn’t know whether my talk would fit the agenda. However, it seemed harmless to submit an abstract and see how it goes.\nWhen I got the acceptance notification and realised I had to turn my abstract into a coherent talk, I was a bit wary of lacking a good grasp of who’s in my audience. However, when the full agenda was published, I realised that the focus of the conference won’t be on arcane data engineering knowledge, given that one of the keynotes was titled “How The Full-Stack Data Scientist Is STILL The Sexiest Job”. It turned out that despite the name and tagline (“by data engineers, for data engineers”), DataEngBytes was a great event for all data professionals.\nHere’s the video of the talk (slides):\nQuick summary. I start off with a disclaimer, stating that I am not a data engineer. 
Then I show evidence that the market values data engineering more than data science, given the ratio of Data Engineer to Data Scientist job ads (x3 in the AU$100-150k compensation range; x4 in the AU$200k+ range).1 I follow that observation with another disclaimer, stating that some of my lessons may be obvious or better learnt the hard way (as I often have to learn and relearn lessons). Then I detail five chronologically ordered snippets and their corresponding lessons:\n2012: My first data science job, where we made mistakes around technology choice and premature optimisation. The lesson is that shiny tech ain’t always shiny. Like all lessons, this one ends with a quote that shows that what I learned wasn’t entirely new. The first quote is by Donald Knuth from 1974: “We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%.” 2013: My first head of data science job, where we solved real scaling issues by following principles and adapting solutions to our situation. The lesson is that shiny tech can be transformative; but principles beat tools, which goes with a 1911 quote by Harrington Emerson: “As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.”2 2015: My first enterprise consulting stint, where I experienced being a not-so-useful data scientist and working with some not-so-useful data engineers. This led me to dabble in “shadow IT” (a term I learned at the conference), and build a separate Python machine learning pipeline to work around various limitations. 
The lesson is that you should solve problems; don’t be the problem, or in the words of circa 2004 Google: “Focus on the user and all else will follow.” 2017: My first remote data science job, where I played around with many job functions across the data stack and went down various data rabbit holes. The lesson is to go deep; trust but verify, which goes with a 1999 quote by Eric S. Raymond: “Given enough eyeballs, all bugs are shallow.” 2022: My first committed climate and biodiversity moves (still a work in progress). The lesson is that tech \u0026 titles are tools; focus on what matters, but recall Rabbi Tarfon’s quote from almost two thousand years ago: “You are not obliged to complete the work, but neither are you free to desist from it.” The main takeaway from the talk is that data problems have human roots – and human solutions. This is because:\nHumans get excited by shiny tech… and produce transformative tech. Humans optimise prematurely… and when it makes sense. Humans can act as unreasonable blockers… and as the users we serve. Humans generate messy data… and clean it up. Humans get distracted by tools… and use them for beneficial ends. This is based on Seek searches for jobs advertised in July 2023. Given the limitations of Seek search, it’s not an accurate representation of the demand for each role, as the results included all ads that mentioned the terms. One could also argue that data engineers tend to change jobs more than data scientists, fuelling demand. Despite this, I think the results support the general message around the value of data engineering, especially as others have noted the need for 4-5 data engineers per data scientist in organisations with complex data engineering requirements. ↩︎\nEmerson referred to man rather than person in the original quote, but I took the liberty to make it gender-neutral and retain the original message. 
↩︎\n","wordCount":"969","inLanguage":"en","image":"https://yanirseroussi.com/yanir-seroussi-dataengbytes-brisbane-2023.webp","datePublished":"2023-10-25T04:45:00Z","dateModified":"2023-10-25T15:00:21+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a 
href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Lessons from reluctant data engineering</h1><div class=post-meta><span title='2023-10-25 04:45:00 +0000 UTC'>October 25, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-10-25-lessons-from-reluctant-data-engineering/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023_hu804925b1940ee0b95918a52a0d7d78df_87060_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023_hu804925b1940ee0b95918a52a0d7d78df_87060_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp 676w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp alt="Yanir Seroussi presenting at DataEngBytes Brisbane 2023" width=676 height=450></figure><div class=post-content><p>In May 2023, I submitted the following talk abstract to the Brisbane <a href=https://dataengconf.com.au/ target=_blank rel=noopener>DataEngBytes</a> conference.</p><blockquote><p>As we all know, solid data engineering is essential to the success of data science and AI applications. And yet, people often get excited about fancy machine learning models and neglect the data engineering layer. 
This is totally understandable: playing with data in a throwaway notebook is more relaxing than dealing with a data pipeline that keeps finding ways to break in production.</p><p>In this talk, I&rsquo;ll share lessons on data engineering from a data science perspective. Everywhere I&rsquo;ve worked, from small start-ups to established companies, I&rsquo;ve found that I had to do some data engineering if I wanted my work to ever get to production. While I&rsquo;ve always been reluctant to do too much of it, my engineering background has placed me in a better position to do it than colleagues who started off as analysts and academics.</p><p>You could call my work full-stack data science, reluctant data engineering, or some other data & AI thing. Whatever it is, I hope that my talk will help us all play better with each other, across all layers of the data stack.</p></blockquote><p>As I don&rsquo;t identify as a data engineer and have never attended a DataEngBytes conference, I didn&rsquo;t know whether my talk would fit the agenda. However, it seemed harmless to submit an abstract and see how it goes.</p><p>When I got the acceptance notification and realised I had to turn my abstract into a coherent talk, I was a bit wary of lacking a good grasp of who&rsquo;s in my audience. However, when the full agenda was published, I realised that the focus of the conference won&rsquo;t be on arcane data engineering knowledge, given that one of the keynotes was titled <em>&ldquo;How The Full-Stack Data Scientist Is STILL The Sexiest Job&rdquo;</em>. 
It turned out that despite the name and tagline (<em>&ldquo;by data engineers, for data engineers&rdquo;</em>), DataEngBytes was a great event for all data professionals.</p><p>Here&rsquo;s the video of the talk (<a href=https://docs.google.com/presentation/d/100GiDkp3UKfQtWtxZOF4CaJWTuSYtkEYxkI0_INdqq8/edit target=_blank rel=noopener>slides</a>):</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/NE6e7Xx7OLQ style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="Talk video: Lessons from reluctant data engineering"></iframe></div><br><strong>Quick summary.</strong> I start off with a disclaimer, stating that I am not a data engineer. Then I show evidence that the market values data engineering more than data science, given the ratio of <em>Data Engineer</em> to <em>Data Scientist</em> job ads (x3 in the AU$100-150k compensation range; x4 in the AU$200k+ range).<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> I follow that observation with another disclaimer, stating that some of my lessons may be obvious or better learnt the hard way (as I often have to learn and relearn lessons). Then I detail five chronologically ordered snippets and their corresponding lessons:</p><ol><li>2012: My first data science job, where we made mistakes around technology choice and premature optimisation. The lesson is that <strong>shiny tech ain&rsquo;t always shiny</strong>. Like all lessons, this one ends with a quote that shows that what I learned wasn&rsquo;t entirely new. The first quote is by Donald Knuth from 1974: <em>&ldquo;We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. 
Yet we should not pass up our opportunities in that critical 3%.&rdquo;</em></li><li>2013: My first head of data science job, where we solved real scaling issues by following principles and adapting solutions to our situation. The lesson is that <strong>shiny tech can be transformative; but principles beat tools</strong>, which goes with a 1911 quote by Harrington Emerson: <em>&ldquo;As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.&rdquo;</em><sup id=fnref:2><a href=#fn:2 class=footnote-ref role=doc-noteref>2</a></sup></li><li>2015: My first enterprise consulting stint, where I experienced being a not-so-useful data scientist and working with some not-so-useful data engineers. This led me to dabble in &ldquo;shadow IT&rdquo; (a term I learned at the conference), and build a separate Python machine learning pipeline to work around various limitations. The lesson is that you should <strong>solve problems; don’t be the problem</strong>, or in the words of circa 2004 Google: <em>&ldquo;Focus on the user and all else will follow.&rdquo;</em></li><li>2017: My first remote data science job, where I played around with many job functions across the data stack and went down various data rabbit holes. The lesson is to <strong>go deep; trust but verify</strong>, which goes with a 1999 quote by Eric S. Raymond: <em>&ldquo;Given enough eyeballs, all bugs are shallow.&rdquo;</em></li><li>2022: My first committed climate and biodiversity moves (still a work in progress). 
The lesson is that <strong>tech & titles are tools; focus on what matters</strong>, but recall Rabbi Tarfon&rsquo;s quote from almost two thousand years ago: <em>&ldquo;You are not obliged to complete the work, but neither are you free to desist from it.&rdquo;</em></li></ol><p>The main takeaway from the talk is that <strong>data problems have human roots – and human solutions</strong>. This is because:</p><ul><li>Humans get excited by shiny tech&mldr; and produce transformative tech.</li><li>Humans optimise prematurely&mldr; and when it makes sense.</li><li>Humans can act as unreasonable blockers&mldr; and as the users we serve.</li><li>Humans generate messy data&mldr; and clean it up.</li><li>Humans get distracted by tools&mldr; and use them for beneficial ends.</li></ul><div class=footnotes role=doc-endnotes><hr><ol><li id=fn:1><p>This is based on <a href=https://www.seek.com.au/ target=_blank rel=noopener>Seek</a> searches for jobs advertised in July 2023. Given the limitations of Seek search, it&rsquo;s not an accurate representation of the demand for each role, as the results included all ads that <em>mentioned</em> the terms. One could also argue that data engineers tend to change jobs more than data scientists, fuelling demand. 
Despite this, I think the results support the general message around the value of data engineering, especially as <a href=https://www.oreilly.com/radar/data-engineers-vs-data-scientists/ target=_blank rel=noopener>others have noted the need for 4-5 data engineers per data scientist in organisations with complex data engineering requirements</a>.&#160;<a href=#fnref:1 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li><li id=fn:2><p>Emerson referred to <em>man</em> rather than <em>person</em> in the original quote, but I took the liberty to make it gender-neutral and retain the original message.&#160;<a href=#fnref:2 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li></ol></div></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-engineering/>data engineering</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on x" href="https://x.com/intent/tweet/?text=Lessons%20from%20reluctant%20data%20engineering&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&amp;hashtags=career%2cdataengineering%2cdatascience%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share Lessons from reluctant data engineering on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&amp;title=Lessons%20from%20reluctant%20data%20engineering&amp;summary=Lessons%20from%20reluctant%20data%20engineering&amp;source=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&title=Lessons%20from%20reluctant%20data%20engineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 
29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data 
engineering on whatsapp" href="https://api.whatsapp.com/send?text=Lessons%20from%20reluctant%20data%20engineering%20-%20https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from 
reluctant data engineering on telegram" href="https://telegram.me/share/url?text=Lessons%20from%20reluctant%20data%20engineering&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on ycombinator" href="https://news.ycombinator.com/submitlink?t=Lessons%20from%20reluctant%20data%20engineering&u=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="career,data engineering,data science,software engineering"><meta name=description content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Lessons from reluctant data engineering"><meta property="og:description" content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science 
role I had."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"><meta property="og:image" content="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-10-25T04:45:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp"><meta name=twitter:title content="Lessons from reluctant data engineering"><meta name=twitter:description content="Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Lessons from reluctant data engineering","item":"https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Lessons from reluctant data engineering","name":"Lessons from reluctant data engineering","description":"Video and summary of a talk I gave at DataEngBytes Brisbane on what I learned from doing data engineering as part of every data science role I had.","keywords":["career","data engineering","data science","software engineering"],"articleBody":"In May 2023, I submitted the following talk abstract to the Brisbane DataEngBytes conference.\nAs we all know, solid data engineering is essential to the success of 
data science and AI applications. And yet, people often get excited about fancy machine learning models and neglect the data engineering layer. This is totally understandable: playing with data in a throwaway notebook is more relaxing than dealing with a data pipeline that keeps finding ways to break in production.\nIn this talk, I’ll share lessons on data engineering from a data science perspective. Everywhere I’ve worked, from small start-ups to established companies, I’ve found that I had to do some data engineering if I wanted my work to ever get to production. While I’ve always been reluctant to do too much of it, my engineering background has placed me in a better position to do it than colleagues who started off as analysts and academics.\nYou could call my work full-stack data science, reluctant data engineering, or some other data \u0026 AI thing. Whatever it is, I hope that my talk will help us all play better with each other, across all layers of the data stack.\nAs I don’t identify as a data engineer and have never attended a DataEngBytes conference, I didn’t know whether my talk would fit the agenda. However, it seemed harmless to submit an abstract and see how it goes.\nWhen I got the acceptance notification and realised I had to turn my abstract into a coherent talk, I was a bit wary of lacking a good grasp of who’s in my audience. However, when the full agenda was published, I realised that the focus of the conference won’t be on arcane data engineering knowledge, given that one of the keynotes was titled “How The Full-Stack Data Scientist Is STILL The Sexiest Job”. It turned out that despite the name and tagline (“by data engineers, for data engineers”), DataEngBytes was a great event for all data professionals.\nHere’s the video of the talk (slides):\nQuick summary. I start off with a disclaimer, stating that I am not a data engineer. 
Then I show evidence that the market values data engineering more than data science, given the ratio of Data Engineer to Data Scientist job ads (x3 in the AU$100-150k compensation range; x4 in the AU$200k+ range).1 I follow that observation with another disclaimer, stating that some of my lessons may be obvious or better learnt the hard way (as I often have to learn and relearn lessons). Then I detail five chronologically ordered snippets and their corresponding lessons:\n2012: My first data science job, where we made mistakes around technology choice and premature optimisation. The lesson is that shiny tech ain’t always shiny. Like all lessons, this one ends with a quote that shows that what I learned wasn’t entirely new. The first quote is by Donald Knuth from 1974: “We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%.” 2013: My first head of data science job, where we solved real scaling issues by following principles and adapting solutions to our situation. The lesson is that shiny tech can be transformative; but principles beat tools, which goes with a 1911 quote by Harrington Emerson: “As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.”2 2015: My first enterprise consulting stint, where I experienced being a not-so-useful data scientist and working with some not-so-useful data engineers. This led me to dabble in “shadow IT” (a term I learned at the conference), and build a separate Python machine learning pipeline to work around various limitations. 
The lesson is that you should solve problems; don’t be the problem, or in the words of circa 2004 Google: “Focus on the user and all else will follow.” 2017: My first remote data science job, where I played around with many job functions across the data stack and went down various data rabbit holes. The lesson is to go deep; trust but verify, which goes with a 1999 quote by Eric S. Raymond: “Given enough eyeballs, all bugs are shallow.” 2022: My first committed climate and biodiversity moves (still a work in progress). The lesson is that tech \u0026 titles are tools; focus on what matters, but recall Rabbi Tarfon’s quote from almost two thousand years ago: “You are not obliged to complete the work, but neither are you free to desist from it.” The main takeaway from the talk is that data problems have human roots – and human solutions. This is because:\nHumans get excited by shiny tech… and produce transformative tech. Humans optimise prematurely… and when it makes sense. Humans can act as unreasonable blockers… and as the users we serve. Humans generate messy data… and clean it up. Humans get distracted by tools… and use them for beneficial ends. This is based on Seek searches for jobs advertised in July 2023. Given the limitations of Seek search, it’s not an accurate representation of the demand for each role, as the results included all ads that mentioned the terms. One could also argue that data engineers tend to change jobs more than data scientists, fuelling demand. Despite this, I think the results support the general message around the value of data engineering, especially as others have noted the need for 4-5 data engineers per data scientist in organisations with complex data engineering requirements. ↩︎\nEmerson referred to man rather than person in the original quote, but I took the liberty to make it gender-neutral and retain the original message. 
↩︎\n","wordCount":"969","inLanguage":"en","image":"https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp","datePublished":"2023-10-25T04:45:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" 
y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Lessons from reluctant data engineering</h1><div class=post-meta><span title='2023-10-25 04:45:00 +0000 UTC'>October 25, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-10-25-lessons-from-reluctant-data-engineering/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023_hu804925b1940ee0b95918a52a0d7d78df_87060_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023_hu804925b1940ee0b95918a52a0d7d78df_87060_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp 676w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/yanir-seroussi-dataengbytes-brisbane-2023.webp alt="Yanir Seroussi presenting at DataEngBytes Brisbane 2023" width=676 height=450></figure><div class=post-content><p>In May 2023, I submitted the following talk abstract to the Brisbane <a href=https://dataengconf.com.au/ target=_blank rel=noopener>DataEngBytes</a> conference.</p><blockquote><p>As we all know, solid data engineering is essential to the success of data science and AI applications. 
And yet, people often get excited about fancy machine learning models and neglect the data engineering layer. This is totally understandable: playing with data in a throwaway notebook is more relaxing than dealing with a data pipeline that keeps finding ways to break in production.</p><p>In this talk, I&rsquo;ll share lessons on data engineering from a data science perspective. Everywhere I&rsquo;ve worked, from small start-ups to established companies, I&rsquo;ve found that I had to do some data engineering if I wanted my work to ever get to production. While I&rsquo;ve always been reluctant to do too much of it, my engineering background has placed me in a better position to do it than colleagues who started off as analysts and academics.</p><p>You could call my work full-stack data science, reluctant data engineering, or some other data & AI thing. Whatever it is, I hope that my talk will help us all play better with each other, across all layers of the data stack.</p></blockquote><p>As I don&rsquo;t identify as a data engineer and have never attended a DataEngBytes conference, I didn&rsquo;t know whether my talk would fit the agenda. However, it seemed harmless to submit an abstract and see how it goes.</p><p>When I got the acceptance notification and realised I had to turn my abstract into a coherent talk, I was a bit wary of lacking a good grasp of who&rsquo;s in my audience. However, when the full agenda was published, I realised that the focus of the conference won&rsquo;t be on arcane data engineering knowledge, given that one of the keynotes was titled <em>&ldquo;How The Full-Stack Data Scientist Is STILL The Sexiest Job&rdquo;</em>. 
It turned out that despite the name and tagline (<em>&ldquo;by data engineers, for data engineers&rdquo;</em>), DataEngBytes was a great event for all data professionals.</p><p>Here&rsquo;s the video of the talk (<a href=https://docs.google.com/presentation/d/100GiDkp3UKfQtWtxZOF4CaJWTuSYtkEYxkI0_INdqq8/edit target=_blank rel=noopener>slides</a>):</p><p><div style=position:relative;padding-bottom:56.25%;height:0;overflow:hidden><iframe src=https://www.youtube.com/embed/NE6e7Xx7OLQ style=position:absolute;top:0;left:0;width:100%;height:100%;border:0 allowfullscreen title="Talk video: Lessons from reluctant data engineering"></iframe></div><br><strong>Quick summary.</strong> I start off with a disclaimer, stating that I am not a data engineer. Then I show evidence that the market values data engineering more than data science, given the ratio of <em>Data Engineer</em> to <em>Data Scientist</em> job ads (x3 in the AU$100-150k compensation range; x4 in the AU$200k+ range).<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> I follow that observation with another disclaimer, stating that some of my lessons may be obvious or better learnt the hard way (as I often have to learn and relearn lessons). Then I detail five chronologically ordered snippets and their corresponding lessons:</p><ol><li>2012: My first data science job, where we made mistakes around technology choice and premature optimisation. The lesson is that <strong>shiny tech ain&rsquo;t always shiny</strong>. Like all lessons, this one ends with a quote that shows that what I learned wasn&rsquo;t entirely new. The first quote is by Donald Knuth from 1974: <em>&ldquo;We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. 
Yet we should not pass up our opportunities in that critical 3%.&rdquo;</em></li><li>2013: My first head of data science job, where we solved real scaling issues by following principles and adapting solutions to our situation. The lesson is that <strong>shiny tech can be transformative; but principles beat tools</strong>, which goes with a 1911 quote by Harrington Emerson: <em>&ldquo;As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.&rdquo;</em><sup id=fnref:2><a href=#fn:2 class=footnote-ref role=doc-noteref>2</a></sup></li><li>2015: My first enterprise consulting stint, where I experienced being a not-so-useful data scientist and working with some not-so-useful data engineers. This led me to dabble in &ldquo;shadow IT&rdquo; (a term I learned at the conference), and build a separate Python machine learning pipeline to work around various limitations. The lesson is that you should <strong>solve problems; don’t be the problem</strong>, or in the words of circa 2004 Google: <em>&ldquo;Focus on the user and all else will follow.&rdquo;</em></li><li>2017: My first remote data science job, where I played around with many job functions across the data stack and went down various data rabbit holes. The lesson is to <strong>go deep; trust but verify</strong>, which goes with a 1999 quote by Eric S. Raymond: <em>&ldquo;Given enough eyeballs, all bugs are shallow.&rdquo;</em></li><li>2022: My first committed climate and biodiversity moves (still a work in progress). 
The lesson is that <strong>tech & titles are tools; focus on what matters</strong>, but recall Rabbi Tarfon&rsquo;s quote from almost two thousand years ago: <em>&ldquo;You are not obliged to complete the work, but neither are you free to desist from it.&rdquo;</em></li></ol><p>The main takeaway from the talk is that <strong>data problems have human roots – and human solutions</strong>. This is because:</p><ul><li>Humans get excited by shiny tech&mldr; and produce transformative tech.</li><li>Humans optimise prematurely&mldr; and when it makes sense.</li><li>Humans can act as unreasonable blockers&mldr; and as the users we serve.</li><li>Humans generate messy data&mldr; and clean it up.</li><li>Humans get distracted by tools&mldr; and use them for beneficial ends.</li></ul><div class=footnotes role=doc-endnotes><hr><ol><li id=fn:1><p>This is based on <a href=https://www.seek.com.au/ target=_blank rel=noopener>Seek</a> searches for jobs advertised in July 2023. Given the limitations of Seek search, it&rsquo;s not an accurate representation of the demand for each role, as the results included all ads that <em>mentioned</em> the terms. One could also argue that data engineers tend to change jobs more than data scientists, fuelling demand. 
Despite this, I think the results support the general message around the value of data engineering, especially as <a href=https://www.oreilly.com/radar/data-engineers-vs-data-scientists/ target=_blank rel=noopener>others have noted the need for 4-5 data engineers per data scientist in organisations with complex data engineering requirements</a>.&#160;<a href=#fnref:1 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li><li id=fn:2><p>Emerson referred to <em>man</em> rather than <em>person</em> in the original quote, but I took the liberty to make it gender-neutral and retain the original message.&#160;<a href=#fnref:2 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li></ol></div></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-engineering/>data engineering</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on x" href="https://x.com/intent/tweet/?text=Lessons%20from%20reluctant%20data%20engineering&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&amp;hashtags=career%2cdataengineering%2cdatascience%2csoftwareengineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share Lessons from reluctant data engineering on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&amp;title=Lessons%20from%20reluctant%20data%20engineering&amp;summary=Lessons%20from%20reluctant%20data%20engineering&amp;source=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f&title=Lessons%20from%20reluctant%20data%20engineering"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 
29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data 
engineering on whatsapp" href="https://api.whatsapp.com/send?text=Lessons%20from%20reluctant%20data%20engineering%20-%20https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from 
reluctant data engineering on telegram" href="https://telegram.me/share/url?text=Lessons%20from%20reluctant%20data%20engineering&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Lessons from reluctant data engineering on ycombinator" href="https://news.ycombinator.com/submitlink?t=Lessons%20from%20reluctant%20data%20engineering&u=https%3a%2f%2fyanirseroussi.com%2f2023%2f10%2f25%2flessons-from-reluctant-data-engineering%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.html b/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.html
index 4b5ee9422..8c66eb11c 100644
--- a/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.html
+++ b/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Supporting volunteer monitoring of marine biodiversity with modern web and data tools | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data engineering,data visualisation,machine learning,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Summarising the work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Supporting volunteer monitoring of marine biodiversity with modern web and data tools"><meta property="og:description" content="Summarising the 
work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"><meta property="og:image" content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-11-29T02:00:00+00:00"><meta property="article:modified_time" content="2023-11-29T12:57:12+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp"><meta name=twitter:title content="Supporting volunteer monitoring of marine biodiversity with modern web and data tools"><meta name=twitter:description content="Summarising the work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Supporting volunteer monitoring of marine biodiversity with modern web and data tools","item":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Supporting volunteer monitoring of marine biodiversity with modern web and data tools","name":"Supporting volunteer monitoring of marine biodiversity with modern web and data 
tools","description":"Summarising the work Uri Seroussi and I did to improve Reef Life Survey\u0026rsquo;s Reef Species of the World app.","keywords":["data engineering","data visualisation","machine learning","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"I’ve been volunteering with the Reef Life Survey (RLS) citizen science project since 2015. RLS volunteers follow the same underwater visual census methodology that has been in use for decades, thereby producing data series that help inform the management of marine ecosystems. In simpler terms, we count fish (and some invertebrates), and this helps various organisations know what’s happening underwater. Among other places, RLS data has been used in scientific publications in Nature and elsewhere, and to inform the management of Australian marine parks.\nOver the years, I created a few online tools to help volunteers with survey work. These included web apps to visualise survey results and study species, as well as infer species from underwater photos. More recently, I agreed to help with the general maintenance of the non-WordPress parts of the RLS website and backend (somewhat reluctantly, but I suppose that’s what happens when you do things out of love).\nTaking greater responsibility to help the tech side of RLS along with an alignment of the research grant stars led to an opportunity to revamp the Reef Species of the World (RSoW) section – a collection of over 5,000 species with in-situ photos, descriptions, and empirical distributions derived from RLS surveys. My focus in this project was on product management, data pipelines, and backend work. I was joined by my brother, Uri Seroussi, who was in charge of front-end development (which became much more substantial than in the original RSoW).\nThe original RSoW was a traditional PHP application that relied on a MySQL database to serve requests, with most of the HTML constructed on the server. 
By contrast, we re-architected the new RSoW as a progressive web app using Next.js, which has the following advantages and new features:\nFully static site: served faster with reduced server load. Faster search and navigation: happens on the front-end without round-trips to the server. Installable app with offline availability: RSoW can now be installed as a mobile or desktop app, and run without an internet connection. Client-side image classification: offline availability includes image classification in the browser, which is useful when surveying in remote areas. Replacement of previous tools and pipelines: providing a more consistent user experience and improved data reliability. The rest of this post provides details on the architecture and implementation of the new RSoW and its underlying data and machine learning pipelines. But the best way of getting a feel for the data and the tools is to have a play yourself.\nThe new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems The RSoW web app We didn’t start with a blank slate: RSoW was already a public website, with many individual species pages ranking well on web searches (the main source of traffic). As such, a guiding principle was to retain as much of the original functionality as possible, and then build new features on top of it.\nWhen approaching a legacy codebase, there’s always the question of whether rebuilding parts or all of it is a worthwhile endeavour. As Jason Cohen notes, a more apt name for “legacy code” is “revenue code”, i.e., the code that embodies all the original and changed requirements, and has withstood the test of time. 
Even though RLS’s code isn’t meant to generate revenue, it’s always easy to mess things up when re-implementing existing functionality.\nThe main reasons we decided on a rewrite of the front-end were:\nUser experience: Speed things up, as some species searches were pretty slow due to server round-trips and inefficient database queries. Extensibility: Make it easier to add new features. Offline availability: This is impossible with a traditional PHP back-end, but feasible if all the data and code gets shipped to the client. We chose Next.js as the front-end framework since it’s well-established and supports static exports. Parts of the RLS website run on WordPress, so it’s easy to add statically-generated pages and serve them efficiently via Cloudflare (I wasn’t keen on complicating the stack by adding a Node backend). With static exports, we regenerate all the species pages whenever the data changes, which means that end-user page requests don’t need to touch the database. In addition, the main search page downloads three JSONs with all the data it needs to perform any species search (see sites.json, species.json, and surveys.json in the rls-data repo). Minified and compressed, these JSONs add up to less than 2MB of data, which isn’t tiny, but it is a small price to pay to avoid hitting the database. The JSONs also cache well on Cloudflare, like the rest of the web app’s files.\nFrom a user perspective, replicating the original functionality was the less exciting part of the project. Faster and less buggy code is obviously better, but once feature parity was achieved, we turned our attention to some new features:\nSupporting offline availability and installation by turning RSoW into a progressive web app: On its face, this was supposed to be simple given the next-pwa package, but it turned out to be a bit tricky because the original package was abandoned, and due to multiple layers of caching. 
It’s well-known that cache invalidation is one of the two hard problems in computer science (along with naming things and off-by-one errors), and progressive web apps offer a lovely variety of caches to deal with – everything needs to be cached on the client for offline availability. We got there after some tinkering and dealing with head-scratching bugs, some of which were caused by other caching layers in addition to the client-side caches (including Cloudflare and some misconfiguration of an early version of the app). Knowledge test: A separate grant came along and Uri had the opportunity to extend RSoW by adding a section that helps test new volunteers ahead of them joining RLS. Species frequency exploration: Bringing in the full functionality from the first tool I built for RLS back in 2017. Client-side image classification: Deprecating the Streamlit app I built a couple of years ago. Data and machine learning pipelines On the back-end, there was an opportunity to simplify things by retiring the original PHP code that processed survey data in favour of the pipelines I implemented in the rls-data repo. Ultimately, survey data comes from the Australian Ocean Data Network (AODN), which holds many more datasets in addition to RLS. Originally, the PHP code that processed survey data into the MySQL database evolved separately from rls-data, which I implemented to generate JSONs for the tools I built. As rls-data is an open source project and the raw survey data is relatively small (\u003c1GB), it made sense to process it with a daily GitHub Action (GHA) script that runs for free. The resultant JSONs are committed to the repo, which means that any unexpected changes are easily tracked (I keep an eye on the commits). 
It was simple to expand the existing rls-data pipelines to generate all the JSONs needed to serve RSoW, and then say goodbye to the PHP code that implemented similar functionality.\nI’m aware that running data pipelines with GitHub Actions isn’t going to win any awards for sophistication, but it’s a great fit for this project. The key principle is to use the right tool for the job, not the shiniest tool.\nOne part of the original RSoW that we barely touched was the management interface, which allows RLS admins to update species data and upload pictures. The gains from replacing the admin part of RSoW would have been negligible, so it still runs the old PHP code on top of MySQL. Unfortunately, this meant I couldn’t retire all the PHP data pipelines, as species data also comes from the Australian Ocean Data Network and is joined with the edits made by RLS admins. This exemplifies the pragmatism that one often needs to apply when faced with legacy revenue systems: If a system works and there’s no real benefit to replacing it, sticking with the old system is the right thing to do (even if it makes your architecture diagram more complicated).\nI have big plans to improve the machine learning model for inferring RLS species from user images, but it’s somehow never a priority. For RSoW, I did make it a priority to support serving the model with a simple API, but then I decided it’d be worth the effort to export it to ONNX for client-side image classification. This was partly driven by curiosity about ONNX, but it also had two key benefits: (1) support for offline classification; and (2) simplified \u0026 cheaper serving architecture, as ONNX models can be served from S3 and don’t require RLS to pay for server-side compute.\nAs to the machine learning pipelines, they all need to be manually triggered, which is fine since the image data changes slowly. These pipelines are implemented in notebooks and the command-line interface of the ichthywhat repo. 
I have a bit of a dream of this being an early precursor to complete automation of RLS data collection, with the historical RLS data series continued by divers who would mostly serve as video takers and fish scarers (using cameras without human divers would lead to different biases in the data). However, this is a big project that is probably best left to my next PhD, i.e., it may never happen.\nIn the meantime, I hope to continue diving with RLS, and aim to make pragmatic decisions to keep RSoW running and supporting the community.\n","wordCount":"1574","inLanguage":"en","image":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp","datePublished":"2023-11-29T02:00:00Z","dateModified":"2023-11-29T12:57:12+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Supporting volunteer monitoring of marine biodiversity with modern web and data tools</h1><div class=post-meta><span title='2023-11-29 02:00:00 +0000 UTC'>November 29, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-11-29-supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_360x0_resize_q75_h2_box_2.webp 360w 
,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp 1200w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp alt="Screenshot of Reef Species of the World" width=1200 height=591></figure><div class=post-content><p>I&rsquo;ve been volunteering with the <a href=https://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey</a> (RLS) citizen science project since 2015. RLS volunteers follow the same underwater visual census methodology that has been in use for decades, thereby producing data series that help inform the management of marine ecosystems. In simpler terms, we count fish (and some invertebrates), and this helps various organisations know what&rsquo;s happening underwater. 
Among other places, RLS data has been used in scientific publications in <a href=https://www.nature.com/articles/s41586-023-05833-y target=_blank rel=noopener>Nature</a> and <a href=https://reeflifesurvey.com/scientific-papers-management-reports/ target=_blank rel=noopener>elsewhere</a>, and to inform the <a href=https://parksaustralia.gov.au/marine/science/reef-life-survey/ target=_blank rel=noopener>management of Australian marine parks</a>.</p><p>Over the years, I created a few online tools to help volunteers with survey work. These included web apps to <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>visualise survey results and study species</a>, as well as <a href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/>infer species from underwater photos</a>. More recently, I agreed to help with the general maintenance of the non-WordPress parts of the RLS website and backend (<a href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/>somewhat reluctantly</a>, but I suppose that&rsquo;s what happens when you do things out of love).</p><p>Taking greater responsibility to help the tech side of RLS along with an alignment of the research grant stars led to an opportunity to revamp the <a href=https://reeflifesurvey.com/species/ target=_blank rel=noopener>Reef Species of the World</a> (RSoW) section – a collection of over 5,000 species with in-situ photos, descriptions, and empirical distributions derived from RLS surveys. My focus in this project was on product management, data pipelines, and backend work. 
I was joined by my brother, <a href=https://www.uriseroussi.com/ target=_blank rel=noopener>Uri Seroussi</a>, who was in charge of front-end development (which became much more substantial than in the original RSoW).</p><p>The original RSoW was a traditional PHP application that relied on a MySQL database to serve requests, with most of the HTML constructed on the server. By contrast, we re-architected the new RSoW as a progressive web app using Next.js, which has the following advantages and new features:</p><ul><li><strong>Fully static site:</strong> served faster with reduced server load.</li><li><strong>Faster search and navigation:</strong> happens on the front-end without round-trips to the server.</li><li><strong>Installable app with offline availability:</strong> RSoW can now be installed as a mobile or desktop app, and run without an internet connection.</li><li><strong>Client-side image classification:</strong> offline availability includes image classification in the browser, which is useful when surveying in remote areas.</li><li><strong>Replacement of previous tools and pipelines:</strong> providing a more consistent user experience and improved data reliability.</li></ul><p>The rest of this post provides details on the architecture and implementation of the new RSoW and its underlying data and machine learning pipelines. 
But the best way of getting a feel for the data and the tools is <a href=https://reeflifesurvey.com/species/ target=_blank rel=noopener>to have a play yourself</a>.</p><figure><a href=reef-species-of-the-world-architecture.svg target=_blank rel=noopener><img src=reef-species-of-the-world-architecture.svg alt="The new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems" loading=lazy></a><figcaption><p>The new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems</p></figcaption></figure><h2 id=the-rsow-web-app>The RSoW web app<a hidden class=anchor aria-hidden=true href=#the-rsow-web-app>#</a></h2><p>We didn&rsquo;t start with a blank slate: RSoW was already a public website, with many individual species pages ranking well on web searches (the main source of traffic). As such, a guiding principle was to retain as much of the original functionality as possible, and then build new features on top of it.</p><p>When approaching a legacy codebase, there&rsquo;s always the question of whether rebuilding parts or all of it is a worthwhile endeavour. As Jason Cohen notes, <a href=https://longform.asmartbear.com/scale target=_blank rel=noopener>a more apt name for &ldquo;legacy code&rdquo; is &ldquo;revenue code&rdquo;</a>, i.e., the code that embodies all the original and changed requirements, and has withstood the test of time. 
Even though RLS&rsquo;s code isn&rsquo;t meant to generate revenue, it&rsquo;s always easy to mess things up when re-implementing existing functionality.</p><p>The main reasons we decided on a rewrite of the front-end were:</p><ul><li><strong>User experience:</strong> Speed things up, as some species searches were pretty slow due to server round-trips and inefficient database queries.</li><li><strong>Extensibility:</strong> Make it easier to add new features.</li><li><strong>Offline availability:</strong> This is impossible with a traditional PHP back-end, but feasible if all the data and code gets shipped to the client.</li></ul><p>We chose Next.js as the front-end framework since it&rsquo;s well-established and supports static exports. Parts of the RLS website run on WordPress, so it&rsquo;s easy to add statically-generated pages and serve them efficiently via Cloudflare (I wasn&rsquo;t keen on complicating the stack by adding a Node backend). With static exports, we regenerate all the species pages whenever the data changes, which means that end-user page requests don&rsquo;t need to touch the database. In addition, the main search page downloads three JSONs with all the data it needs to perform any species search (see <code>sites.json</code>, <code>species.json</code>, and <code>surveys.json</code> in <a href=https://github.com/yanirs/rls-data/tree/master/output target=_blank rel=noopener>the <code>rls-data</code> repo</a>). Minified and compressed, these JSONs add up to less than 2MB of data, which isn&rsquo;t tiny, but it is a small price to pay to avoid hitting the database. The JSONs also cache well on Cloudflare, like the rest of the web app&rsquo;s files.</p><p>From a user perspective, replicating the original functionality was the less exciting part of the project. 
Faster and less buggy code is obviously better, but once feature parity was achieved, we turned our attention to some new features:</p><ul><li><strong>Supporting offline availability and installation</strong> by turning RSoW into a <a href=https://developer.mozilla.org/en-US/docs/Web/Progressive_web_apps target=_blank rel=noopener>progressive web app</a>: On its face, this was supposed to be simple given the <code>next-pwa</code> package, but it turned out to be a bit tricky because the original package was abandoned, and due to multiple layers of caching. It&rsquo;s well-known that <a href=https://martinfowler.com/bliki/TwoHardThings.html target=_blank rel=noopener>cache invalidation is one of the two hard problems in computer science</a> (along with naming things and off-by-one errors), and progressive web apps offer a lovely variety of caches to deal with – everything needs to be cached on the client for offline availability. We got there after some tinkering and dealing with head-scratching bugs, some of which were caused by other caching layers in addition to the client-side caches (including Cloudflare and some misconfiguration of an early version of the app).</li><li><strong>Knowledge test</strong>: A separate grant came along and Uri had the opportunity to extend RSoW by adding a section that helps test new volunteers ahead of them joining RLS.</li><li><strong>Species frequency exploration</strong>: Bringing in the full functionality from <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>the first tool I built for RLS back in 2017</a>.</li><li><strong>Client-side image classification</strong>: Deprecating <a href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/>the Streamlit app I built a couple of years ago</a>.</li></ul><h2 id=data-and-machine-learning-pipelines>Data and machine learning pipelines<a hidden class=anchor aria-hidden=true 
href=#data-and-machine-learning-pipelines>#</a></h2><p>On the back-end, there was an opportunity to simplify things by retiring the original PHP code that processed survey data in favour of the pipelines I implemented in <a href=https://github.com/yanirs/rls-data/ target=_blank rel=noopener>the <code>rls-data</code> repo</a>. Ultimately, survey data comes from the <a href=https://portal.aodn.org.au/ target=_blank rel=noopener>Australian Ocean Data Network</a> (AODN), which holds many more datasets in addition to RLS. Originally, the PHP code that processed survey data into the MySQL database evolved separately from <code>rls-data</code>, which I implemented to generate JSONs for the tools I built. As <code>rls-data</code> is an open source project and the raw survey data is relatively small (&lt;1GB), it made sense to process it with a daily GitHub Action (GHA) script that runs for free. The resultant JSONs are committed to the repo, which means that any unexpected changes are easily tracked (<a href=https://yanirseroussi.com/til/2023/08/14/email-notifications-on-public-github-commits/>I keep an eye on the commits</a>). It was simple to expand the existing <code>rls-data</code> pipelines to generate all the JSONs needed to serve RSoW, and then say goodbye to the PHP code that implemented similar functionality.</p><p>I&rsquo;m aware that running data pipelines with GitHub Actions isn&rsquo;t going to win any awards for sophistication, but it&rsquo;s a great fit for this project. The key principle is to use the right tool for the job, not the shiniest tool.</p><p>One part of the original RSoW that we barely touched was the management interface, which allows RLS admins to update species data and upload pictures. The gains from replacing the admin part of RSoW would have been negligible, so it still runs the old PHP code on top of MySQL. 
Unfortunately, this meant I couldn&rsquo;t retire all the PHP data pipelines, as species data also comes from the Australian Ocean Data Network and is joined with the edits made by RLS admins. This exemplifies the pragmatism that one often needs to apply when faced with <strike>legacy</strike> revenue systems: If a system works and there&rsquo;s no real benefit to replacing it, sticking with the old system is the right thing to do (even if it makes your architecture diagram more complicated).</p><p>I have <a href=https://github.com/yanirs/ichthywhat/issues/3 target=_blank rel=noopener>big plans</a> to improve the machine learning model for inferring RLS species from user images, but it&rsquo;s somehow never a priority. For RSoW, I did make it a priority to support <a href=https://github.com/yanirs/ichthywhat/pull/11 target=_blank rel=noopener>serving the model with a simple API</a>, but then I decided it&rsquo;d be worth the effort to <a href=https://github.com/yanirs/ichthywhat/pull/20 target=_blank rel=noopener>export it to ONNX for client-side image classification</a>. This was partly driven by curiosity about <a href=https://onnx.ai/ target=_blank rel=noopener>ONNX</a>, but it also had two key benefits: (1) support for offline classification; and (2) simplified & cheaper serving architecture, as ONNX models can be served from S3 and don&rsquo;t require RLS to pay for server-side compute.</p><p>As to the machine learning pipelines, they all need to be manually triggered, which is fine since the image data changes slowly. These pipelines are implemented in <a href=https://github.com/yanirs/ichthywhat target=_blank rel=noopener>notebooks and the command-line interface of the <code>ichthywhat</code> repo</a>. 
I have a bit of a dream of this being an early precursor to complete automation of RLS data collection, with the historical RLS data series continued by divers who would mostly serve as video takers and fish scarers (using cameras without human divers would lead to different biases in the data). However, this is a big project that is probably best left to my next PhD, i.e., it may never happen.</p><p>In the meantime, I hope to continue diving with RLS, and aim to make pragmatic decisions to keep RSoW running and supporting the community.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-engineering/>data engineering</a></li><li><a href=https://yanirseroussi.com/tags/data-visualisation/>data visualisation</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/marine-science/>marine science</a></li><li><a href=https://yanirseroussi.com/tags/reef-life-survey/>Reef Life Survey</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li><li><a href=https://yanirseroussi.com/tags/web-development/>web development</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on x" href="https://x.com/intent/tweet/?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&amp;hashtags=dataengineering%2cdatavisualisation%2cmachinelearning%2cmarinescience%2cReefLifeSurvey%2csoftwareengineering%2cwebdevelopment"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 
449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&amp;title=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;summary=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;source=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on reddit" 
href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&title=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share 
Supporting volunteer monitoring of marine biodiversity with modern web and data tools on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on whatsapp" href="https://api.whatsapp.com/send?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools%20-%20https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 
105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on telegram" href="https://telegram.me/share/url?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 
6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on ycombinator" href="https://news.ycombinator.com/submitlink?t=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&u=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="data engineering,data visualisation,machine learning,marine science,Reef Life Survey,software engineering,web development"><meta name=description content="Summarising the work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Supporting volunteer monitoring of marine biodiversity with modern web and data tools"><meta property="og:description" content="Summarising the 
work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"><meta property="og:image" content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp"><meta property="article:section" content="posts"><meta property="article:published_time" content="2023-11-29T02:00:00+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp"><meta name=twitter:title content="Supporting volunteer monitoring of marine biodiversity with modern web and data tools"><meta name=twitter:description content="Summarising the work Uri Seroussi and I did to improve Reef Life Survey&rsquo;s Reef Species of the World app."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Supporting volunteer monitoring of marine biodiversity with modern web and data tools","item":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Supporting volunteer monitoring of marine biodiversity with modern web and data tools","name":"Supporting volunteer monitoring of marine biodiversity with modern web and data 
tools","description":"Summarising the work Uri Seroussi and I did to improve Reef Life Survey\u0026rsquo;s Reef Species of the World app.","keywords":["data engineering","data visualisation","machine learning","marine science","Reef Life Survey","software engineering","web development"],"articleBody":"I’ve been volunteering with the Reef Life Survey (RLS) citizen science project since 2015. RLS volunteers follow the same underwater visual census methodology that has been in use for decades, thereby producing data series that help inform the management of marine ecosystems. In simpler terms, we count fish (and some invertebrates), and this helps various organisations know what’s happening underwater. Among other places, RLS data has been used in scientific publications in Nature and elsewhere, and to inform the management of Australian marine parks.\nOver the years, I created a few online tools to help volunteers with survey work. These included web apps to visualise survey results and study species, as well as infer species from underwater photos. More recently, I agreed to help with the general maintenance of the non-WordPress parts of the RLS website and backend (somewhat reluctantly, but I suppose that’s what happens when you do things out of love).\nTaking greater responsibility to help the tech side of RLS along with an alignment of the research grant stars led to an opportunity to revamp the Reef Species of the World (RSoW) section – a collection of over 5,000 species with in-situ photos, descriptions, and empirical distributions derived from RLS surveys. My focus in this project was on product management, data pipelines, and backend work. I was joined by my brother, Uri Seroussi, who was in charge of front-end development (which became much more substantial than in the original RSoW).\nThe original RSoW was a traditional PHP application that relied on a MySQL database to serve requests, with most of the HTML constructed on the server. 
By contrast, we re-architected the new RSoW as a progressive web app using Next.js, which has the following advantages and new features:\nFully static site: served faster with reduced server load. Faster search and navigation: happens on the front-end without round-trips to the server. Installable app with offline availability: RSoW can now be installed as a mobile or desktop app, and run without an internet connection. Client-side image classification: offline availability includes image classification in the browser, which is useful when surveying in remote areas. Replacement of previous tools and pipelines: providing a more consistent user experience and improved data reliability. The rest of this post provides details on the architecture and implementation of the new RSoW and its underlying data and machine learning pipelines. But the best way of getting a feel for the data and the tools is to have a play yourself.\nThe new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems The RSoW web app We didn’t start with a blank slate: RSoW was already a public website, with many individual species pages ranking well on web searches (the main source of traffic). As such, a guiding principle was to retain as much of the original functionality as possible, and then build new features on top of it.\nWhen approaching a legacy codebase, there’s always the question of whether rebuilding parts or all of it is a worthwhile endeavour. As Jason Cohen notes, a more apt name for “legacy code” is “revenue code”, i.e., the code that embodies all the original and changed requirements, and has withstood the test of time. 
Even though RLS’s code isn’t meant to generate revenue, it’s always easy to mess things up when re-implementing existing functionality.\nThe main reasons we decided on a rewrite of the front-end were:\nUser experience: Speed things up, as some species searches were pretty slow due to server round-trips and inefficient database queries. Extensibility: Make it easier to add new features. Offline availability: This is impossible with a traditional PHP back-end, but feasible if all the data and code gets shipped to the client. We chose Next.js as the front-end framework since it’s well-established and supports static exports. Parts of the RLS website run on WordPress, so it’s easy to add statically-generated pages and serve them efficiently via Cloudflare (I wasn’t keen on complicating the stack by adding a Node backend). With static exports, we regenerate all the species pages whenever the data changes, which means that end-user page requests don’t need to touch the database. In addition, the main search page downloads three JSONs with all the data it needs to perform any species search (see sites.json, species.json, and surveys.json in the rls-data repo). Minified and compressed, these JSONs add up to less than 2MB of data, which isn’t tiny, but it is a small price to pay to avoid hitting the database. The JSONs also cache well on Cloudflare, like the rest of the web app’s files.\nFrom a user perspective, replicating the original functionality was the less exciting part of the project. Faster and less buggy code is obviously better, but once feature parity was achieved, we turned our attention to some new features:\nSupporting offline availability and installation by turning RSoW into a progressive web app: On its face, this was supposed to be simple given the next-pwa package, but it turned out to be a bit tricky because the original package was abandoned, and due to multiple layers of caching. 
It’s well-known that cache invalidation is one of the two hard problems in computer science (along with naming things and off-by-one errors), and progressive web apps offer a lovely variety of caches to deal with – everything needs to be cached on the client for offline availability. We got there after some tinkering and dealing with head-scratching bugs, some of which were caused by other caching layers in addition to the client-side caches (including Cloudflare and some misconfiguration of an early version of the app). Knowledge test: A separate grant came along and Uri had the opportunity to extend RSoW by adding a section that helps test new volunteers ahead of them joining RLS. Species frequency exploration: Bringing in the full functionality from the first tool I built for RLS back in 2017. Client-side image classification: Deprecating the Streamlit app I built a couple of years ago. Data and machine learning pipelines On the back-end, there was an opportunity to simplify things by retiring the original PHP code that processed survey data in favour of the pipelines I implemented in the rls-data repo. Ultimately, survey data comes from the Australian Ocean Data Network (AODN), which holds many more datasets in addition to RLS. Originally, the PHP code that processed survey data into the MySQL database evolved separately from rls-data, which I implemented to generate JSONs for the tools I built. As rls-data is an open source project and the raw survey data is relatively small (\u003c1GB), it made sense to process it with a daily GitHub Action (GHA) script that runs for free. The resultant JSONs are committed to the repo, which means that any unexpected changes are easily tracked (I keep an eye on the commits). 
It was simple to expand the existing rls-data pipelines to generate all the JSONs needed to serve RSoW, and then say goodbye to the PHP code that implemented similar functionality.\nI’m aware that running data pipelines with GitHub Actions isn’t going to win any awards for sophistication, but it’s a great fit for this project. The key principle is to use the right tool for the job, not the shiniest tool.\nOne part of the original RSoW that we barely touched was the management interface, which allows RLS admins to update species data and upload pictures. The gains from replacing the admin part of RSoW would have been negligible, so it still runs the old PHP code on top of MySQL. Unfortunately, this meant I couldn’t retire all the PHP data pipelines, as species data also comes from the Australian Ocean Data Network and is joined with the edits made by RLS admins. This exemplifies the pragmatism that one often needs to apply when faced with legacy revenue systems: If a system works and there’s no real benefit to replacing it, sticking with the old system is the right thing to do (even if it makes your architecture diagram more complicated).\nI have big plans to improve the machine learning model for inferring RLS species from user images, but it’s somehow never a priority. For RSoW, I did make it a priority to support serving the model with a simple API, but then I decided it’d be worth the effort to export it to ONNX for client-side image classification. This was partly driven by curiosity about ONNX, but it also had two key benefits: (1) support for offline classification; and (2) simplified \u0026 cheaper serving architecture, as ONNX models can be served from S3 and don’t require RLS to pay for server-side compute.\nAs to the machine learning pipelines, they all need to be manually triggered, which is fine since the image data changes slowly. These pipelines are implemented in notebooks and the command-line interface of the ichthywhat repo. 
I have a bit of a dream of this being an early precursor to complete automation of RLS data collection, with the historical RLS data series continued by divers who would mostly serve as video takers and fish scarers (using cameras without human divers would lead to different biases in the data). However, this is a big project that is probably best left to my next PhD, i.e., it may never happen.\nIn the meantime, I hope to continue diving with RLS, and aim to make pragmatic decisions to keep RSoW running and supporting the community.\n","wordCount":"1574","inLanguage":"en","image":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp","datePublished":"2023-11-29T02:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" 
stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Supporting volunteer monitoring of marine biodiversity with modern web and data tools</h1><div class=post-meta><span title='2023-11-29 02:00:00 +0000 UTC'>November 29, 2023</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2023-11-29-supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_360x0_resize_q75_h2_box_2.webp 360w 
,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot_huf880d5e70f6cbabeaf9d4b27c6c21664_34088_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp 1200w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/reef-species-of-the-world-screenshot.webp alt="Screenshot of Reef Species of the World" width=1200 height=591></figure><div class=post-content><p>I&rsquo;ve been volunteering with the <a href=https://reeflifesurvey.com/ target=_blank rel=noopener>Reef Life Survey</a> (RLS) citizen science project since 2015. RLS volunteers follow the same underwater visual census methodology that has been in use for decades, thereby producing data series that help inform the management of marine ecosystems. In simpler terms, we count fish (and some invertebrates), and this helps various organisations know what&rsquo;s happening underwater. 
Among other places, RLS data has been used in scientific publications in <a href=https://www.nature.com/articles/s41586-023-05833-y target=_blank rel=noopener>Nature</a> and <a href=https://reeflifesurvey.com/scientific-papers-management-reports/ target=_blank rel=noopener>elsewhere</a>, and to inform the <a href=https://parksaustralia.gov.au/marine/science/reef-life-survey/ target=_blank rel=noopener>management of Australian marine parks</a>.</p><p>Over the years, I created a few online tools to help volunteers with survey work. These included web apps to <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>visualise survey results and study species</a>, as well as <a href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/>infer species from underwater photos</a>. More recently, I agreed to help with the general maintenance of the non-WordPress parts of the RLS website and backend (<a href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/>somewhat reluctantly</a>, but I suppose that&rsquo;s what happens when you do things out of love).</p><p>Taking greater responsibility to help the tech side of RLS along with an alignment of the research grant stars led to an opportunity to revamp the <a href=https://reeflifesurvey.com/species/ target=_blank rel=noopener>Reef Species of the World</a> (RSoW) section – a collection of over 5,000 species with in-situ photos, descriptions, and empirical distributions derived from RLS surveys. My focus in this project was on product management, data pipelines, and backend work. 
I was joined by my brother, <a href=https://www.uriseroussi.com/ target=_blank rel=noopener>Uri Seroussi</a>, who was in charge of front-end development (which became much more substantial than in the original RSoW).</p><p>The original RSoW was a traditional PHP application that relied on a MySQL database to serve requests, with most of the HTML constructed on the server. By contrast, we re-architected the new RSoW as a progressive web app using Next.js, which has the following advantages and new features:</p><ul><li><strong>Fully static site:</strong> served faster with reduced server load.</li><li><strong>Faster search and navigation:</strong> happens on the front-end without round-trips to the server.</li><li><strong>Installable app with offline availability:</strong> RSoW can now be installed as a mobile or desktop app, and run without an internet connection.</li><li><strong>Client-side image classification:</strong> offline availability includes image classification in the browser, which is useful when surveying in remote areas.</li><li><strong>Replacement of previous tools and pipelines:</strong> providing a more consistent user experience and improved data reliability.</li></ul><p>The rest of this post provides details on the architecture and implementation of the new RSoW and its underlying data and machine learning pipelines. 
But the best way of getting a feel for the data and the tools is <a href=https://reeflifesurvey.com/species/ target=_blank rel=noopener>to have a play yourself</a>.</p><figure><a href=reef-species-of-the-world-architecture.svg target=_blank rel=noopener><img src=reef-species-of-the-world-architecture.svg alt="The new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems" loading=lazy></a><figcaption><p>The new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems</p></figcaption></figure><h2 id=the-rsow-web-app>The RSoW web app<a hidden class=anchor aria-hidden=true href=#the-rsow-web-app>#</a></h2><p>We didn&rsquo;t start with a blank slate: RSoW was already a public website, with many individual species pages ranking well on web searches (the main source of traffic). As such, a guiding principle was to retain as much of the original functionality as possible, and then build new features on top of it.</p><p>When approaching a legacy codebase, there&rsquo;s always the question of whether rebuilding parts or all of it is a worthwhile endeavour. As Jason Cohen notes, <a href=https://longform.asmartbear.com/scale target=_blank rel=noopener>a more apt name for &ldquo;legacy code&rdquo; is &ldquo;revenue code&rdquo;</a>, i.e., the code that embodies all the original and changed requirements, and has withstood the test of time. 
Even though RLS&rsquo;s code isn&rsquo;t meant to generate revenue, it&rsquo;s always easy to mess things up when re-implementing existing functionality.</p><p>The main reasons we decided on a rewrite of the front-end were:</p><ul><li><strong>User experience:</strong> Speed things up, as some species searches were pretty slow due to server round-trips and inefficient database queries.</li><li><strong>Extensibility:</strong> Make it easier to add new features.</li><li><strong>Offline availability:</strong> This is impossible with a traditional PHP back-end, but feasible if all the data and code gets shipped to the client.</li></ul><p>We chose Next.js as the front-end framework since it&rsquo;s well-established and supports static exports. Parts of the RLS website run on WordPress, so it&rsquo;s easy to add statically-generated pages and serve them efficiently via Cloudflare (I wasn&rsquo;t keen on complicating the stack by adding a Node backend). With static exports, we regenerate all the species pages whenever the data changes, which means that end-user page requests don&rsquo;t need to touch the database. In addition, the main search page downloads three JSONs with all the data it needs to perform any species search (see <code>sites.json</code>, <code>species.json</code>, and <code>surveys.json</code> in <a href=https://github.com/yanirs/rls-data/tree/master/output target=_blank rel=noopener>the <code>rls-data</code> repo</a>). Minified and compressed, these JSONs add up to less than 2MB of data, which isn&rsquo;t tiny, but it is a small price to pay to avoid hitting the database. The JSONs also cache well on Cloudflare, like the rest of the web app&rsquo;s files.</p><p>From a user perspective, replicating the original functionality was the less exciting part of the project. 
Faster and less buggy code is obviously better, but once feature parity was achieved, we turned our attention to some new features:</p><ul><li><strong>Supporting offline availability and installation</strong> by turning RSoW into a <a href=https://developer.mozilla.org/en-US/docs/Web/Progressive_web_apps target=_blank rel=noopener>progressive web app</a>: On its face, this was supposed to be simple given the <code>next-pwa</code> package, but it turned out to be a bit tricky because the original package was abandoned, and due to multiple layers of caching. It&rsquo;s well-known that <a href=https://martinfowler.com/bliki/TwoHardThings.html target=_blank rel=noopener>cache invalidation is one of the two hard problems in computer science</a> (along with naming things and off-by-one errors), and progressive web apps offer a lovely variety of caches to deal with – everything needs to be cached on the client for offline availability. We got there after some tinkering and dealing with head-scratching bugs, some of which were caused by other caching layers in addition to the client-side caches (including Cloudflare and some misconfiguration of an early version of the app).</li><li><strong>Knowledge test</strong>: A separate grant came along and Uri had the opportunity to extend RSoW by adding a section that helps test new volunteers ahead of them joining RLS.</li><li><strong>Species frequency exploration</strong>: Bringing in the full functionality from <a href=https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/>the first tool I built for RLS back in 2017</a>.</li><li><strong>Client-side image classification</strong>: Deprecating <a href=https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/>the Streamlit app I built a couple of years ago</a>.</li></ul><h2 id=data-and-machine-learning-pipelines>Data and machine learning pipelines<a hidden class=anchor aria-hidden=true 
href=#data-and-machine-learning-pipelines>#</a></h2><p>On the back-end, there was an opportunity to simplify things by retiring the original PHP code that processed survey data in favour of the pipelines I implemented in <a href=https://github.com/yanirs/rls-data/ target=_blank rel=noopener>the <code>rls-data</code> repo</a>. Ultimately, survey data comes from the <a href=https://portal.aodn.org.au/ target=_blank rel=noopener>Australian Ocean Data Network</a> (AODN), which holds many more datasets in addition to RLS. Originally, the PHP code that processed survey data into the MySQL database evolved separately from <code>rls-data</code>, which I implemented to generate JSONs for the tools I built. As <code>rls-data</code> is an open source project and the raw survey data is relatively small (&lt;1GB), it made sense to process it with a daily GitHub Action (GHA) script that runs for free. The resultant JSONs are committed to the repo, which means that any unexpected changes are easily tracked (<a href=https://yanirseroussi.com/til/2023/08/14/email-notifications-on-public-github-commits/>I keep an eye on the commits</a>). It was simple to expand the existing <code>rls-data</code> pipelines to generate all the JSONs needed to serve RSoW, and then say goodbye to the PHP code that implemented similar functionality.</p><p>I&rsquo;m aware that running data pipelines with GitHub Actions isn&rsquo;t going to win any awards for sophistication, but it&rsquo;s a great fit for this project. The key principle is to use the right tool for the job, not the shiniest tool.</p><p>One part of the original RSoW that we barely touched was the management interface, which allows RLS admins to update species data and upload pictures. The gains from replacing the admin part of RSoW would have been negligible, so it still runs the old PHP code on top of MySQL. 
Unfortunately, this meant I couldn&rsquo;t retire all the PHP data pipelines, as species data also comes from the Australian Ocean Data Network and is joined with the edits made by RLS admins. This exemplifies the pragmatism that one often needs to apply when faced with <s>legacy</s> revenue systems: If a system works and there&rsquo;s no real benefit to replacing it, sticking with the old system is the right thing to do (even if it makes your architecture diagram more complicated).</p><p>I have <a href=https://github.com/yanirs/ichthywhat/issues/3 target=_blank rel=noopener>big plans</a> to improve the machine learning model for inferring RLS species from user images, but it&rsquo;s somehow never a priority. For RSoW, I did make it a priority to support <a href=https://github.com/yanirs/ichthywhat/pull/11 target=_blank rel=noopener>serving the model with a simple API</a>, but then I decided it&rsquo;d be worth the effort to <a href=https://github.com/yanirs/ichthywhat/pull/20 target=_blank rel=noopener>export it to ONNX for client-side image classification</a>. This was partly driven by curiosity about <a href=https://onnx.ai/ target=_blank rel=noopener>ONNX</a>, but it also had two key benefits: (1) support for offline classification; and (2) simplified & cheaper serving architecture, as ONNX models can be served from S3 and don&rsquo;t require RLS to pay for server-side compute.</p><p>As to the machine learning pipelines, they all need to be manually triggered, which is fine since the image data changes slowly. These pipelines are implemented in <a href=https://github.com/yanirs/ichthywhat target=_blank rel=noopener>notebooks and the command-line interface of the <code>ichthywhat</code> repo</a>. 
I have a bit of a dream of this being an early precursor to complete automation of RLS data collection, with the historical RLS data series continued by divers who would mostly serve as video takers and fish scarers (using cameras without human divers would lead to different biases in the data). However, this is a big project that is probably best left to my next PhD, i.e., it may never happen.</p><p>In the meantime, I hope to continue diving with RLS, and aim to make pragmatic decisions to keep RSoW running and supporting the community.</p></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-engineering/>data engineering</a></li><li><a href=https://yanirseroussi.com/tags/data-visualisation/>data visualisation</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/marine-science/>marine science</a></li><li><a href=https://yanirseroussi.com/tags/reef-life-survey/>Reef Life Survey</a></li><li><a href=https://yanirseroussi.com/tags/software-engineering/>software engineering</a></li><li><a href=https://yanirseroussi.com/tags/web-development/>web development</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on x" href="https://x.com/intent/tweet/?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&amp;hashtags=dataengineering%2cdatavisualisation%2cmachinelearning%2cmarinescience%2cReefLifeSurvey%2csoftwareengineering%2cwebdevelopment"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 
449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&amp;title=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;summary=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;source=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on reddit" 
href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f&title=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share 
Supporting volunteer monitoring of marine biodiversity with modern web and data tools on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on whatsapp" href="https://api.whatsapp.com/send?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools%20-%20https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 
105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on telegram" href="https://telegram.me/share/url?text=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&amp;url=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 
6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Supporting volunteer monitoring of marine biodiversity with modern web and data tools on ycombinator" href="https://news.ycombinator.com/submitlink?t=Supporting%20volunteer%20monitoring%20of%20marine%20biodiversity%20with%20modern%20web%20and%20data%20tools&u=https%3a%2f%2fyanirseroussi.com%2f2023%2f11%2f29%2fsupporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/about/index.html b/about/index.html
index fcdc72263..1a7d5f600 100644
--- a/about/index.html
+++ b/about/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>About Me | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content><meta name=description content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/about/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="About Me"><meta property="og:description" content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/about/"><meta property="og:image" content="https://yanirseroussi.com/profile.jpg"><meta 
property="article:section" content><meta property="article:modified_time" content="2023-12-18T11:18:46+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/profile.jpg"><meta name=twitter:title content="About Me"><meta name=twitter:description content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"About Me","item":"https://yanirseroussi.com/about/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"About Me","name":"About Me","description":"About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience.","keywords":[],"articleBody":"Data \u0026 AI Expert? As noted on the homepage, you should beware of self-proclaimed experts on the internet and do your own due diligence. I cringe putting the word expert next to my name, as there’s so much to know and learn, especially in a broad area like Data \u0026 AI. Still, underselling myself would also be silly. I’ll let you be the judge.\n⚡ New! These days, I provide independent consulting services around Data \u0026 AI, focusing on small-to-medium organisations in the climate tech and nature-positive sector. See my consulting page for details, or scroll down to contact me.\nObligatory self-promotional blurb I’m an experienced data scientist and software engineer with a deep background in computer science, programming, machine learning, and statistics. My work spans the full spectrum from solving isolated data problems to building production systems that serve millions of users. 
With a proven capability to work independently and in teams, lead and mentor co-workers, and communicate with both technical and non-technical stakeholders, I consistently deliver value to a variety of clients and projects.\nProof points Words are cheap. Any chatbot could generate a blurb like the above. Let’s go deeper with a few highlights from my work:\nBuilding production systems that serve millions of users. In my work with Automattic, I re-architected and led the implementation of the company’s unified online experimentation platform, and co-led the implementation of machine learning pipelines that had a significant impact on revenue from marketing campaigns. Solving isolated data \u0026 machine learning problems. I’m a retired Kaggle competition master, having ranked in the top ten of the five competitions I participated in. I’ve also worked on various other problems throughout my career, but many of them haven’t resulted in public artefacts – such is the nature of commercial data. Software engineering and programming expertise. My undergraduate degree was in computer science, with a focus on software engineering. I graduated first in class from the Technion – a top Israeli university. My early career included software engineering work with big tech companies (Intel, Qualcomm, and Google). I chose to work with startups after my PhD, before going back to a medium tech company (Automattic), and then returning to the startup and freelancing worlds. All my roles included a substantial hands-on coding component. I take software engineering seriously and strive to keep on top of and apply best practices, as solid software is the foundation of all data work. Artificial intelligence and data science expertise. In my PhD I formally specialised in artificial intelligence. As the term artificial intelligence was falling out of favour when I submitted my thesis in 2012, I also say that my PhD is in data science (which may be decreasing in popularity in 2023). 
Either way, it resulted in some publications in top venues and won an award for the best thesis in my faculty at Monash – a leading Australian university. Personally, I don’t think it’s a big deal, but people seem to love PhDs and other credentials! Since my PhD, I’ve continued learning and honing my skills, as reflected by posts on this site and my LinkedIn profile. Outcomes beat job titles One of the downsides of working in an ever-changing field and accumulating a broad range of experiences is that it’s hard to summarise with a concise title. For example, being a data scientist used to imply having strong software engineering skills, but this has changed over time. It’s a similar story with the decline and rise of artificial intelligence. In an ideal world, I’d be able to let my work speak for itself, but we don’t live in such a world (in fact, human work is obsolete in my ideal world). In our world, people search for keywords and have different understandings of concepts, e.g., they may want “an AI solution” to a problem that can be solved with deterministic software engineering. Or they may believe they need an AI Engineer rather than a Data Scientist, when a few years ago it’d have been the opposite (as I’m writing this in 2023, you could replace the words data science with artificial intelligence across my historical posts and much of what I wrote would still hold).\nAnyway, whether you’re trying to navigate Data \u0026 AI terminology or solve specific problems, I can probably help. As noted in my consulting page, my aim is to get to the root of business problems and iteratively implement pragmatic solutions. The taxonomy of Data \u0026 AI professionals is only relevant if I’m helping you hire a team.\nA subset of roles I’ve performed in one way or another.\nSource: Machine Learning Operations (MLOps): Overview, Definition, and Architecture. 
Contact me Feel free to contact me about topics discussed on this website, potential work, or anything else you think I’d find interesting. Before you do, it’s worth checking out my FAQ page to see if your question has already been answered. Contact options include using my contact form, opening a GitHub issue, dropping me a line on LinkedIn, emailing contact at this domain, or booking a paid call.\n","wordCount":"846","inLanguage":"en","image":"https://yanirseroussi.com/profile.jpg","datePublished":"0001-01-01T00:00:00Z","dateModified":"2023-12-18T11:18:46+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/about/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span class=active>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">About Me</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/about/profile.jpg 2692w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/about/profile.jpg alt="Yanir Seroussi's profile picture" width=2692 height=2420></figure><div class=toc><details><summary accesskey=c title="(Alt + C)"><span class=details>Table of Contents</span></summary><div class=inner><ul><li><a href=#data--ai-expert aria-label="Data & AI Expert?">Data & AI Expert?</a></li><li><a href=#obligatory-self-promotional-blurb aria-label="Obligatory self-promotional blurb">Obligatory self-promotional blurb</a></li><li><a href=#proof-points 
aria-label="Proof points">Proof points</a></li><li><a href=#outcomes-beat-job-titles aria-label="Outcomes beat job titles">Outcomes beat job titles</a></li><li><a href=#contact-me aria-label="Contact me">Contact me</a></li></ul></div></details></div><div class=post-content><h2 id=data--ai-expert>Data & AI Expert?<a hidden class=anchor aria-hidden=true href=#data--ai-expert>#</a></h2><p>As noted on <a href=https://yanirseroussi.com/>the homepage</a>, you should beware of self-proclaimed experts on the internet and do your own due diligence. I cringe putting the word <em>expert</em> next to my name, as there&rsquo;s so much to know and learn, especially in a broad area like Data & AI. Still, underselling myself would also be silly. I&rsquo;ll let you be the judge.</p><p><strong>⚡ New!</strong> These days, I provide independent consulting services around Data & AI, focusing on small-to-medium organisations in the climate tech and nature-positive sector. See <a href=/consult/>my consulting page</a> for details, or <a href=#contact-me>scroll down to contact me</a>.</p><h2 id=obligatory-self-promotional-blurb>Obligatory self-promotional blurb<a hidden class=anchor aria-hidden=true href=#obligatory-self-promotional-blurb>#</a></h2><p>I&rsquo;m an experienced data scientist and software engineer with a deep background in computer science, programming, machine learning, and statistics. My work spans the full spectrum from solving isolated data problems to building production systems that serve millions of users. With a proven capability to work independently and in teams, lead and mentor co-workers, and communicate with both technical and non-technical stakeholders, I consistently deliver value to a variety of clients and projects.</p><h2 id=proof-points>Proof points<a hidden class=anchor aria-hidden=true href=#proof-points>#</a></h2><p>Words are cheap. Any chatbot could generate a blurb like the above. 
Let&rsquo;s go deeper with a few highlights from my work:</p><ul><li><strong>Building production systems that serve millions of users.</strong> In <a href=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/>my work with Automattic</a>, I re-architected and led the implementation of the company&rsquo;s unified online experimentation platform, and co-led the implementation of machine learning pipelines that had a significant impact on revenue from marketing campaigns.</li><li><strong>Solving isolated data & machine learning problems.</strong> I&rsquo;m <a href=https://www.kaggle.com/yanirseroussi target=_blank rel=noopener>a retired Kaggle competition master</a>, having <a href=https://yanirseroussi.com/kaggle/>ranked in the top ten of the five competitions I participated in</a>. I&rsquo;ve also worked on various other problems throughout my career, but many of them haven&rsquo;t resulted in public artefacts – such is the nature of commercial data.</li><li><strong>Software engineering and programming expertise.</strong> My undergraduate degree was in computer science, with a focus on software engineering. I graduated first in class from <a href=https://en.wikipedia.org/wiki/Technion_%E2%80%93_Israel_Institute_of_Technology target=_blank rel=noopener>the Technion – a top Israeli university</a>. My early career included software engineering work with big tech companies (Intel, Qualcomm, and Google). I chose to work with startups after my PhD, before going back to a medium tech company (Automattic), and then <a href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>returning to the startup and freelancing worlds</a>. All my roles included a substantial hands-on coding component. 
I take software engineering seriously and strive to keep on top of and apply best practices, as <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>solid software is the foundation of all data work</a>.</li><li><strong>Artificial intelligence and data science expertise.</strong> In my PhD I formally specialised in artificial intelligence. As the term <em>artificial intelligence</em> was falling out of favour when I submitted my thesis in 2012, I also say that my PhD is in <em>data science</em> (which may be decreasing in popularity in 2023). Either way, it resulted in <a href=https://yanirseroussi.com/phd-work/>some publications in top venues</a> and <a href=https://www.monash.edu/news/articles/top-of-the-class target=_blank rel=noopener>won an award for the best thesis in my faculty</a> at <a href=https://en.wikipedia.org/wiki/Monash_University target=_blank rel=noopener>Monash – a leading Australian university</a>. Personally, I don&rsquo;t think it&rsquo;s a big deal, but people seem to love PhDs and other credentials! Since my PhD, I&rsquo;ve continued learning and honing my skills, as reflected by <a href=https://yanirseroussi.com/>posts on this site</a> and <a href=https://www.linkedin.com/in/yanirseroussi/ target=_blank rel=noopener>my LinkedIn profile</a>.</li></ul><h2 id=outcomes-beat-job-titles>Outcomes beat job titles<a hidden class=anchor aria-hidden=true href=#outcomes-beat-job-titles>#</a></h2><p>One of the downsides of working in an ever-changing field and accumulating a broad range of experiences is that it&rsquo;s hard to summarise with a concise title. For example, being a data scientist <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>used to imply having strong software engineering skills</a>, but <a href=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/>this has changed over time</a>. It&rsquo;s a similar story with the decline and rise of artificial intelligence. 
In an ideal world, I&rsquo;d be able to let my work speak for itself, but we don&rsquo;t live in such a world (in fact, <a href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/>human work is obsolete in my ideal world</a>). In our world, people search for keywords and have different understandings of concepts, e.g., they may want &ldquo;an AI solution&rdquo; to a problem that can be solved with deterministic software engineering. Or they may believe they need an AI Engineer rather than a Data Scientist, when a few years ago it&rsquo;d have been the opposite (as I&rsquo;m writing this in 2023, you could replace the words <em>data science</em> with <em>artificial intelligence</em> across my historical posts and much of what I wrote would still hold).</p><p>Anyway, whether you&rsquo;re trying to navigate Data & AI terminology or solve specific problems, I can probably help. As noted in <a href=/consult/>my consulting page</a>, my aim is to get to the root of business problems and iteratively implement pragmatic solutions. The taxonomy of Data & AI professionals is only relevant if I&rsquo;m helping you hire a team.</p><figure><a href=mlops-roles.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
+<meta name=keywords content><meta name=description content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/about/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="About Me"><meta property="og:description" content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/about/"><meta property="og:image" content="https://yanirseroussi.com/about/profile.jpg"><meta 
property="article:section" content><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/about/profile.jpg"><meta name=twitter:title content="About Me"><meta name=twitter:description content="About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"About Me","item":"https://yanirseroussi.com/about/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"About Me","name":"About Me","description":"About Yanir Seroussi, a full-stack data scientist and software engineer with over a decade of experience.","keywords":[],"articleBody":"Data \u0026 AI Expert? As noted on the homepage, you should beware of self-proclaimed experts on the internet and do your own due diligence. I cringe putting the word expert next to my name, as there’s so much to know and learn, especially in a broad area like Data \u0026 AI. Still, underselling myself would also be silly. I’ll let you be the judge.\n⚡ New! These days, I provide independent consulting services around Data \u0026 AI, focusing on small-to-medium organisations in the climate tech and nature-positive sector. See my consulting page for details, or scroll down to contact me.\nObligatory self-promotional blurb I’m an experienced data scientist and software engineer with a deep background in computer science, programming, machine learning, and statistics. My work spans the full spectrum from solving isolated data problems to building production systems that serve millions of users. 
With a proven capability to work independently and in teams, lead and mentor co-workers, and communicate with both technical and non-technical stakeholders, I consistently deliver value to a variety of clients and projects.\nProof points Words are cheap. Any chatbot could generate a blurb like the above. Let’s go deeper with a few highlights from my work:\nBuilding production systems that serve millions of users. In my work with Automattic, I re-architected and led the implementation of the company’s unified online experimentation platform, and co-led the implementation of machine learning pipelines that had a significant impact on revenue from marketing campaigns. Solving isolated data \u0026 machine learning problems. I’m a retired Kaggle competition master, having ranked in the top ten of the five competitions I participated in. I’ve also worked on various other problems throughout my career, but many of them haven’t resulted in public artefacts – such is the nature of commercial data. Software engineering and programming expertise. My undergraduate degree was in computer science, with a focus on software engineering. I graduated first in class from the Technion – a top Israeli university. My early career included software engineering work with big tech companies (Intel, Qualcomm, and Google). I chose to work with startups after my PhD, before going back to a medium tech company (Automattic), and then returning to the startup and freelancing worlds. All my roles included a substantial hands-on coding component. I take software engineering seriously and strive to keep on top of and apply best practices, as solid software is the foundation of all data work. Artificial intelligence and data science expertise. In my PhD I formally specialised in artificial intelligence. As the term artificial intelligence was falling out of favour when I submitted my thesis in 2012, I also say that my PhD is in data science (which may be decreasing in popularity in 2023). 
Either way, it resulted in some publications in top venues and won an award for the best thesis in my faculty at Monash – a leading Australian university. Personally, I don’t think it’s a big deal, but people seem to love PhDs and other credentials! Since my PhD, I’ve continued learning and honing my skills, as reflected by posts on this site and my LinkedIn profile. Outcomes beat job titles One of the downsides of working in an ever-changing field and accumulating a broad range of experiences is that it’s hard to summarise with a concise title. For example, being a data scientist used to imply having strong software engineering skills, but this has changed over time. It’s a similar story with the decline and rise of artificial intelligence. In an ideal world, I’d be able to let my work speak for itself, but we don’t live in such a world (in fact, human work is obsolete in my ideal world). In our world, people search for keywords and have different understandings of concepts, e.g., they may want “an AI solution” to a problem that can be solved with deterministic software engineering. Or they may believe they need an AI Engineer rather than a Data Scientist, when a few years ago it’d have been the opposite (as I’m writing this in 2023, you could replace the words data science with artificial intelligence across my historical posts and much of what I wrote would still hold).\nAnyway, whether you’re trying to navigate Data \u0026 AI terminology or solve specific problems, I can probably help. As noted in my consulting page, my aim is to get to the root of business problems and iteratively implement pragmatic solutions. The taxonomy of Data \u0026 AI professionals is only relevant if I’m helping you hire a team.\nA subset of roles I’ve performed in one way or another.\nSource: Machine Learning Operations (MLOps): Overview, Definition, and Architecture. 
Contact me Feel free to contact me about topics discussed on this website, potential work, or anything else you think I’d find interesting. Before you do, it’s worth checking out my FAQ page to see if your question has already been answered. Contact options include using my contact form, opening a GitHub issue, dropping me a line on LinkedIn, emailing contact at this domain, or booking a paid call.\n","wordCount":"846","inLanguage":"en","image":"https://yanirseroussi.com/about/profile.jpg","datePublished":"0001-01-01T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/about/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" 
y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span class=active>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">About Me</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/about/profile_huff5fdd8a3d7ddcafd6d7832f9991a5bf_676979_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/about/profile.jpg 2692w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/about/profile.jpg alt="Yanir Seroussi's profile picture" width=2692 height=2420></figure><div class=toc><details><summary accesskey=c title="(Alt + C)"><span class=details>Table of Contents</span></summary><div class=inner><ul><li><a href=#data--ai-expert aria-label="Data & AI Expert?">Data & AI Expert?</a></li><li><a href=#obligatory-self-promotional-blurb aria-label="Obligatory self-promotional blurb">Obligatory self-promotional blurb</a></li><li><a href=#proof-points 
aria-label="Proof points">Proof points</a></li><li><a href=#outcomes-beat-job-titles aria-label="Outcomes beat job titles">Outcomes beat job titles</a></li><li><a href=#contact-me aria-label="Contact me">Contact me</a></li></ul></div></details></div><div class=post-content><h2 id=data--ai-expert>Data & AI Expert?<a hidden class=anchor aria-hidden=true href=#data--ai-expert>#</a></h2><p>As noted on <a href=https://yanirseroussi.com/>the homepage</a>, you should beware of self-proclaimed experts on the internet and do your own due diligence. I cringe putting the word <em>expert</em> next to my name, as there&rsquo;s so much to know and learn, especially in a broad area like Data & AI. Still, underselling myself would also be silly. I&rsquo;ll let you be the judge.</p><p><strong>⚡ New!</strong> These days, I provide independent consulting services around Data & AI, focusing on small-to-medium organisations in the climate tech and nature-positive sector. See <a href=/consult/>my consulting page</a> for details, or <a href=#contact-me>scroll down to contact me</a>.</p><h2 id=obligatory-self-promotional-blurb>Obligatory self-promotional blurb<a hidden class=anchor aria-hidden=true href=#obligatory-self-promotional-blurb>#</a></h2><p>I&rsquo;m an experienced data scientist and software engineer with a deep background in computer science, programming, machine learning, and statistics. My work spans the full spectrum from solving isolated data problems to building production systems that serve millions of users. With a proven capability to work independently and in teams, lead and mentor co-workers, and communicate with both technical and non-technical stakeholders, I consistently deliver value to a variety of clients and projects.</p><h2 id=proof-points>Proof points<a hidden class=anchor aria-hidden=true href=#proof-points>#</a></h2><p>Words are cheap. Any chatbot could generate a blurb like the above. 
Let&rsquo;s go deeper with a few highlights from my work:</p><ul><li><strong>Building production systems that serve millions of users.</strong> In <a href=https://yanirseroussi.com/2021/10/07/my-work-with-automattic/>my work with Automattic</a>, I re-architected and led the implementation of the company&rsquo;s unified online experimentation platform, and co-led the implementation of machine learning pipelines that had a significant impact on revenue from marketing campaigns.</li><li><strong>Solving isolated data & machine learning problems.</strong> I&rsquo;m <a href=https://www.kaggle.com/yanirseroussi target=_blank rel=noopener>a retired Kaggle competition master</a>, having <a href=https://yanirseroussi.com/kaggle/>ranked in the top ten of the five competitions I participated in</a>. I&rsquo;ve also worked on various other problems throughout my career, but many of them haven&rsquo;t resulted in public artefacts – such is the nature of commercial data.</li><li><strong>Software engineering and programming expertise.</strong> My undergraduate degree was in computer science, with a focus on software engineering. I graduated first in class from <a href=https://en.wikipedia.org/wiki/Technion_%E2%80%93_Israel_Institute_of_Technology target=_blank rel=noopener>the Technion – a top Israeli university</a>. My early career included software engineering work with big tech companies (Intel, Qualcomm, and Google). I chose to work with startups after my PhD, before going back to a medium tech company (Automattic), and then <a href=https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/>returning to the startup and freelancing worlds</a>. All my roles included a substantial hands-on coding component. 
I take software engineering seriously and strive to keep on top of and apply best practices, as <a href=https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/>solid software is the foundation of all data work</a>.</li><li><strong>Artificial intelligence and data science expertise.</strong> In my PhD I formally specialised in artificial intelligence. As the term <em>artificial intelligence</em> was falling out of favour when I submitted my thesis in 2012, I also say that my PhD is in <em>data science</em> (which may be decreasing in popularity in 2023). Either way, it resulted in <a href=https://yanirseroussi.com/phd-work/>some publications in top venues</a> and <a href=https://www.monash.edu/news/articles/top-of-the-class target=_blank rel=noopener>won an award for the best thesis in my faculty</a> at <a href=https://en.wikipedia.org/wiki/Monash_University target=_blank rel=noopener>Monash – a leading Australian university</a>. Personally, I don&rsquo;t think it&rsquo;s a big deal, but people seem to love PhDs and other credentials! Since my PhD, I&rsquo;ve continued learning and honing my skills, as reflected by <a href=https://yanirseroussi.com/>posts on this site</a> and <a href=https://www.linkedin.com/in/yanirseroussi/ target=_blank rel=noopener>my LinkedIn profile</a>.</li></ul><h2 id=outcomes-beat-job-titles>Outcomes beat job titles<a hidden class=anchor aria-hidden=true href=#outcomes-beat-job-titles>#</a></h2><p>One of the downsides of working in an ever-changing field and accumulating a broad range of experiences is that it&rsquo;s hard to summarise with a concise title. For example, being a data scientist <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/>used to imply having strong software engineering skills</a>, but <a href=https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/>this has changed over time</a>. It&rsquo;s a similar story with the decline and rise of artificial intelligence. 
In an ideal world, I&rsquo;d be able to let my work speak for itself, but we don&rsquo;t live in such a world (in fact, <a href=https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/>human work is obsolete in my ideal world</a>). In our world, people search for keywords and have different understandings of concepts, e.g., they may want &ldquo;an AI solution&rdquo; to a problem that can be solved with deterministic software engineering. Or they may believe they need an AI Engineer rather than a Data Scientist, when a few years ago it&rsquo;d have been the opposite (as I&rsquo;m writing this in 2023, you could replace the words <em>data science</em> with <em>artificial intelligence</em> across my historical posts and much of what I wrote would still hold).</p><p>Anyway, whether you&rsquo;re trying to navigate Data & AI terminology or solve specific problems, I can probably help. As noted in <a href=/consult/>my consulting page</a>, my aim is to get to the root of business problems and iteratively implement pragmatic solutions. The taxonomy of Data & AI professionals is only relevant if I&rsquo;m helping you hire a team.</p><figure><a href=mlops-roles.png target=_blank rel=noopener><img sizes="(min-width: 768px) 720px,
 100vw" srcset="https://yanirseroussi.com/about/mlops-roles_hua09c876fee5ef02e6d697df08d86922e_54764_360x0_resize_box_3.png 360w,
 https://yanirseroussi.com/about/mlops-roles_hua09c876fee5ef02e6d697df08d86922e_54764_480x0_resize_box_3.png 480w,
 https://yanirseroussi.com/about/mlops-roles_hua09c876fee5ef02e6d697df08d86922e_54764_720x0_resize_box_3.png 720w,
diff --git a/consult/index.html b/consult/index.html
index 9e87e6dd6..67b39e9e8 100644
--- a/consult/index.html
+++ b/consult/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Data & AI Consulting Services | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content><meta name=description content="Yanir Seroussi&rsquo;s Data & AI consulting services."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/consult/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Data & AI Consulting Services"><meta property="og:description" content="Yanir Seroussi&rsquo;s Data & AI consulting services."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/consult/"><meta property="og:image" content="https://yanirseroussi.com/wolchulsan-cloud-bridge.webp"><meta property="article:section" content><meta property="article:modified_time" 
content="2023-12-12T15:11:17+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/wolchulsan-cloud-bridge.webp"><meta name=twitter:title content="Data & AI Consulting Services"><meta name=twitter:description content="Yanir Seroussi&rsquo;s Data & AI consulting services."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Data \u0026 AI Consulting Services","item":"https://yanirseroussi.com/consult/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Data \u0026 AI Consulting Services","name":"Data \u0026 AI Consulting Services","description":"Yanir Seroussi\u0026rsquo;s Data \u0026amp; AI consulting services.","keywords":[],"articleBody":"This is a high-level overview of my approach to consulting and the sort of problems I can help with. Feel free to contact me for more details.\nPrinciples When approaching consulting engagements, I aim to follow these key principles:\nGetting to the root of business problems. Iteratively implementing pragmatic solutions. Saying what I’ll do, doing what I said, and communicating if anything changes. Offerings With my broad experience in data science, software engineering, and artificial intelligence, I can help in a variety of situations. My key offerings are:\nShort one-off advisory sessions to address specific challenges and questions (book paid call). Longer engagements to tackle data \u0026 AI problems, which start with problem discovery and lead to hands-on implementation work (contact me to discuss). Mid-to-long term engagements as a Fractional Chief Data \u0026 AI Officer for startups and small-to-medium organisations (see slide deck for details). 
Examples Examples of the above offerings:\nA short call to discuss an ongoing computer vision project, where the client was unsure the consultant they’ve retained was on the right path. I provided advice on where they should focus, based on my experience with best practices in such projects. An internal consulting project with Automattic, where the aim was to increase WordPress.com customer retention rates. Artefacts from this project are still in use by the company five years later. I served in data \u0026 AI leadership roles with multiple startups. While those weren’t fractional, a clear theme is that it’s hard for non-data people to make the first data hires and give them the right tasks. These are the sort of items I can help with fractionally. Ideal clients and areas My main interest these days is to apply my skills within the climate tech and nature-positive sector. However, I’m happy to discuss any project that is likely to have a positive impact on humanity and our shared environment.\nI have worked with large companies (Intel, Qualcomm, Google, and Commonwealth Bank), medium-size businesses (Automattic – WordPress.com), several startups (like Orkestra – renewable energy space and Car Next Door – now Uber Carshare), and nonprofits (Reef Life Survey, GetUp!, and Work on Climate). 
As an independent consultant, I prefer working with stakeholders who have skin in the game, which typically means working with small-to-medium organisations in settings where it’s possible to quickly ship iterative solutions.\nNext steps Contact me if you’re interested in working together.\n","wordCount":"398","inLanguage":"en","image":"https://yanirseroussi.com/wolchulsan-cloud-bridge.webp","datePublished":"0001-01-01T00:00:00Z","dateModified":"2023-12-12T15:11:17+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/consult/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" 
y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span class=active>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Data & AI Consulting Services</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp 1200w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp alt="Let's cross bridges together." width=1200 height=675><p>Let&rsquo;s cross bridges together.</p></figure><div class=post-content><p>This is a high-level overview of my approach to consulting and the sort of problems I can help with. 
Feel free to <a href=/about/#contact-me>contact me</a> for more details.</p><h2 id=principles>Principles<a hidden class=anchor aria-hidden=true href=#principles>#</a></h2><p>When approaching consulting engagements, I aim to follow these key principles:</p><ul><li>Getting to the root of business problems.</li><li>Iteratively implementing pragmatic solutions.</li><li>Saying what I&rsquo;ll do, doing what I said, and communicating if anything changes.</li></ul><h2 id=offerings>Offerings<a hidden class=anchor aria-hidden=true href=#offerings>#</a></h2><p>With <a href=/about/>my broad experience in data science, software engineering, and artificial intelligence</a>, I can help in a variety of situations. My key offerings are:</p><ul><li>Short one-off advisory sessions to address specific challenges and questions (<a href=https://talkw.me/@yanir target=_blank rel=noopener>book paid call</a>).</li><li>Longer engagements to tackle data & AI problems, which start with problem discovery and lead to hands-on implementation work (<a href=/about/#contact-me>contact me to discuss</a>).</li><li>Mid-to-long term engagements as a Fractional Chief Data & AI Officer for startups and small-to-medium organisations (see <a href=/fractional-chief-data-officer/#/>slide deck</a> for details).</li></ul><h2 id=examples>Examples<a hidden class=anchor aria-hidden=true href=#examples>#</a></h2><p>Examples of the above offerings:</p><ul><li>A short call to discuss an ongoing computer vision project, where the client was unsure the consultant they&rsquo;ve retained was on the right path. I provided advice on where they should focus, based on my experience with best practices in such projects.</li><li>An <a href=https://data.blog/2019/01/15/how-to-increase-retention-and-revenue-in-1000-nontrivial-steps/ target=_blank rel=noopener>internal consulting project with Automattic</a>, where the aim was to increase WordPress.com customer retention rates. 
Artefacts from this project are still in use by the company five years later.</li><li>I served in data & AI leadership roles with multiple startups. While those weren&rsquo;t fractional, a clear theme is that it&rsquo;s hard for non-data people to make the first data hires and give them the right tasks. These are the sort of items I can help with fractionally.</li></ul><h2 id=ideal-clients-and-areas>Ideal clients and areas<a hidden class=anchor aria-hidden=true href=#ideal-clients-and-areas>#</a></h2><p>My main interest these days is to apply my skills within the climate tech and nature-positive sector. However, I&rsquo;m happy to discuss any project that is likely to have a positive impact on humanity and our shared environment.</p><p>I have worked with large companies (Intel, Qualcomm, Google, and Commonwealth Bank), medium-size businesses (Automattic – WordPress.com), several startups (like Orkestra – renewable energy space and Car Next Door – now Uber Carshare), and nonprofits (Reef Life Survey, GetUp!, and Work on Climate). 
As an independent consultant, I prefer working with stakeholders who have skin in the game, which typically means working with small-to-medium organisations in settings where it&rsquo;s possible to quickly ship iterative solutions.</p><h2 id=next-steps>Next steps<a hidden class=anchor aria-hidden=true href=#next-steps>#</a></h2><p><a href=/about/#contact-me>Contact me</a> if you&rsquo;re interested in working together.</p></div><footer class=post-footer><ul class=post-tags></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on x" href="https://x.com/intent/tweet/?text=Data%20%26%20AI%20Consulting%20Services&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&amp;hashtags="><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&amp;title=Data%20%26%20AI%20Consulting%20Services&amp;summary=Data%20%26%20AI%20Consulting%20Services&amp;source=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 
225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&title=Data%20%26%20AI%20Consulting%20Services"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 
16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on whatsapp" href="https://api.whatsapp.com/send?text=Data%20%26%20AI%20Consulting%20Services%20-%20https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 
148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on telegram" href="https://telegram.me/share/url?text=Data%20%26%20AI%20Consulting%20Services&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on ycombinator" href="https://news.ycombinator.com/submitlink?t=Data%20%26%20AI%20Consulting%20Services&u=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg width="30" height="30" viewBox="0 0 512 512" 
fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
+<meta name=keywords content><meta name=description content="Yanir Seroussi&rsquo;s Data & AI consulting services."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/consult/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Data & AI Consulting Services"><meta property="og:description" content="Yanir Seroussi&rsquo;s Data & AI consulting services."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/consult/"><meta property="og:image" content="https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp"><meta property="article:section" content><meta 
property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp"><meta name=twitter:title content="Data & AI Consulting Services"><meta name=twitter:description content="Yanir Seroussi&rsquo;s Data & AI consulting services."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Data \u0026 AI Consulting Services","item":"https://yanirseroussi.com/consult/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Data \u0026 AI Consulting Services","name":"Data \u0026 AI Consulting Services","description":"Yanir Seroussi\u0026rsquo;s Data \u0026amp; AI consulting services.","keywords":[],"articleBody":"This is a high-level overview of my approach to consulting and the sort of problems I can help with. Feel free to contact me for more details.\nPrinciples When approaching consulting engagements, I aim to follow these key principles:\nGetting to the root of business problems. Iteratively implementing pragmatic solutions. Saying what I’ll do, doing what I said, and communicating if anything changes. Offerings With my broad experience in data science, software engineering, and artificial intelligence, I can help in a variety of situations. My key offerings are:\nShort one-off advisory sessions to address specific challenges and questions (book paid call). Longer engagements to tackle data \u0026 AI problems, which start with problem discovery and lead to hands-on implementation work (contact me to discuss). Mid-to-long term engagements as a Fractional Chief Data \u0026 AI Officer for startups and small-to-medium organisations (see slide deck for details). 
Examples Examples of the above offerings:\nA short call to discuss an ongoing computer vision project, where the client was unsure the consultant they’ve retained was on the right path. I provided advice on where they should focus, based on my experience with best practices in such projects. An internal consulting project with Automattic, where the aim was to increase WordPress.com customer retention rates. Artefacts from this project are still in use by the company five years later. I served in data \u0026 AI leadership roles with multiple startups. While those weren’t fractional, a clear theme is that it’s hard for non-data people to make the first data hires and give them the right tasks. These are the sort of items I can help with fractionally. Ideal clients and areas My main interest these days is to apply my skills within the climate tech and nature-positive sector. However, I’m happy to discuss any project that is likely to have a positive impact on humanity and our shared environment.\nI have worked with large companies (Intel, Qualcomm, Google, and Commonwealth Bank), medium-size businesses (Automattic – WordPress.com), several startups (like Orkestra – renewable energy space and Car Next Door – now Uber Carshare), and nonprofits (Reef Life Survey, GetUp!, and Work on Climate). 
As an independent consultant, I prefer working with stakeholders who have skin in the game, which typically means working with small-to-medium organisations in settings where it’s possible to quickly ship iterative solutions.\nNext steps Contact me if you’re interested in working together.\n","wordCount":"398","inLanguage":"en","image":"https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp","datePublished":"0001-01-01T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/consult/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" 
y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span class=active>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Data & AI Consulting Services</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_360x0_resize_q75_h2_box_2.webp 360w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_480x0_resize_q75_h2_box_2.webp 480w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_720x0_resize_q75_h2_box_2.webp 720w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge_huac79323872f5dda72c8669be8a3a406c_308138_1080x0_resize_q75_h2_box_2.webp 1080w ,https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp 1200w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/consult/wolchulsan-cloud-bridge.webp alt="Let's cross bridges together." width=1200 height=675><p>Let&rsquo;s cross bridges together.</p></figure><div class=post-content><p>This is a high-level overview of my approach to consulting and the sort of problems I can help with. 
Feel free to <a href=/about/#contact-me>contact me</a> for more details.</p><h2 id=principles>Principles<a hidden class=anchor aria-hidden=true href=#principles>#</a></h2><p>When approaching consulting engagements, I aim to follow these key principles:</p><ul><li>Getting to the root of business problems.</li><li>Iteratively implementing pragmatic solutions.</li><li>Saying what I&rsquo;ll do, doing what I said, and communicating if anything changes.</li></ul><h2 id=offerings>Offerings<a hidden class=anchor aria-hidden=true href=#offerings>#</a></h2><p>With <a href=/about/>my broad experience in data science, software engineering, and artificial intelligence</a>, I can help in a variety of situations. My key offerings are:</p><ul><li>Short one-off advisory sessions to address specific challenges and questions (<a href=https://talkw.me/@yanir target=_blank rel=noopener>book paid call</a>).</li><li>Longer engagements to tackle data & AI problems, which start with problem discovery and lead to hands-on implementation work (<a href=/about/#contact-me>contact me to discuss</a>).</li><li>Mid-to-long term engagements as a Fractional Chief Data & AI Officer for startups and small-to-medium organisations (see <a href=/fractional-chief-data-officer/#/>slide deck</a> for details).</li></ul><h2 id=examples>Examples<a hidden class=anchor aria-hidden=true href=#examples>#</a></h2><p>Examples of the above offerings:</p><ul><li>A short call to discuss an ongoing computer vision project, where the client was unsure the consultant they&rsquo;ve retained was on the right path. I provided advice on where they should focus, based on my experience with best practices in such projects.</li><li>An <a href=https://data.blog/2019/01/15/how-to-increase-retention-and-revenue-in-1000-nontrivial-steps/ target=_blank rel=noopener>internal consulting project with Automattic</a>, where the aim was to increase WordPress.com customer retention rates. 
Artefacts from this project are still in use by the company five years later.</li><li>I served in data & AI leadership roles with multiple startups. While those weren&rsquo;t fractional, a clear theme is that it&rsquo;s hard for non-data people to make the first data hires and give them the right tasks. These are the sort of items I can help with fractionally.</li></ul><h2 id=ideal-clients-and-areas>Ideal clients and areas<a hidden class=anchor aria-hidden=true href=#ideal-clients-and-areas>#</a></h2><p>My main interest these days is to apply my skills within the climate tech and nature-positive sector. However, I&rsquo;m happy to discuss any project that is likely to have a positive impact on humanity and our shared environment.</p><p>I have worked with large companies (Intel, Qualcomm, Google, and Commonwealth Bank), medium-size businesses (Automattic – WordPress.com), several startups (like Orkestra – renewable energy space and Car Next Door – now Uber Carshare), and nonprofits (Reef Life Survey, GetUp!, and Work on Climate). 
As an independent consultant, I prefer working with stakeholders who have skin in the game, which typically means working with small-to-medium organisations in settings where it&rsquo;s possible to quickly ship iterative solutions.</p><h2 id=next-steps>Next steps<a hidden class=anchor aria-hidden=true href=#next-steps>#</a></h2><p><a href=/about/#contact-me>Contact me</a> if you&rsquo;re interested in working together.</p></div><footer class=post-footer><ul class=post-tags></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on x" href="https://x.com/intent/tweet/?text=Data%20%26%20AI%20Consulting%20Services&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&amp;hashtags="><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&amp;title=Data%20%26%20AI%20Consulting%20Services&amp;summary=Data%20%26%20AI%20Consulting%20Services&amp;source=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 
225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f&title=Data%20%26%20AI%20Consulting%20Services"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 
16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on whatsapp" href="https://api.whatsapp.com/send?text=Data%20%26%20AI%20Consulting%20Services%20-%20https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 
148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on telegram" href="https://telegram.me/share/url?text=Data%20%26%20AI%20Consulting%20Services&amp;url=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Data & AI Consulting Services on ycombinator" href="https://news.ycombinator.com/submitlink?t=Data%20%26%20AI%20Consulting%20Services&u=https%3a%2f%2fyanirseroussi.com%2fconsult%2f"><svg width="30" height="30" viewBox="0 0 512 512" 
fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
         <a href=https://github.com/adityatelange/hugo-PaperMod/ rel=noopener target=_blank>PaperMod</a></span></footer><div class=mailing-list-container><form class=mailing-list action="https://yanirseroussi.us17.list-manage.com/subscribe/post?u=3c08aa3ff27dd92978019febd&amp;id=bc3ab705af" method=post target=_blank novalidate><label for=mailing-list-email>Get new post notifications</label>
diff --git a/kaggle/index.html b/kaggle/index.html
index 13a4394b0..8bcb4fbe7 100644
--- a/kaggle/index.html
+++ b/kaggle/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Kaggle competition tips and summaries | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="data science,Kaggle,Kaggle competition,predictive modelling"><meta name=description content="Pointers to all my Kaggle advice posts and competition summaries."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/kaggle/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Kaggle competition tips and summaries"><meta property="og:description" content="Pointers to all my Kaggle advice posts and competition summaries."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/kaggle/"><meta property="og:image" 
content="https://yanirseroussi.com/kaggle-logo-transparent.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-04-05T23:46:10+00:00"><meta property="article:modified_time" content="2023-07-06T09:28:02+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/kaggle-logo-transparent.png"><meta name=twitter:title content="Kaggle competition tips and summaries"><meta name=twitter:description content="Pointers to all my Kaggle advice posts and competition summaries."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Kaggle competition tips and summaries","item":"https://yanirseroussi.com/kaggle/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Kaggle competition tips and summaries","name":"Kaggle competition tips and summaries","description":"Pointers to all my Kaggle advice posts and competition summaries.","keywords":["data science","Kaggle","Kaggle competition","predictive modelling"],"articleBody":"Over the years, I’ve participated in a few Kaggle competitions and wrote a bit about my experiences. 
This page contains pointers to all my posts, and will be updated if/when I participate in more competitions.\nGeneral advice posts 10 Steps to Success in Kaggle Data Science Competitions (guest post on KDNuggets) How to (almost) win Kaggle competitions Kaggle beginner tips Solution posts Greek Media Monitoring Multilabel Classification [6th/120] – multi-label classification of pre-tokenised texts Personalised Web Search Challenge [9th/194] – reranking web search results in a personalised manner Blue Book for Bulldozers [9th/476] – forecasting auction sale price of bulldozers ICFHR 2012 – Arabic Writer Identification Competition [3rd/42] – classifying handwritten texts by the identity of the writer (Kaggle blog post) EMC Data Science Global Hackathon (Air Quality Prediction) [6th/110] – forecasting levels of air pollutants (Kaggle forum post) ","wordCount":"139","inLanguage":"en","image":"https://yanirseroussi.com/kaggle-logo-transparent.png","datePublished":"2014-04-05T23:46:10Z","dateModified":"2023-07-06T09:28:02+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/kaggle/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 
0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Kaggle competition tips and summaries</h1><div class=post-meta><span title='2014-04-05 23:46:10 +0000 UTC'>April 5, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-04-05-kaggle-competition-summaries/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_720x0_resize_box_3.png 720w ,https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png 
1056w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png alt width=1056 height=480></figure><div class=post-content><p>Over the years, I&rsquo;ve participated in a few <a href=https://www.kaggle.com target=_blank rel=noopener>Kaggle</a> competitions and wrote a bit about my experiences. This page contains pointers to all my posts, and will be updated if/when I participate in more competitions.</p><h3 id=general-advice-posts>General advice posts<a hidden class=anchor aria-hidden=true href=#general-advice-posts>#</a></h3><ul><li><a href=http://www.kdnuggets.com/2015/03/10-steps-success-kaggle-data-science-competitions.html target=_blank rel=noopener>10 Steps to Success in Kaggle Data Science Competitions (guest post on KDNuggets)</a></li><li><a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ target=_blank rel=noopener>How to (almost) win Kaggle competitions</a></li><li><a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/ target=_blank rel=noopener>Kaggle beginner tips</a></li></ul><h3 id=solution-posts>Solution posts<a hidden class=anchor aria-hidden=true href=#solution-posts>#</a></h3><ul><li><a href=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/>Greek Media Monitoring Multilabel Classification</a> [6th/120] – multi-label classification of pre-tokenised texts</li><li><a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)">Personalised Web Search Challenge</a> [9th/194] – reranking web search results in a personalised manner</li><li><a href=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/ title="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)">Blue Book for Bulldozers</a> [9th/476] – forecasting auction sale price of bulldozers</li><li><a href=http://blog.kaggle.com/2012/04/29/on-diffusion-kernels-histograms-and-arabic-writer-identification/ target=_blank rel=noopener>ICFHR 2012 – Arabic Writer Identification Competition</a> [3rd/42] – classifying handwritten texts by the identity of the writer (Kaggle blog post)</li><li><a href=https://www.kaggle.com/c/dsg-hackathon/forums/t/1821/general-approaches-to-partitioning-the-models/10631#post10631 target=_blank rel=noopener>EMC Data Science Global Hackathon (Air Quality Prediction)</a> [6th/110] – forecasting levels of air pollutants (Kaggle forum post)</li></ul></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/kaggle/>Kaggle</a></li><li><a href=https://yanirseroussi.com/tags/kaggle-competition/>Kaggle competition</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on x" href="https://x.com/intent/tweet/?text=Kaggle%20competition%20tips%20and%20summaries&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&amp;hashtags=datascience%2cKaggle%2cKagglecompetition%2cpredictivemodelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 
28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&amp;title=Kaggle%20competition%20tips%20and%20summaries&amp;summary=Kaggle%20competition%20tips%20and%20summaries&amp;source=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&title=Kaggle%20competition%20tips%20and%20summaries"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 
11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a 
target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on whatsapp" href="https://api.whatsapp.com/send?text=Kaggle%20competition%20tips%20and%20summaries%20-%20https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share Kaggle competition tips and summaries on telegram" href="https://telegram.me/share/url?text=Kaggle%20competition%20tips%20and%20summaries&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on ycombinator" href="https://news.ycombinator.com/submitlink?t=Kaggle%20competition%20tips%20and%20summaries&u=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
+<meta name=keywords content="data science,Kaggle,Kaggle competition,predictive modelling"><meta name=description content="Pointers to all my Kaggle advice posts and competition summaries."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/kaggle/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Kaggle competition tips and summaries"><meta property="og:description" content="Pointers to all my Kaggle advice posts and competition summaries."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/kaggle/"><meta property="og:image" 
content="https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png"><meta property="article:section" content="posts"><meta property="article:published_time" content="2014-04-05T23:46:10+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png"><meta name=twitter:title content="Kaggle competition tips and summaries"><meta name=twitter:description content="Pointers to all my Kaggle advice posts and competition summaries."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"Kaggle competition tips and summaries","item":"https://yanirseroussi.com/kaggle/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Kaggle competition tips and summaries","name":"Kaggle competition tips and summaries","description":"Pointers to all my Kaggle advice posts and competition summaries.","keywords":["data science","Kaggle","Kaggle competition","predictive modelling"],"articleBody":"Over the years, I’ve participated in a few Kaggle competitions and wrote a bit about my experiences. 
This page contains pointers to all my posts, and will be updated if/when I participate in more competitions.\nGeneral advice posts 10 Steps to Success in Kaggle Data Science Competitions (guest post on KDNuggets) How to (almost) win Kaggle competitions Kaggle beginner tips Solution posts Greek Media Monitoring Multilabel Classification [6th/120] – multi-label classification of pre-tokenised texts Personalised Web Search Challenge [9th/194] – reranking web search results in a personalised manner Blue Book for Bulldozers [9th/476] – forecasting auction sale price of bulldozers ICFHR 2012 – Arabic Writer Identification Competition [3rd/42] – classifying handwritten texts by the identity of the writer (Kaggle blog post) EMC Data Science Global Hackathon (Air Quality Prediction) [6th/110] – forecasting levels of air pollutants (Kaggle forum post) ","wordCount":"139","inLanguage":"en","image":"https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png","datePublished":"2014-04-05T23:46:10Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/kaggle/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" 
viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Kaggle competition tips and summaries</h1><div class=post-meta><span title='2014-04-05 23:46:10 +0000 UTC'>April 5, 2014</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2014-04-05-kaggle-competition-summaries/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_360x0_resize_box_3.png 360w ,https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_480x0_resize_box_3.png 480w ,https://yanirseroussi.com/kaggle/kaggle-logo-transparent_hud5a5728fe9c376b674017f410efda607_7282_720x0_resize_box_3.png 720w 
,https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png 1056w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/kaggle/kaggle-logo-transparent.png alt width=1056 height=480></figure><div class=post-content><p>Over the years, I&rsquo;ve participated in a few <a href=https://www.kaggle.com target=_blank rel=noopener>Kaggle</a> competitions and wrote a bit about my experiences. This page contains pointers to all my posts, and will be updated if/when I participate in more competitions.</p><h3 id=general-advice-posts>General advice posts<a hidden class=anchor aria-hidden=true href=#general-advice-posts>#</a></h3><ul><li><a href=http://www.kdnuggets.com/2015/03/10-steps-success-kaggle-data-science-competitions.html target=_blank rel=noopener>10 Steps to Success in Kaggle Data Science Competitions (guest post on KDNuggets)</a></li><li><a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/ target=_blank rel=noopener>How to (almost) win Kaggle competitions</a></li><li><a href=https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/ target=_blank rel=noopener>Kaggle beginner tips</a></li></ul><h3 id=solution-posts>Solution posts<a hidden class=anchor aria-hidden=true href=#solution-posts>#</a></h3><ul><li><a href=https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/>Greek Media Monitoring Multilabel Classification</a> [6th/120] – multi-label classification of pre-tokenised texts</li><li><a href=https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/ title="Is thinking like a search engine possible? 
(Yandex search personalisation – Kaggle competition summary – part 1)">Personalised Web Search Challenge</a> [9th/194] – reranking web search results in a personalised manner</li><li><a href=https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/ title="Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)">Blue Book for Bulldozers</a> [9th/476] – forecasting auction sale price of bulldozers</li><li><a href=http://blog.kaggle.com/2012/04/29/on-diffusion-kernels-histograms-and-arabic-writer-identification/ target=_blank rel=noopener>ICFHR 2012 – Arabic Writer Identification Competition</a> [3rd/42] – classifying handwritten texts by the identity of the writer (Kaggle blog post)</li><li><a href=https://www.kaggle.com/c/dsg-hackathon/forums/t/1821/general-approaches-to-partitioning-the-models/10631#post10631 target=_blank rel=noopener>EMC Data Science Global Hackathon (Air Quality Prediction)</a> [6th/110] – forecasting levels of air pollutants (Kaggle forum post)</li></ul></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/kaggle/>Kaggle</a></li><li><a href=https://yanirseroussi.com/tags/kaggle-competition/>Kaggle competition</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on x" href="https://x.com/intent/tweet/?text=Kaggle%20competition%20tips%20and%20summaries&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&amp;hashtags=datascience%2cKaggle%2cKagglecompetition%2cpredictivemodelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 
28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&amp;title=Kaggle%20competition%20tips%20and%20summaries&amp;summary=Kaggle%20competition%20tips%20and%20summaries&amp;source=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f&title=Kaggle%20competition%20tips%20and%20summaries"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 
11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a 
target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on whatsapp" href="https://api.whatsapp.com/send?text=Kaggle%20competition%20tips%20and%20summaries%20-%20https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" 
aria-label="share Kaggle competition tips and summaries on telegram" href="https://telegram.me/share/url?text=Kaggle%20competition%20tips%20and%20summaries&amp;url=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Kaggle competition tips and summaries on ycombinator" href="https://news.ycombinator.com/submitlink?t=Kaggle%20competition%20tips%20and%20summaries&u=https%3a%2f%2fyanirseroussi.com%2fkaggle%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer><section class=comment-section><p class="post-content contact-cta">Public comments are closed, but I love hearing from readers. Feel free to
 <a href=/about/#contact-me target=_blank>contact me</a> with your thoughts.</p></section></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
diff --git a/phd-work/index.html b/phd-work/index.html
index 70457aea4..6095b3206 100644
--- a/phd-work/index.html
+++ b/phd-work/index.html
@@ -1,5 +1,5 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>My PhD work | Yanir Seroussi | Data & AI for Nature</title>
-<meta name=keywords content="artificial intelligence,career,data science,machine learning,predictive modelling"><meta name=description content="An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/phd-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My PhD work"><meta property="og:description" content="An overview of my PhD in data science / artificial intelligence. 
Thesis title: Text Mining and Rating Prediction with Topical User Models."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/phd-work/"><meta property="og:image" content="https://yanirseroussi.com/thesis.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-03-30T03:23:33+00:00"><meta property="article:modified_time" content="2023-07-10T15:30:20+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/thesis.jpg"><meta name=twitter:title content="My PhD work"><meta name=twitter:description content="An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My PhD work","item":"https://yanirseroussi.com/phd-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My PhD work","name":"My PhD work","description":"An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models.","keywords":["artificial intelligence","career","data science","machine learning","predictive modelling"],"articleBody":"I did my PhD at Monash University under the supervision of Ingrid Zukerman and Fabian Bohnert. I started in March 2009 and submitted my thesis in August 2012. When excluding time spent on conference trips and three months of an internship with Google, it took about three years of work to complete the PhD, which is not too bad for a 100% research program (no coursework was required at the time).\nPeople often ask me how to become a data scientist. 
The PhD was my way of doing that, though it was entirely unplanned. In fact, I didn’t even want to do a PhD. My original plan was to come to Australia, do a master degree, and see if I like it here. Ingrid convinced me to do a PhD, because “the time difference to a master isn’t huge”. I don’t regret listening to her. I had the opportunity to work on interesting problems, travel, and generally have fun. The PhD has even made me more employable due to the boom in data-driven work, which wasn’t something I was aiming for. All I was hoping to achieve was being qualified to work on more interesting stuff than vanilla software engineering, which was my focus prior to the PhD.\nBroadly speaking, the topics of the PhD were in the areas of user modelling and natural language processing. I’m planning to eventually document the journey and the work done through a series of posts.1 The idea is to give a behind-the-scenes overview of the work that went into publishing the papers, as there are many lessons that may be useful to both PhD students and software engineers who wish to become data scientists. In addition, this website gets much more exposure than my papers ever did, so I hope that using this platform to explain the papers in a friendly language would enable a wider audience to build on my PhD work.\nThe title of my thesis is Text Mining and Rating Prediction with Topical User Models. The short, human-friendly abstract is:\nThis thesis develops novel statistical methods to infer implicit information from online user-generated texts. These methods analyse texts to identify and characterise users, detect their sentiments, and predict their preferences for items such as films. The inferred information may be harnessed for improved personalisation of online user experience.\nThe main publications that resulted from my PhD work are as follows. Links to posts about these publications will be added in the future. 
Please subscribe to get notified when this happens.\nYanir Seroussi, Ingrid Zukerman, and Fabian Bohnert, “Authorship Attribution with Topic Models”. In Computational Linguistics 40(2):269–310, 2014. PDF\nIn a sentence: Essentially a condensed version of my thesis Yanir Seroussi, “Text Mining and Rating Prediction with Topical User Models”. PhD thesis, Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia, 2012. PDF\nIn a sentence: The thesis, as described above, which was awarded the Mollie Holman medal for the best thesis in the faculty of IT in 2012 Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, “Authorship attribution with author-aware topic models”. In ACL 2012, pages 264–269, Jeju, Republic of Korea, 2012. PDF\nIn a sentence: An authorship attribution model that combines latent Dirichlet allocation and the author-topic model Yanir Seroussi, Russell Smyth and Ingrid Zukerman, “Ghosts from the High Court’s past: Evidence from computational linguistics for Dixon ghosting for McTiernan and Rich”. In University of New South Wales Law Journal, 34(3):984–1005, 2011. PDF | Dataset\nIn a sentence: A law journal paper that explores the extent to which Australian high court justice Owen Dixon ghost-wrote judgements for Edward McTiernan and George Rich Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, “Authorship attribution with latent Dirichlet allocation”. In CoNLL 2011, pages 181–189, Portland, OR, USA, 2011. PDF | Judgement dataset | IMDB62 dataset\nIn a sentence: Applying latent Dirichlet allocation to the authorship attribution problem Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, “Personalised rating prediction for new users using latent factor models”. In HT 2011, pages 47–56, Eindhoven, The Netherlands, 2011. 
PDF | Dataset\nIn a sentence: Extensions to the basic matrix factorisation approach to recommender systems to handle scenarios with new users who have little data associated with them Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, “Collaborative inference of sentiments from texts”. In UMAP 2010, pages 195–206, Waikoloa, HI, USA, 2010. PDF | Dataset | Blog post\nIn a sentence: An application of a model based on neighbour-based collaborative filtering to a variant of the sentiment analysis problem where the authors are known July 2023 update: Just noticed this plan while tidying up the website. The series of posts never got off the ground. As it’s been eight years, I think it’s safe to say it’s not going to happen. ↩︎\n","wordCount":"790","inLanguage":"en","image":"https://yanirseroussi.com/thesis.jpg","datePublished":"2015-03-30T03:23:33Z","dateModified":"2023-07-10T15:30:20+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/phd-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 
1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My PhD work</h1><div class=post-meta><span title='2015-03-30 03:23:33 +0000 UTC'>March 30, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-03-30-phd-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_1080x0_resize_q75_box.jpg 1080w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_1500x0_resize_q75_box.jpg 1500w 
,https://yanirseroussi.com/phd-work/thesis.jpg 2592w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/phd-work/thesis.jpg alt width=2592 height=1552></figure><div class=post-content><p>I did my PhD at <a href=http://www.monash.edu/ target=_blank rel=noopener>Monash University</a> under the supervision of <a href=http://users.monash.edu/~ingrid/ target=_blank rel=noopener>Ingrid Zukerman</a> and <a href=https://sites.google.com/a/bohnert.eu/fabian-bohnert/ target=_blank rel=noopener>Fabian Bohnert</a>. I started in March 2009 and submitted my thesis in August 2012. When excluding time spent on conference trips and three months of an internship with Google, it took about three years of work to complete the PhD, which is not too bad for a 100% research program (no coursework was required at the time).</p><p>People often ask me how to become <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/ title="What is data science?">a data scientist</a>. The PhD was my way of doing that, though it was entirely unplanned. In fact, I didn&rsquo;t even want to do a PhD. My original plan was to come to Australia, do a master degree, and see if I like it here. Ingrid convinced me to do a PhD, because &ldquo;the time difference to a master isn&rsquo;t huge&rdquo;. I don&rsquo;t regret listening to her. I had the opportunity to work on interesting problems, travel, and generally have fun. The PhD has even made me more employable due to the boom in data-driven work, which wasn&rsquo;t something I was aiming for. All I was hoping to achieve was being qualified to work on more interesting stuff than vanilla software engineering, which was my focus prior to the PhD.</p><p>Broadly speaking, the topics of the PhD were in the areas of user modelling and natural language processing. 
I&rsquo;m planning to eventually document the journey and the work done through a series of posts.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> The idea is to give a behind-the-scenes overview of the work that went into publishing the papers, as there are many lessons that may be useful to both PhD students and software engineers who wish to become data scientists. In addition, this website gets much more exposure than my papers ever did, so I hope that using this platform to explain the papers in a friendly language would enable a wider audience to build on my PhD work.</p><p>The title of my thesis is <em>Text Mining and Rating Prediction with Topical User Models</em>. The short, human-friendly abstract is:</p><blockquote><p>This thesis develops novel statistical methods to infer implicit information from online user-generated texts. These methods analyse texts to identify and characterise users, detect their sentiments, and predict their preferences for items such as films. The inferred information may be harnessed for improved personalisation of online user experience.</p></blockquote><p>The main publications that resulted from my PhD work are as follows. Links to posts about these publications will be added in the future. Please subscribe to get notified when this happens.</p><ul><li>Yanir Seroussi, Ingrid Zukerman, and Fabian Bohnert, &ldquo;Authorship Attribution with Topic Models&rdquo;. In <em>Computational Linguistics</em> 40(2):269–310, 2014. <a href=http://aclweb.org/anthology/J/J14/J14-2003.pdf target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> Essentially a condensed version of my thesis</li><li>Yanir Seroussi, &ldquo;Text Mining and Rating Prediction with Topical User Models&rdquo;. PhD thesis, Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia, 2012. 
<a href=https://figshare.com/articles/Text_mining_and_rating_prediction_with_topical_user_models/4664473 target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> The thesis, as described above, which was <a href=https://www.monash.edu/news/articles/top-of-the-class target=_blank rel=noopener>awarded the Mollie Holman medal for the best thesis in the faculty of IT in 2012</a></li><li>Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, &ldquo;Authorship attribution with author-aware topic models&rdquo;. In <em>ACL 2012</em>, pages 264–269, Jeju, Republic of Korea, 2012. <a href=http://aclweb.org/anthology/P/P12/P12-2052v2.pdf target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> An <a href=http://en.wikipedia.org/wiki/Stylometry target=_blank rel=noopener>authorship attribution</a> model that combines <a href=http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation target=_blank rel=noopener>latent Dirichlet allocation</a> and the <a href=http://www.datalab.uci.edu/author-topic/ target=_blank rel=noopener>author-topic model</a></li><li>Yanir Seroussi, Russell Smyth and Ingrid Zukerman, &ldquo;Ghosts from the High Court’s past: Evidence from computational linguistics for Dixon ghosting for McTiernan and Rich&rdquo;. In <em>University of New South Wales Law Journal</em>, 34(3):984–1005, 2011. 
<a href=http://www.csse.monash.edu.au/~ingrid/Publications/SeroussiSmythZukerman.pdf target=_blank rel=noopener>PDF</a> | <a href="https://umlt.infotech.monash.edu/?page_id=152" target=_blank rel=noopener>Dataset</a><br><strong>In a sentence:</strong> A law journal paper that explores the extent to which Australian high court justice <a href=https://en.wikipedia.org/wiki/Owen_Dixon target=_blank rel=noopener>Owen Dixon</a> ghost-wrote judgements for <a href=https://en.wikipedia.org/wiki/Edward_McTiernan target=_blank rel=noopener>Edward McTiernan</a> and <a href=https://en.wikipedia.org/wiki/George_Rich target=_blank rel=noopener>George Rich</a></li><li>Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, &ldquo;Authorship attribution with latent Dirichlet allocation&rdquo;. In <em>CoNLL 2011</em>, pages 181–189, Portland, OR, USA, 2011. <a href=http://aclweb.org/anthology/W/W11/W11-0321.pdf target=_blank rel=noopener>PDF</a> | <a href=http://www.csse.monash.edu.au/research/umnl/data/umami/ target=_blank rel=noopener>Judgement dataset</a> | <a href=https://www.dropbox.com/s/np1u1hl343gd73m/imdb62.zip target=_blank rel=noopener>IMDB62 dataset</a><br><strong>In a sentence:</strong> Applying <a href=http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation target=_blank rel=noopener>latent Dirichlet allocation</a> to the <a href=http://en.wikipedia.org/wiki/Stylometry target=_blank rel=noopener>authorship attribution</a> problem</li><li>Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, &ldquo;Personalised rating prediction for new users using latent factor models&rdquo;. In <em>HT 2011</em>, pages 47–56, Eindhoven, The Netherlands, 2011. 
<a href=https://www.dropbox.com/s/og42a9f97dcuuyt/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>PDF</a> | <a href=https://www.dropbox.com/s/zmev1b6c5ug5l0u/imdb1m.zip target=_blank rel=noopener>Dataset</a><br><strong>In a sentence:</strong> Extensions to the basic matrix factorisation approach to <a href=https://en.wikipedia.org/wiki/Recommender_system target=_blank rel=noopener>recommender systems</a> to handle scenarios with new users who have little data associated with them</li><li>Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, &ldquo;Collaborative inference of sentiments from texts&rdquo;. In <em>UMAP 2010</em>, pages 195–206, Waikoloa, HI, USA, 2010. <a href=https://www.dropbox.com/s/sz9uw1s5151vs5d/SeroussiZukermanBohnert2010b.pdf target=_blank rel=noopener>PDF</a> | <a href=https://www.dropbox.com/s/np1u1hl343gd73m/imdb62.zip target=_blank rel=noopener>Dataset</a> | <a href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/ title="First steps in data science: author-aware sentiment analysis">Blog post</a><br><strong>In a sentence:</strong> An application of a model based on <a href=https://en.wikipedia.org/wiki/Collaborative_filtering#Memory-based target=_blank rel=noopener>neighbour-based collaborative filtering</a> to a variant of the <a href=https://en.wikipedia.org/wiki/Sentiment_analysis target=_blank rel=noopener>sentiment analysis</a> problem where the authors are known</li></ul><div class=footnotes role=doc-endnotes><hr><ol><li id=fn:1><p><em>July 2023 update:</em> Just noticed this plan while tidying up the website. The series of posts never got off the ground. 
As it&rsquo;s been eight years, I think it&rsquo;s safe to say it&rsquo;s not going to happen.&#160;<a href=#fnref:1 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li></ol></div></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/artificial-intelligence/>artificial intelligence</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on x" href="https://x.com/intent/tweet/?text=My%20PhD%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&amp;hashtags=artificialintelligence%2ccareer%2cdatascience%2cmachinelearning%2cpredictivemodelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&amp;title=My%20PhD%20work&amp;summary=My%20PhD%20work&amp;source=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 
62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&title=My%20PhD%20work"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 
3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on whatsapp" href="https://api.whatsapp.com/send?text=My%20PhD%20work%20-%20https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 
105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on telegram" href="https://telegram.me/share/url?text=My%20PhD%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20PhD%20work&u=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg width="30" height="30" viewBox="0 0 512 
512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
+<meta name=keywords content="artificial intelligence,career,data science,machine learning,predictive modelling"><meta name=description content="An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models."><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/phd-work/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="My PhD work"><meta property="og:description" content="An overview of my PhD in data science / artificial intelligence. 
Thesis title: Text Mining and Rating Prediction with Topical User Models."><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/phd-work/"><meta property="og:image" content="https://yanirseroussi.com/phd-work/thesis.jpg"><meta property="article:section" content="posts"><meta property="article:published_time" content="2015-03-30T03:23:33+00:00"><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/phd-work/thesis.jpg"><meta name=twitter:title content="My PhD work"><meta name=twitter:description content="An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models."><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Posts","item":"https://yanirseroussi.com/posts/"},{"@type":"ListItem","position":2,"name":"My PhD work","item":"https://yanirseroussi.com/phd-work/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"My PhD work","name":"My PhD work","description":"An overview of my PhD in data science / artificial intelligence. Thesis title: Text Mining and Rating Prediction with Topical User Models.","keywords":["artificial intelligence","career","data science","machine learning","predictive modelling"],"articleBody":"I did my PhD at Monash University under the supervision of Ingrid Zukerman and Fabian Bohnert. I started in March 2009 and submitted my thesis in August 2012. 
When excluding time spent on conference trips and three months of an internship with Google, it took about three years of work to complete the PhD, which is not too bad for a 100% research program (no coursework was required at the time).\nPeople often ask me how to become a data scientist. The PhD was my way of doing that, though it was entirely unplanned. In fact, I didn’t even want to do a PhD. My original plan was to come to Australia, do a master degree, and see if I like it here. Ingrid convinced me to do a PhD, because “the time difference to a master isn’t huge”. I don’t regret listening to her. I had the opportunity to work on interesting problems, travel, and generally have fun. The PhD has even made me more employable due to the boom in data-driven work, which wasn’t something I was aiming for. All I was hoping to achieve was being qualified to work on more interesting stuff than vanilla software engineering, which was my focus prior to the PhD.\nBroadly speaking, the topics of the PhD were in the areas of user modelling and natural language processing. I’m planning to eventually document the journey and the work done through a series of posts.1 The idea is to give a behind-the-scenes overview of the work that went into publishing the papers, as there are many lessons that may be useful to both PhD students and software engineers who wish to become data scientists. In addition, this website gets much more exposure than my papers ever did, so I hope that using this platform to explain the papers in a friendly language would enable a wider audience to build on my PhD work.\nThe title of my thesis is Text Mining and Rating Prediction with Topical User Models. The short, human-friendly abstract is:\nThis thesis develops novel statistical methods to infer implicit information from online user-generated texts. These methods analyse texts to identify and characterise users, detect their sentiments, and predict their preferences for items such as films. 
The inferred information may be harnessed for improved personalisation of online user experience.\nThe main publications that resulted from my PhD work are as follows. Links to posts about these publications will be added in the future. Please subscribe to get notified when this happens.\nYanir Seroussi, Ingrid Zukerman, and Fabian Bohnert, “Authorship Attribution with Topic Models”. In Computational Linguistics 40(2):269–310, 2014. PDF\nIn a sentence: Essentially a condensed version of my thesis Yanir Seroussi, “Text Mining and Rating Prediction with Topical User Models”. PhD thesis, Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia, 2012. PDF\nIn a sentence: The thesis, as described above, which was awarded the Mollie Holman medal for the best thesis in the faculty of IT in 2012 Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, “Authorship attribution with author-aware topic models”. In ACL 2012, pages 264–269, Jeju, Republic of Korea, 2012. PDF\nIn a sentence: An authorship attribution model that combines latent Dirichlet allocation and the author-topic model Yanir Seroussi, Russell Smyth and Ingrid Zukerman, “Ghosts from the High Court’s past: Evidence from computational linguistics for Dixon ghosting for McTiernan and Rich”. In University of New South Wales Law Journal, 34(3):984–1005, 2011. PDF | Dataset\nIn a sentence: A law journal paper that explores the extent to which Australian high court justice Owen Dixon ghost-wrote judgements for Edward McTiernan and George Rich Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, “Authorship attribution with latent Dirichlet allocation”. In CoNLL 2011, pages 181–189, Portland, OR, USA, 2011. PDF | Judgement dataset | IMDB62 dataset\nIn a sentence: Applying latent Dirichlet allocation to the authorship attribution problem Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, “Personalised rating prediction for new users using latent factor models”. 
In HT 2011, pages 47–56, Eindhoven, The Netherlands, 2011. PDF | Dataset\nIn a sentence: Extensions to the basic matrix factorisation approach to recommender systems to handle scenarios with new users who have little data associated with them Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, “Collaborative inference of sentiments from texts”. In UMAP 2010, pages 195–206, Waikoloa, HI, USA, 2010. PDF | Dataset | Blog post\nIn a sentence: An application of a model based on neighbour-based collaborative filtering to a variant of the sentiment analysis problem where the authors are known July 2023 update: Just noticed this plan while tidying up the website. The series of posts never got off the ground. As it’s been eight years, I think it’s safe to say it’s not going to happen. ↩︎\n","wordCount":"790","inLanguage":"en","image":"https://yanirseroussi.com/phd-work/thesis.jpg","datePublished":"2015-03-30T03:23:33Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/phd-work/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" 
stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">My PhD work</h1><div class=post-meta><span title='2015-03-30 03:23:33 +0000 UTC'>March 30, 2015</span>&nbsp;|&nbsp;<a href=https://github.com/yanirs/yanirseroussi.com/blob/master/content/posts/2015-03-30-phd-work/index.md rel="noopener noreferrer" target=_blank>Suggest changes</a></div></header><figure class=entry-cover><img loading=eager srcset="https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_360x0_resize_q75_box.jpg 360w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_480x0_resize_q75_box.jpg 480w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_720x0_resize_q75_box.jpg 720w ,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_1080x0_resize_q75_box.jpg 1080w 
,https://yanirseroussi.com/phd-work/thesis_hufffef9059c063cbf1893ec2887faca16_981984_1500x0_resize_q75_box.jpg 1500w ,https://yanirseroussi.com/phd-work/thesis.jpg 2592w" sizes="(min-width: 768px) 720px, 100vw" src=https://yanirseroussi.com/phd-work/thesis.jpg alt width=2592 height=1552></figure><div class=post-content><p>I did my PhD at <a href=http://www.monash.edu/ target=_blank rel=noopener>Monash University</a> under the supervision of <a href=http://users.monash.edu/~ingrid/ target=_blank rel=noopener>Ingrid Zukerman</a> and <a href=https://sites.google.com/a/bohnert.eu/fabian-bohnert/ target=_blank rel=noopener>Fabian Bohnert</a>. I started in March 2009 and submitted my thesis in August 2012. When excluding time spent on conference trips and three months of an internship with Google, it took about three years of work to complete the PhD, which is not too bad for a 100% research program (no coursework was required at the time).</p><p>People often ask me how to become <a href=https://yanirseroussi.com/2014/10/23/what-is-data-science/ title="What is data science?">a data scientist</a>. The PhD was my way of doing that, though it was entirely unplanned. In fact, I didn&rsquo;t even want to do a PhD. My original plan was to come to Australia, do a master degree, and see if I like it here. Ingrid convinced me to do a PhD, because &ldquo;the time difference to a master isn&rsquo;t huge&rdquo;. I don&rsquo;t regret listening to her. I had the opportunity to work on interesting problems, travel, and generally have fun. The PhD has even made me more employable due to the boom in data-driven work, which wasn&rsquo;t something I was aiming for. All I was hoping to achieve was being qualified to work on more interesting stuff than vanilla software engineering, which was my focus prior to the PhD.</p><p>Broadly speaking, the topics of the PhD were in the areas of user modelling and natural language processing. 
I&rsquo;m planning to eventually document the journey and the work done through a series of posts.<sup id=fnref:1><a href=#fn:1 class=footnote-ref role=doc-noteref>1</a></sup> The idea is to give a behind-the-scenes overview of the work that went into publishing the papers, as there are many lessons that may be useful to both PhD students and software engineers who wish to become data scientists. In addition, this website gets much more exposure than my papers ever did, so I hope that using this platform to explain the papers in a friendly language would enable a wider audience to build on my PhD work.</p><p>The title of my thesis is <em>Text Mining and Rating Prediction with Topical User Models</em>. The short, human-friendly abstract is:</p><blockquote><p>This thesis develops novel statistical methods to infer implicit information from online user-generated texts. These methods analyse texts to identify and characterise users, detect their sentiments, and predict their preferences for items such as films. The inferred information may be harnessed for improved personalisation of online user experience.</p></blockquote><p>The main publications that resulted from my PhD work are as follows. Links to posts about these publications will be added in the future. Please subscribe to get notified when this happens.</p><ul><li>Yanir Seroussi, Ingrid Zukerman, and Fabian Bohnert, &ldquo;Authorship Attribution with Topic Models&rdquo;. In <em>Computational Linguistics</em> 40(2):269–310, 2014. <a href=http://aclweb.org/anthology/J/J14/J14-2003.pdf target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> Essentially a condensed version of my thesis</li><li>Yanir Seroussi, &ldquo;Text Mining and Rating Prediction with Topical User Models&rdquo;. PhD thesis, Faculty of Information Technology, Monash University, Clayton, VIC 3800, Australia, 2012. 
<a href=https://figshare.com/articles/Text_mining_and_rating_prediction_with_topical_user_models/4664473 target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> The thesis, as described above, which was <a href=https://www.monash.edu/news/articles/top-of-the-class target=_blank rel=noopener>awarded the Mollie Holman medal for the best thesis in the faculty of IT in 2012</a></li><li>Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, &ldquo;Authorship attribution with author-aware topic models&rdquo;. In <em>ACL 2012</em>, pages 264–269, Jeju, Republic of Korea, 2012. <a href=http://aclweb.org/anthology/P/P12/P12-2052v2.pdf target=_blank rel=noopener>PDF</a><br><strong>In a sentence:</strong> An <a href=http://en.wikipedia.org/wiki/Stylometry target=_blank rel=noopener>authorship attribution</a> model that combines <a href=http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation target=_blank rel=noopener>latent Dirichlet allocation</a> and the <a href=http://www.datalab.uci.edu/author-topic/ target=_blank rel=noopener>author-topic model</a></li><li>Yanir Seroussi, Russell Smyth and Ingrid Zukerman, &ldquo;Ghosts from the High Court’s past: Evidence from computational linguistics for Dixon ghosting for McTiernan and Rich&rdquo;. In <em>University of New South Wales Law Journal</em>, 34(3):984–1005, 2011. 
<a href=http://www.csse.monash.edu.au/~ingrid/Publications/SeroussiSmythZukerman.pdf target=_blank rel=noopener>PDF</a> | <a href="https://umlt.infotech.monash.edu/?page_id=152" target=_blank rel=noopener>Dataset</a><br><strong>In a sentence:</strong> A law journal paper that explores the extent to which Australian high court justice <a href=https://en.wikipedia.org/wiki/Owen_Dixon target=_blank rel=noopener>Owen Dixon</a> ghost-wrote judgements for <a href=https://en.wikipedia.org/wiki/Edward_McTiernan target=_blank rel=noopener>Edward McTiernan</a> and <a href=https://en.wikipedia.org/wiki/George_Rich target=_blank rel=noopener>George Rich</a></li><li>Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, &ldquo;Authorship attribution with latent Dirichlet allocation&rdquo;. In <em>CoNLL 2011</em>, pages 181–189, Portland, OR, USA, 2011. <a href=http://aclweb.org/anthology/W/W11/W11-0321.pdf target=_blank rel=noopener>PDF</a> | <a href=http://www.csse.monash.edu.au/research/umnl/data/umami/ target=_blank rel=noopener>Judgement dataset</a> | <a href=https://www.dropbox.com/s/np1u1hl343gd73m/imdb62.zip target=_blank rel=noopener>IMDB62 dataset</a><br><strong>In a sentence:</strong> Applying <a href=http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation target=_blank rel=noopener>latent Dirichlet allocation</a> to the <a href=http://en.wikipedia.org/wiki/Stylometry target=_blank rel=noopener>authorship attribution</a> problem</li><li>Yanir Seroussi, Fabian Bohnert and Ingrid Zukerman, &ldquo;Personalised rating prediction for new users using latent factor models&rdquo;. In <em>HT 2011</em>, pages 47–56, Eindhoven, The Netherlands, 2011. 
<a href=https://www.dropbox.com/s/og42a9f97dcuuyt/SeroussiBohnertZukerman2011.pdf target=_blank rel=noopener>PDF</a> | <a href=https://www.dropbox.com/s/zmev1b6c5ug5l0u/imdb1m.zip target=_blank rel=noopener>Dataset</a><br><strong>In a sentence:</strong> Extensions to the basic matrix factorisation approach to <a href=https://en.wikipedia.org/wiki/Recommender_system target=_blank rel=noopener>recommender systems</a> to handle scenarios with new users who have little data associated with them</li><li>Yanir Seroussi, Ingrid Zukerman and Fabian Bohnert, &ldquo;Collaborative inference of sentiments from texts&rdquo;. In <em>UMAP 2010</em>, pages 195–206, Waikoloa, HI, USA, 2010. <a href=https://www.dropbox.com/s/sz9uw1s5151vs5d/SeroussiZukermanBohnert2010b.pdf target=_blank rel=noopener>PDF</a> | <a href=https://www.dropbox.com/s/np1u1hl343gd73m/imdb62.zip target=_blank rel=noopener>Dataset</a> | <a href=https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/ title="First steps in data science: author-aware sentiment analysis">Blog post</a><br><strong>In a sentence:</strong> An application of a model based on <a href=https://en.wikipedia.org/wiki/Collaborative_filtering#Memory-based target=_blank rel=noopener>neighbour-based collaborative filtering</a> to a variant of the <a href=https://en.wikipedia.org/wiki/Sentiment_analysis target=_blank rel=noopener>sentiment analysis</a> problem where the authors are known</li></ul><div class=footnotes role=doc-endnotes><hr><ol><li id=fn:1><p><em>July 2023 update:</em> Just noticed this plan while tidying up the website. The series of posts never got off the ground. 
As it&rsquo;s been eight years, I think it&rsquo;s safe to say it&rsquo;s not going to happen.&#160;<a href=#fnref:1 class=footnote-backref role=doc-backlink>&#8617;&#xfe0e;</a></p></li></ol></div></div><footer class=post-footer><ul class=post-tags><li><a href=https://yanirseroussi.com/tags/artificial-intelligence/>artificial intelligence</a></li><li><a href=https://yanirseroussi.com/tags/career/>career</a></li><li><a href=https://yanirseroussi.com/tags/data-science/>data science</a></li><li><a href=https://yanirseroussi.com/tags/machine-learning/>machine learning</a></li><li><a href=https://yanirseroussi.com/tags/predictive-modelling/>predictive modelling</a></li></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on x" href="https://x.com/intent/tweet/?text=My%20PhD%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&amp;hashtags=artificialintelligence%2ccareer%2cdatascience%2cmachinelearning%2cpredictivemodelling"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&amp;title=My%20PhD%20work&amp;summary=My%20PhD%20work&amp;source=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 
62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f&title=My%20PhD%20work"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 
3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on whatsapp" href="https://api.whatsapp.com/send?text=My%20PhD%20work%20-%20https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 
105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on telegram" href="https://telegram.me/share/url?text=My%20PhD%20work&amp;url=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share My PhD work on ycombinator" href="https://news.ycombinator.com/submitlink?t=My%20PhD%20work&u=https%3a%2f%2fyanirseroussi.com%2fphd-work%2f"><svg width="30" height="30" viewBox="0 0 512 
512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
         <a href=https://github.com/adityatelange/hugo-PaperMod/ rel=noopener target=_blank>PaperMod</a></span></footer><div class=mailing-list-container><form class=mailing-list action="https://yanirseroussi.us17.list-manage.com/subscribe/post?u=3c08aa3ff27dd92978019febd&amp;id=bc3ab705af" method=post target=_blank novalidate><label for=mailing-list-email>Get new post notifications</label>
diff --git a/sitemap.xml b/sitemap.xml
index 73eb25321..935d7b0c8 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="utf-8" standalone="yes"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"><url><loc>https://yanirseroussi.com/tags/business/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/career/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/environment/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/marketing/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/personal/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2024/01/09/psychographic-specialisations-may-work-for-discipline-generalists/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2024/01/08/the-power-of-parasocial-relationships/</loc><lastmod>2024-01-08T16:31:22+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-business/</loc><lastmod>2023-12-18T10:38:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-science/</loc><lastmod>2023-12-18T10:38:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/12/18/positioning-is-a-common-problem-for-data-scientists/</loc><lastmod>2023-12-18T10:38:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/energy-markets/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/machine-learning/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/quotes/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/12/14/transfer-learning-applies-to-energy-marke
t-bidding/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-engineering/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-visualisation/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/marine-science/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/posts/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/reef-life-survey/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/software-engineering/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/web-development/</loc><lastmod>2023-11-29T12:57:12+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/11/28/our-blue-machine-is-changing-but-we-are-not-helpless/</loc><lastmod>2023-11-28T16:56:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/11/21/you-dont-need-a-proprietary-api-for-static-maps/</loc><lastmod>2023-11-21T16:12:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/</loc><lastmod>2023-10-25T15:00:21+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/artificial-intelligence/</loc><lastmod>2023-10-06T15:11:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/10/06/artificial-intelligence-was-a-marketing-term-all-along-just-call-it-automation/</loc><lastmod>2023-10-06T15:11:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/ethics/</loc><lastmod>2023-10-06T15:11:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/09/25/the-lines-between-solo-consulting-and-pr
oduct-building-are-blurry/</loc><lastmod>2023-09-25T11:15:26+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/09/21/googles-rules-of-machine-learning-still-apply-in-the-age-of-large-language-models/</loc><lastmod>2023-09-22T07:54:13+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/blogging/</loc><lastmod>2023-09-23T08:52:24+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/</loc><lastmod>2023-09-23T08:52:24+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/21/the-minimalist-entrepreneur-is-too-prescriptive-for-me/</loc><lastmod>2023-08-21T13:34:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/productivity/</loc><lastmod>2023-08-17T18:10:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/17/revisiting-start-small-stay-small-in-2023-chapter-2/</loc><lastmod>2023-08-17T18:10:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/16/revisiting-start-small-stay-small-in-2023-chapter-1/</loc><lastmod>2023-08-17T18:10:36+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/14/email-notifications-on-public-github-commits/</loc><lastmod>2023-08-14T15:44:21+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/11/the-rule-of-thirds-can-probably-be-ignored/</loc><lastmod>2023-08-11T14:35:20+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/devops/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/github/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/security/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/07/23/using-yubikey-for-ssh-access/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/hugo/</loc><lastmod>2023-07-17T17:18:06+10:00</lastmod></url><url><loc>https://yanirserou
ssi.com/til/2023/07/17/making-a-til-section-with-hugo-and-papermod/</loc><lastmod>2023-07-17T17:18:06+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/07/11/you-cant-save-time/</loc><lastmod>2023-07-11T14:28:10+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/</loc><lastmod>2023-06-30T16:33:40+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/hackers/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/</loc><lastmod>2023-05-26T13:08:24+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/futurism/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/machine-intelligence/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/</loc><lastmod>2023-04-21T16:32:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/</loc><lastmod>2022-12-11T10:07:24+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/causal-inference/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/</loc><lastmod>2022-09-12T12:56:22+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/automattic/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/climate-change/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/orkestra/</loc><lastmod>2022-06-06T10:07:53+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/politics/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/remote-work/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.co
m/tags/sustainability/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/</loc><lastmod>2022-06-06T10:07:53+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/</loc><lastmod>2023-07-10T16:35:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/deep-learning/</loc><lastmod>2023-07-10T16:35:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/fast.ai/</loc><lastmod>2023-07-10T16:35:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/</loc><lastmod>2022-01-17T09:00:05+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/statistics/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/</loc><lastmod>2021-11-22T13:52:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/cloudflare/</loc><lastmod>2022-07-31T16:16:05+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/</loc><lastmod>2022-07-31T16:16:05+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/wordpress/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/10/07/my-work-with-automattic/</loc><lastmod>2023-07-05T16:02:07+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bootstrapping/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/confidence-intervals/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2020/08/24/many-is-not-enough-count
ing-simulations-to-bootstrap-the-right-way/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/analytics/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/</loc><lastmod>2023-07-05T11:39:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/</loc><lastmod>2023-07-10T16:35:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/frequently-asked-questions/</loc><lastmod>2023-07-10T16:35:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bandcamp/</loc><lastmod>2023-07-07T17:36:55+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bcrecommender/</loc><lastmod>2023-07-07T17:36:55+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/</loc><lastmod>2023-07-07T17:36:55+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/elasticsearch/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>htt
ps://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/javascript/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/predictive-modelling/</loc><lastmod>2023-07-10T15:30:20+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/science-communication/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/search-engine-optimisation/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/insights/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/economics/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/03/20/the-rise
-of-greedy-robots/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/scuba-diving/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/facebook/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/linkedin/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/mongodb/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/health/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/nutrition/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/nutritionism/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/recommender-systems/</loc><lastmod>2023-07-06T09:28:02+10:00</la
stmod></url><url><loc>https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/07/31/goodbye-parse-com/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/parse.com/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/deep-learning-resources/</loc><lastmod>2021-11-09T15:38:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/sentiment-analysis/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/divestment/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/fossil-fuels/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/phd-work/</loc><lastmod>2023-07-10T15:30:20+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/gradient-boosting/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle-competition/</
loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/phantomjs/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/scikit-learn/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/traction-book/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/price-forecasting/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/music/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/10/23/what-is-data-science/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-appro
ach/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/multi-label-classification/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/music-industry/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle-beginners/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/kaggle/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/a/b-testing/</loc></url><url><loc>https://yanirseroussi.com/about/</loc><lastmod>2023-12-18T11:18:46+10:00</lastmod></url><url><loc>https://yanirseroussi.com/causal-inference-resources/</loc><lastmod>2023-07-06T16:01:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/consult/</loc><lastmod>2023-12-12T15:11:17+10:00</lastmod></url><url><loc>https://yanirseroussi.com/talks/</loc><lastmod>2023-12-12T14:31:21+10:0
0</lastmod></url><url><loc>https://yanirseroussi.com/til/</loc><lastmod>2023-12-18T11:19:48+10:00</lastmod></url></urlset>
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8" standalone="yes"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"><url><loc>https://yanirseroussi.com/tags/business/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/career/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/environment/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/marketing/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/personal/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2024/01/09/psychographic-specialisations-may-work-for-discipline-generalists/</loc><lastmod>2024-01-09T13:23:28+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2024/01/08/the-power-of-parasocial-relationships/</loc><lastmod>2024-01-08T16:31:22+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-business/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-science/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/12/18/positioning-is-a-common-problem-for-data-scientists/</loc><lastmod>2023-12-18T10:38:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/energy-markets/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/machine-learning/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/quotes/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/12/14/transfer-learning-applies-to-energy-marke
t-bidding/</loc><lastmod>2023-12-14T10:46:41+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-engineering/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/data-visualisation/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/marine-science/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/posts/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/reef-life-survey/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/software-engineering/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/11/29/supporting-volunteer-monitoring-of-marine-biodiversity-with-modern-web-and-data-tools/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/web-development/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/11/28/our-blue-machine-is-changing-but-we-are-not-helpless/</loc><lastmod>2023-11-28T16:56:18+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/11/21/you-dont-need-a-proprietary-api-for-static-maps/</loc><lastmod>2023-11-21T16:12:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/artificial-intelligence/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/10/06/artificial-intelligence-was-a-marketing-term-all-along-just-call-it-automation/</loc><lastmod>2023-10-06T15:11:27+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/ethics/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/09/25/the-lines-between-solo-consulting-and-pr
oduct-building-are-blurry/</loc><lastmod>2023-09-25T11:15:26+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/09/21/googles-rules-of-machine-learning-still-apply-in-the-age-of-large-language-models/</loc><lastmod>2023-09-22T07:54:13+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/blogging/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/21/the-minimalist-entrepreneur-is-too-prescriptive-for-me/</loc><lastmod>2023-08-21T13:34:56+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/productivity/</loc><lastmod>2023-08-17T18:10:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/17/revisiting-start-small-stay-small-in-2023-chapter-2/</loc><lastmod>2023-08-17T18:10:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/16/revisiting-start-small-stay-small-in-2023-chapter-1/</loc><lastmod>2023-08-17T18:10:36+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/14/email-notifications-on-public-github-commits/</loc><lastmod>2023-08-14T15:44:21+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/08/11/the-rule-of-thirds-can-probably-be-ignored/</loc><lastmod>2023-08-11T14:35:20+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/devops/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/github/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/security/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/07/23/using-yubikey-for-ssh-access/</loc><lastmod>2023-07-25T09:30:43+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/hugo/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirserou
ssi.com/til/2023/07/17/making-a-til-section-with-hugo-and-papermod/</loc><lastmod>2023-07-17T17:18:06+10:00</lastmod></url><url><loc>https://yanirseroussi.com/til/2023/07/11/you-cant-save-time/</loc><lastmod>2023-07-11T14:28:10+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/hackers/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/05/26/how-hackable-are-automated-coding-assessments/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/futurism/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/machine-intelligence/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2023/04/21/remaining-relevant-as-a-small-language-model/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/12/11/chatgpt-is-transformative-ai/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/causal-inference/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/09/12/causal-machine-learning-book-draft-review/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/automattic/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/climate-change/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/orkestra/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/politics/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/remote-work/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.co
m/tags/sustainability/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/deep-learning/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/fast.ai/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2022/01/14/analysis-strategies-in-online-a-b-experiments/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/statistics/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/cloudflare/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/11/10/migrating-from-wordpress-com-to-hugo-on-github-cloudflare/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/wordpress/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/10/07/my-work-with-automattic/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2021/04/05/some-highlights-from-2020/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bootstrapping/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/confidence-intervals/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2020/08/24/many-is-not-enough-count
ing-simulations-to-bootstrap-the-right-way/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2020/01/11/software-commodities-are-eating-interesting-data-science-work/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/12/12/a-day-in-the-life-of-a-remote-data-scientist/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/analytics/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/10/06/bootstrapping-the-right-way/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2019/01/08/hackers-beware-bootstrap-sampling-may-be-harmful/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/12/24/the-most-practical-causal-inference-book-ive-read-is-still-a-draft/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/11/03/reflections-on-remote-data-science-work/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2018/07/22/defining-data-science-in-2018/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/frequently-asked-questions/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bandcamp/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/bcrecommender/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/09/02/state-of-bandcamp-recommender/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/elasticsearch/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>htt
ps://yanirseroussi.com/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/06/03/exploring-and-visualising-reef-life-survey-data/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/javascript/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/predictive-modelling/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/science-communication/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/search-engine-optimisation/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/insights/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/08/04/is-data-scientist-a-useless-job-title/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/06/19/making-bayesian-ab-testing-more-accessible/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/05/15/diving-deeper-into-causality-pearl-kleinberg-hill-and-untested-assumptions/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/economics/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/03/20/the-rise
-of-greedy-robots/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/scuba-diving/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2016/01/24/the-joys-of-offline-data-collection/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/facebook/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/linkedin/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/12/08/this-holiday-season-give-me-real-insights/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/mongodb/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/health/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/nutrition/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/nutritionism/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/recommender-systems/</loc><lastmod>2024-01-16T09:56:03+10:00</la
stmod></url><url><loc>https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/08/24/you-dont-need-a-data-scientist-yet/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/07/31/goodbye-parse-com/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/parse.com/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/07/06/learning-about-deep-learning-through-album-cover-classification/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/deep-learning-resources/</loc><lastmod>2021-11-09T15:38:25+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/06/06/hopping-on-the-deep-learning-bandwagon/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/sentiment-analysis/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/divestment/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/fossil-fuels/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/04/24/my-divestment-from-fossil-fuels/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/phd-work/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/03/22/the-long-road-to-a-lifestyle-business/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/gradient-boosting/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle-competition/</
loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2015/01/15/automating-parse-com-bulk-data-imports/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/phantomjs/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/scikit-learn/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/12/15/seo-mostly-about-showing-up/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/traction-book/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/price-forecasting/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/11/05/bcrecommender-traction-update/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/music/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/10/23/what-is-data-science/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-appro
ach/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/multi-label-classification/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/music-industry/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/kaggle-beginners/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/08/17/datas-hierarchy-of-needs/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/kaggle/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/2014/01/19/kaggle-beginner-tips/</loc><lastmod>2023-07-06T09:28:02+10:00</lastmod></url><url><loc>https://yanirseroussi.com/tags/a/b-testing/</loc></url><url><loc>https://yanirseroussi.com/about/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/causal-inference-resources/</loc><lastmod>2023-07-06T16:01:57+10:00</lastmod></url><url><loc>https://yanirseroussi.com/consult/</loc><lastmod>2024-01-16T09:56:03+10:00</lastmod></url><url><loc>https://yanirseroussi.com/talks/</loc><lastmod>2024-01-16T09:56:03+10:0
0</lastmod></url><url><loc>https://yanirseroussi.com/til/</loc><lastmod>2023-12-18T11:19:48+10:00</lastmod></url></urlset>
\ No newline at end of file
diff --git a/talks/index.html b/talks/index.html
index 235436cbd..dc3b288e1 100644
--- a/talks/index.html
+++ b/talks/index.html
@@ -1,8 +1,8 @@
 <!doctype html><html lang=en dir=auto><head><meta charset=utf-8><meta http-equiv=X-UA-Compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=robots content="index, follow"><title>Talks | Yanir Seroussi | Data & AI for Nature</title>
 <meta name=keywords content><meta name=description content="Just a list of some talks I&rsquo;ve given, saved here for future reference and for general public benefit.
 Lessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!"><meta name=author content="Yanir Seroussi"><link rel=canonical href=https://yanirseroussi.com/talks/><meta name=google-site-verification content="aWlue7NGcj4dQpjOKJF7YKiAvw3JuHnq6aFqX6VwWAU"><link crossorigin=anonymous href=/assets/css/stylesheet.4b8cc203f6a37bd20ba1ef634068a73cdc702722ce99fa2fde7f35869dbb5563.css integrity="sha256-S4zCA/aje9ILoe9jQGinPNxwJyLOmfov3n81hp27VWM=" rel="preload stylesheet" as=style><link rel=icon href=https://yanirseroussi.com/favicon.ico><link rel=icon type=image/png sizes=16x16 href=https://yanirseroussi.com/favicon-16x16.png><link rel=icon type=image/png sizes=32x32 href=https://yanirseroussi.com/favicon-32x32.png><link rel=apple-touch-icon href=https://yanirseroussi.com/apple-touch-icon.png><link rel=mask-icon href=https://yanirseroussi.com/safari-pinned-tab.svg><meta name=theme-color content="#2e2e33"><meta name=msapplication-TileColor content="#2e2e33"><noscript><style>#theme-toggle,.top-link{display:none}</style><style>@media(prefers-color-scheme:dark){:root{--theme:rgb(29, 30, 32);--entry:rgb(46, 46, 51);--primary:rgb(218, 218, 219);--secondary:rgb(155, 156, 157);--tertiary:rgb(65, 66, 68);--content:rgb(196, 196, 197);--code-block-bg:rgb(46, 46, 51);--code-bg:rgb(55, 56, 62);--border:rgb(51, 51, 51)}.list{background:var(--theme)}.list:not(.dark)::-webkit-scrollbar-track{background:0 0}.list:not(.dark)::-webkit-scrollbar-thumb{border-color:var(--theme)}}</style></noscript><meta property="og:title" content="Talks"><meta property="og:description" content="Just a list of some talks I&rsquo;ve given, saved here for future 
reference and for general public benefit.
-Lessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!"><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/talks/"><meta property="og:image" content="https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp"><meta property="article:section" content><meta property="article:modified_time" content="2023-12-12T14:31:21+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp"><meta name=twitter:title content="Talks"><meta name=twitter:description content="Just a list of some talks I&rsquo;ve given, saved here for future reference and for general public benefit.
-Lessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!"><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Talks","item":"https://yanirseroussi.com/talks/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Talks","name":"Talks","description":"Just a list of some talks I\u0026rsquo;ve given, saved here for future reference and for general public benefit.\nLessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!","keywords":[],"articleBody":"Just a list of some talks I’ve given, saved here for future reference and for general public benefit.\nLessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW! 
Data 2019; also available as a video) A day in the life of a remote data scientist (presented at Data Science Sydney meetup 2019; also available as a video) Ask Why! Finding motives, causes, and purpose in data science (presented at MeDaScIn 2016 and at Data Science Sydney meetup 2016; also available as a blog post and as a video) The hardest parts of data science (presented at Sydney Data Science Breakfast meetup 2015; also available as a blog post) The wonderful world of recommender systems (presented at Data Science Sydney meetup 2015; also available as a blog post) Gensim: Topic Modelling for Humans (an overview of the gensim package, presented at the Sydney Python meetup 2015) Demystifying data: An introduction to data science (presented as a General Assembly Workshop) How to (almost) win Kaggle competitions (presented at Data Science Sydney meetup 2014; also available as a blog post) High-level introduction to recommender systems (a much-shorter version of The wonderful world of recommender systems) How to avoid most sharding issues with MongoDB (presented at MongoDB Sydney Meetup 2013) Some issues we encountered with Mongo 2.2 (MongoDB Conference 2012 Lightning Talk) Authorship attribution with author-aware topic models (presented as a poster at ACL 2012) Personalised rating prediction for new users using latent factor models (presented at Hypertext 2011) ","wordCount":"299","inLanguage":"en","image":"https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp","datePublished":"0001-01-01T00:00:00Z","dateModified":"2023-12-12T14:31:21+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/talks/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body 
id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span class=active>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Talks</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager 
src=https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp alt="Yanir Seroussi giving a talk at DataEngBytes Brisbane 2023."></figure><div class=post-content><p>Just a list of some talks I&rsquo;ve given, saved here for future reference and for general public benefit.</p><ul><li><a href=https://docs.google.com/presentation/d/100GiDkp3UKfQtWtxZOF4CaJWTuSYtkEYxkI0_INdqq8/edit target=_blank rel=noopener>Lessons from reluctant data engineering</a> (presented at <a href=https://dataengconf.com.au/ target=_blank rel=noopener>DataEngBytes</a> Brisbane 2023; see <a href="https://www.youtube.com/watch?v=NE6e7Xx7OLQ" target=_blank rel=noopener>video</a> and <a href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/>post</a>)</li><li><a href=https://docs.google.com/presentation/d/1vi0YKxmevanE8zA6u2ZuA835boSXKMa-Su8LZmLA7EA/edit target=_blank rel=noopener>Data ethics – beyond curve fitting</a> (given as part of a local fast.ai course in June 2021; see <a href="https://www.youtube.com/watch?v=P1ebqJ4ZIEI" target=_blank rel=noopener>video</a> and <a href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/>post</a>)</li><li><a href="https://www.youtube.com/watch?v=tMFr_agPLJY" target=_blank rel=noopener>Moving Automattic to net zero carbon emissions</a> (PublishPress interview from November 2020)</li><li><a href="https://www.youtube.com/watch?v=79LfP8Kqgvw" target=_blank rel=noopener>Running remote data teams</a> (Data Futurology webinar from June 2020)</li><li><a href=https://yanirs.github.io/talks/bootstrapping-the-right-way/ target=_blank rel="noopener noreferrer">Bootstrapping the right way</a> (presented at <a href=https://yowconference.com/data/2019/ target=_blank rel="noopener noreferrer">YOW! 
Data 2019</a>; also available as <a href="https://www.youtube.com/watch?v=2wZXejYz-e0" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=https://yanirs.github.io/talks/remote-data-scientist/ target=_blank rel="noopener noreferrer">A day in the life of a remote data scientist</a> (presented at <a href=https://www.meetup.com/Data-Science-Sydney/ target=_blank rel="noopener noreferrer">Data Science Sydney</a> meetup 2019; also available as <a href="https://www.youtube.com/watch?v=5qbVEEtgWcY" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=https://yanirs.github.io/talks/ask-why/ target=_blank rel="noopener noreferrer">Ask Why! Finding motives, causes, and purpose in data science</a> (presented at <a href=http://www.datasciencemelbourne.com/medascin2016/ target=_blank rel="noopener noreferrer">MeDaScIn 2016</a> and at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2016; also available as <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>a blog post</a> and as <a href="http://www.youtube.com/watch?v=2wqu-drqlpo" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=http://yanirs.github.io/talks/the-hardest-part-of-data-science/ target=_blank rel="noopener noreferrer">The hardest parts of data science</a> (presented at <a href=http://www.meetup.com/The-Sydney-Data-Science-Breakfast-Meetup-Group/ target=_blank rel="noopener noreferrer">Sydney Data Science Breakfast</a> meetup 2015; also <a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/the-wonderful-world-of-recommender-systems/ target=_blank rel="noopener noreferrer">The wonderful world of recommender systems</a> (presented at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2015; also <a 
href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/gensim-overview/ target=_blank rel="noopener noreferrer">Gensim: Topic Modelling for Humans</a> (an overview of the <a href=http://radimrehurek.com/gensim/ target=_blank rel="noopener noreferrer">gensim</a> package, presented at the <a href=http://www.meetup.com/sydneypython/ target=_blank rel="noopener noreferrer">Sydney Python</a> meetup 2015)</li><li><a href=http://yanirs.github.io/talks/general-assembly-intro-to-data-science/ target=_blank rel=noopener>Demystifying data: An introduction to data science</a> (presented as a <a href=https://generalassemb.ly/education/demystifying-data-an-introduction-to-data-science target=_blank rel=noopener>General Assembly Workshop</a>)</li><li><a href=http://yanirs.github.io/talks/data-science-sydney-winning-kaggle/ target=_blank rel=noopener>How to (almost) win Kaggle competitions</a> (presented at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2014; also <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/high-level-recommender-systems-intro/ target=_blank rel=noopener>High-level introduction to recommender systems</a> (a much-shorter version of <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>The wonderful world of recommender systems</a>)</li><li><a href=http://prezi.com/aqk4kstbvg9v/how-to-avoid-most-sharding-issues/ target=_blank rel=noopener>How to avoid most sharding issues with MongoDB</a> (presented at MongoDB Sydney Meetup 2013)</li><li><a href=http://yanirs.github.io/talks/mongo2012.pdf target=_blank rel=noopener>Some issues we encountered with Mongo 2.2</a> (MongoDB Conference 2012 Lightning Talk)</li><li><a 
href=http://yanirs.github.io/talks/acl2012-poster.pdf target=_blank rel=noopener>Authorship attribution with author-aware topic models</a> (presented as a poster at <a href=http://acl2012.org/ target=_blank rel=noopener>ACL 2012</a>)</li><li><a href=http://yanirs.github.io/talks/ht2011-talk.pdf target=_blank rel=noopener>Personalised rating prediction for new users using latent factor models</a> (presented at <a href=http://www.ht2011.org/ target=_blank rel=noopener>Hypertext 2011</a>)</li></ul></div><footer class=post-footer><ul class=post-tags></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on x" href="https://x.com/intent/tweet/?text=Talks&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&amp;hashtags="><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&amp;title=Talks&amp;summary=Talks&amp;source=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 
17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&title=Talks"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 
29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on whatsapp" href="https://api.whatsapp.com/send?text=Talks%20-%20https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 
222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on telegram" href="https://telegram.me/share/url?text=Talks&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on ycombinator" href="https://news.ycombinator.com/submitlink?t=Talks&u=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 
238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
+Lessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!"><meta property="og:type" content="article"><meta property="og:url" content="https://yanirseroussi.com/talks/"><meta property="og:image" content="https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp"><meta property="article:section" content><meta property="article:modified_time" content="2024-01-16T09:56:03+10:00"><meta name=twitter:card content="summary_large_image"><meta name=twitter:image content="https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp"><meta name=twitter:title content="Talks"><meta name=twitter:description content="Just a list of some talks I&rsquo;ve given, saved here for future reference and for general public benefit.
+Lessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!"><script type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Talks","item":"https://yanirseroussi.com/talks/"}]}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"BlogPosting","headline":"Talks","name":"Talks","description":"Just a list of some talks I\u0026rsquo;ve given, saved here for future reference and for general public benefit.\nLessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW!","keywords":[],"articleBody":"Just a list of some talks I’ve given, saved here for future reference and for general public benefit.\nLessons from reluctant data engineering (presented at DataEngBytes Brisbane 2023; see video and post) Data ethics – beyond curve fitting (given as part of a local fast.ai course in June 2021; see video and post) Moving Automattic to net zero carbon emissions (PublishPress interview from November 2020) Running remote data teams (Data Futurology webinar from June 2020) Bootstrapping the right way (presented at YOW! 
Data 2019; also available as a video) A day in the life of a remote data scientist (presented at Data Science Sydney meetup 2019; also available as a video) Ask Why! Finding motives, causes, and purpose in data science (presented at MeDaScIn 2016 and at Data Science Sydney meetup 2016; also available as a blog post and as a video) The hardest parts of data science (presented at Sydney Data Science Breakfast meetup 2015; also available as a blog post) The wonderful world of recommender systems (presented at Data Science Sydney meetup 2015; also available as a blog post) Gensim: Topic Modelling for Humans (an overview of the gensim package, presented at the Sydney Python meetup 2015) Demystifying data: An introduction to data science (presented as a General Assembly Workshop) How to (almost) win Kaggle competitions (presented at Data Science Sydney meetup 2014; also available as a blog post) High-level introduction to recommender systems (a much-shorter version of The wonderful world of recommender systems) How to avoid most sharding issues with MongoDB (presented at MongoDB Sydney Meetup 2013) Some issues we encountered with Mongo 2.2 (MongoDB Conference 2012 Lightning Talk) Authorship attribution with author-aware topic models (presented as a poster at ACL 2012) Personalised rating prediction for new users using latent factor models (presented at Hypertext 2011) ","wordCount":"299","inLanguage":"en","image":"https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp","datePublished":"0001-01-01T00:00:00Z","dateModified":"2024-01-16T09:56:03+10:00","author":{"@type":"Person","name":"Yanir Seroussi"},"mainEntityOfPage":{"@type":"WebPage","@id":"https://yanirseroussi.com/talks/"},"publisher":{"@type":"Organization","name":"Yanir Seroussi | Data \u0026 AI for Nature","logo":{"@type":"ImageObject","url":"https://yanirseroussi.com/favicon.ico"}}}</script></head><body 
id=top><script>localStorage.getItem("pref-theme")==="dark"?document.body.classList.add("dark"):localStorage.getItem("pref-theme")==="light"?document.body.classList.remove("dark"):window.matchMedia("(prefers-color-scheme: dark)").matches&&document.body.classList.add("dark")</script><header class=header><nav class=nav><div class=logo><a href=https://yanirseroussi.com/ accesskey=h title="Yanir Seroussi | Data & AI for Nature (Alt + H)">Yanir Seroussi | Data & AI for Nature</a><div class=logo-switches><button id=theme-toggle accesskey=t title="(Alt + T)"><svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z"/></svg><svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="18" viewBox="0 0 24 24" fill="none" stroke="currentcolor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg></button></div></div><ul id=menu><li><a href=https://yanirseroussi.com/about/ title=About><span>About</span></a></li><li><a href=https://yanirseroussi.com/talks/ title=Talks><span class=active>Talks</span></a></li><li><a href=https://yanirseroussi.com/consult/ title=Consult><span>Consult</span></a></li></ul></nav></header><main class=main><article class=post-single><header class=post-header><h1 class="post-title entry-hint-parent">Talks</h1><div class=post-meta></div></header><figure class=entry-cover><img loading=eager 
src=https://yanirseroussi.com/fractional-chief-data-officer/assets/yanir-seroussi-dataengbytes-bne-2023.webp alt="Yanir Seroussi giving a talk at DataEngBytes Brisbane 2023."></figure><div class=post-content><p>Just a list of some talks I&rsquo;ve given, saved here for future reference and for general public benefit.</p><ul><li><a href=https://docs.google.com/presentation/d/100GiDkp3UKfQtWtxZOF4CaJWTuSYtkEYxkI0_INdqq8/edit target=_blank rel=noopener>Lessons from reluctant data engineering</a> (presented at <a href=https://dataengconf.com.au/ target=_blank rel=noopener>DataEngBytes</a> Brisbane 2023; see <a href="https://www.youtube.com/watch?v=NE6e7Xx7OLQ" target=_blank rel=noopener>video</a> and <a href=https://yanirseroussi.com/2023/10/25/lessons-from-reluctant-data-engineering/>post</a>)</li><li><a href=https://docs.google.com/presentation/d/1vi0YKxmevanE8zA6u2ZuA835boSXKMa-Su8LZmLA7EA/edit target=_blank rel=noopener>Data ethics – beyond curve fitting</a> (given as part of a local fast.ai course in June 2021; see <a href="https://www.youtube.com/watch?v=P1ebqJ4ZIEI" target=_blank rel=noopener>video</a> and <a href=https://yanirseroussi.com/2021/11/22/use-your-human-brain-to-avoid-artificial-intelligence-disasters/>post</a>)</li><li><a href="https://www.youtube.com/watch?v=tMFr_agPLJY" target=_blank rel=noopener>Moving Automattic to net zero carbon emissions</a> (PublishPress interview from November 2020)</li><li><a href="https://www.youtube.com/watch?v=79LfP8Kqgvw" target=_blank rel=noopener>Running remote data teams</a> (Data Futurology webinar from June 2020)</li><li><a href=https://yanirs.github.io/talks/bootstrapping-the-right-way/ target=_blank rel="noopener noreferrer">Bootstrapping the right way</a> (presented at <a href=https://yowconference.com/data/2019/ target=_blank rel="noopener noreferrer">YOW! 
Data 2019</a>; also available as <a href="https://www.youtube.com/watch?v=2wZXejYz-e0" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=https://yanirs.github.io/talks/remote-data-scientist/ target=_blank rel="noopener noreferrer">A day in the life of a remote data scientist</a> (presented at <a href=https://www.meetup.com/Data-Science-Sydney/ target=_blank rel="noopener noreferrer">Data Science Sydney</a> meetup 2019; also available as <a href="https://www.youtube.com/watch?v=5qbVEEtgWcY" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=https://yanirs.github.io/talks/ask-why/ target=_blank rel="noopener noreferrer">Ask Why! Finding motives, causes, and purpose in data science</a> (presented at <a href=http://www.datasciencemelbourne.com/medascin2016/ target=_blank rel="noopener noreferrer">MeDaScIn 2016</a> and at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2016; also available as <a href=https://yanirseroussi.com/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/>a blog post</a> and as <a href="http://www.youtube.com/watch?v=2wqu-drqlpo" target=_blank rel="noopener noreferrer">a video</a>)</li><li><a href=http://yanirs.github.io/talks/the-hardest-part-of-data-science/ target=_blank rel="noopener noreferrer">The hardest parts of data science</a> (presented at <a href=http://www.meetup.com/The-Sydney-Data-Science-Breakfast-Meetup-Group/ target=_blank rel="noopener noreferrer">Sydney Data Science Breakfast</a> meetup 2015; also <a href=https://yanirseroussi.com/2015/11/23/the-hardest-parts-of-data-science/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/the-wonderful-world-of-recommender-systems/ target=_blank rel="noopener noreferrer">The wonderful world of recommender systems</a> (presented at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2015; also <a 
href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/gensim-overview/ target=_blank rel="noopener noreferrer">Gensim: Topic Modelling for Humans</a> (an overview of the <a href=http://radimrehurek.com/gensim/ target=_blank rel="noopener noreferrer">gensim</a> package, presented at the <a href=http://www.meetup.com/sydneypython/ target=_blank rel="noopener noreferrer">Sydney Python</a> meetup 2015)</li><li><a href=http://yanirs.github.io/talks/general-assembly-intro-to-data-science/ target=_blank rel=noopener>Demystifying data: An introduction to data science</a> (presented as a <a href=https://generalassemb.ly/education/demystifying-data-an-introduction-to-data-science target=_blank rel=noopener>General Assembly Workshop</a>)</li><li><a href=http://yanirs.github.io/talks/data-science-sydney-winning-kaggle/ target=_blank rel=noopener>How to (almost) win Kaggle competitions</a> (presented at <a href=http://www.meetup.com/Data-Science-Sydney/ target=_blank rel=noopener>Data Science Sydney</a> meetup 2014; also <a href=https://yanirseroussi.com/2014/08/24/how-to-almost-win-kaggle-competitions/>available as a blog post</a>)</li><li><a href=http://yanirs.github.io/talks/high-level-recommender-systems-intro/ target=_blank rel=noopener>High-level introduction to recommender systems</a> (a much-shorter version of <a href=https://yanirseroussi.com/2015/10/02/the-wonderful-world-of-recommender-systems/>The wonderful world of recommender systems</a>)</li><li><a href=http://prezi.com/aqk4kstbvg9v/how-to-avoid-most-sharding-issues/ target=_blank rel=noopener>How to avoid most sharding issues with MongoDB</a> (presented at MongoDB Sydney Meetup 2013)</li><li><a href=http://yanirs.github.io/talks/mongo2012.pdf target=_blank rel=noopener>Some issues we encountered with Mongo 2.2</a> (MongoDB Conference 2012 Lightning Talk)</li><li><a 
href=http://yanirs.github.io/talks/acl2012-poster.pdf target=_blank rel=noopener>Authorship attribution with author-aware topic models</a> (presented as a poster at <a href=http://acl2012.org/ target=_blank rel=noopener>ACL 2012</a>)</li><li><a href=http://yanirs.github.io/talks/ht2011-talk.pdf target=_blank rel=noopener>Personalised rating prediction for new users using latent factor models</a> (presented at <a href=http://www.ht2011.org/ target=_blank rel=noopener>Hypertext 2011</a>)</li></ul></div><footer class=post-footer><ul class=post-tags></ul><ul class=share-buttons><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on x" href="https://x.com/intent/tweet/?text=Talks&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&amp;hashtags="><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446C483.971.0 512 28.03 512 62.554zM269.951 190.75 182.567 75.216H56L207.216 272.95 63.9 436.783h61.366L235.9 310.383l96.667 126.4H456L298.367 228.367l134-153.151H371.033zM127.633 110h36.468l219.38 290.065H349.5z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on linkedin" href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&amp;title=Talks&amp;summary=Talks&amp;source=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM160.461 423.278V197.561h-75.04v225.717h75.04zm270.539.0V293.839c0-69.333-37.018-101.586-86.381-101.586-39.804.0-57.634 21.891-67.617 37.266v-31.958h-75.021c.995 21.181.0 225.717.0 225.717h75.02V297.222c0-6.748.486-13.492 2.474-18.315 5.414-13.475 
17.767-27.434 38.494-27.434 27.135.0 38.007 20.707 38.007 51.037v120.768H431zM123.448 88.722C97.774 88.722 81 105.601 81 127.724c0 21.658 16.264 39.002 41.455 39.002h.484c26.165.0 42.452-17.344 42.452-39.002-.485-22.092-16.241-38.954-41.943-39.002z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on reddit" href="https://reddit.com/submit?url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f&title=Talks"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zM446 265.638c0-22.964-18.616-41.58-41.58-41.58-11.211.0-21.361 4.457-28.841 11.666-28.424-20.508-67.586-33.757-111.204-35.278l18.941-89.121 61.884 13.157c.756 15.734 13.642 28.29 29.56 28.29 16.407.0 29.706-13.299 29.706-29.701.0-16.403-13.299-29.702-29.706-29.702-11.666.0-21.657 6.792-26.515 16.578l-69.105-14.69c-1.922-.418-3.939-.042-5.585 1.036-1.658 1.073-2.811 2.761-3.224 4.686l-21.152 99.438c-44.258 1.228-84.046 14.494-112.837 35.232-7.468-7.164-17.589-11.591-28.757-11.591-22.965.0-41.585 18.616-41.585 41.58.0 16.896 10.095 31.41 24.568 37.918-.639 4.135-.99 8.328-.99 12.576.0 63.977 74.469 115.836 166.33 115.836s166.334-51.859 166.334-115.836c0-4.218-.347-8.387-.977-12.493 14.564-6.47 24.735-21.034 24.735-38.001zM326.526 373.831c-20.27 20.241-59.115 21.816-70.534 21.816-11.428.0-50.277-1.575-70.522-21.82-3.007-3.008-3.007-7.882.0-10.889 3.003-2.999 7.882-3.003 10.885.0 12.777 12.781 40.11 17.317 59.637 17.317 19.522.0 46.86-4.536 59.657-17.321 3.016-2.999 7.886-2.995 10.885.008 3.008 3.011 3.003 7.882-.008 10.889zm-5.23-48.781c-16.373.0-29.701-13.324-29.701-29.698.0-16.381 13.328-29.714 29.701-29.714 16.378.0 29.706 13.333 29.706 29.714.0 16.374-13.328 29.698-29.706 29.698zM160.91 295.348c0-16.381 13.328-29.71 29.714-29.71 16.369.0 29.689 13.329 29.689 29.71.0 16.373-13.32 
29.693-29.689 29.693-16.386.0-29.714-13.32-29.714-29.693z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on facebook" href="https://facebook.com/sharer/sharer.php?u=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H342.978V319.085h66.6l12.672-82.621h-79.272v-53.617c0-22.603 11.073-44.636 46.58-44.636H425.6v-70.34s-32.71-5.582-63.982-5.582c-65.288.0-107.96 39.569-107.96 111.204v62.971h-72.573v82.621h72.573V512h-191.104c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on whatsapp" href="https://api.whatsapp.com/send?text=Talks%20-%20https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="0 0 512 512" height="30" width="30" fill="currentcolor"><path d="M449.446.0C483.971.0 512 28.03 512 62.554v386.892C512 483.97 483.97 512 449.446 512H62.554c-34.524.0-62.554-28.03-62.554-62.554V62.554c0-34.524 28.029-62.554 62.554-62.554h386.892zm-58.673 127.703c-33.842-33.881-78.847-52.548-126.798-52.568-98.799.0-179.21 80.405-179.249 179.234-.013 31.593 8.241 62.428 23.927 89.612l-25.429 92.884 95.021-24.925c26.181 14.28 55.659 21.807 85.658 21.816h.074c98.789.0 179.206-80.413 179.247-179.243.018-47.895-18.61-92.93-52.451-126.81zM263.976 403.485h-.06c-26.734-.01-52.954-7.193-75.828-20.767l-5.441-3.229-56.386 14.792 15.05-54.977-3.542-5.637c-14.913-23.72-22.791-51.136-22.779-79.287.033-82.142 66.867-148.971 149.046-148.971 39.793.014 77.199 15.531 105.329 43.692 28.128 28.16 43.609 65.592 43.594 105.4-.034 82.149-66.866 148.983-148.983 148.984zm81.721-111.581c-4.479-2.242-26.499-13.075-30.604-14.571-4.105-1.495-7.091-2.241-10.077 2.241-2.986 4.483-11.569 14.572-14.182 17.562-2.612 2.988-5.225 3.364-9.703 1.12-4.479-2.241-18.91-6.97-36.017-22.23C231.8 264.15 
222.81 249.484 220.198 245s-.279-6.908 1.963-9.14c2.016-2.007 4.48-5.232 6.719-7.847 2.24-2.615 2.986-4.484 4.479-7.472 1.493-2.99.747-5.604-.374-7.846-1.119-2.241-10.077-24.288-13.809-33.256-3.635-8.733-7.327-7.55-10.077-7.688-2.609-.13-5.598-.158-8.583-.158-2.986.0-7.839 1.121-11.944 5.604-4.105 4.484-15.675 15.32-15.675 37.364.0 22.046 16.048 43.342 18.287 46.332 2.24 2.99 31.582 48.227 76.511 67.627 10.685 4.615 19.028 7.371 25.533 9.434 10.728 3.41 20.492 2.929 28.209 1.775 8.605-1.285 26.499-10.833 30.231-21.295 3.732-10.464 3.732-19.431 2.612-21.298-1.119-1.869-4.105-2.99-8.583-5.232z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on telegram" href="https://telegram.me/share/url?text=Talks&amp;url=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg viewBox="2 2 28 28" height="30" width="30" fill="currentcolor"><path d="M26.49 29.86H5.5a3.37 3.37.0 01-2.47-1 3.35 3.35.0 01-1-2.47V5.48A3.36 3.36.0 013 3 3.37 3.37.0 015.5 2h21A3.38 3.38.0 0129 3a3.36 3.36.0 011 2.46V26.37a3.35 3.35.0 01-1 2.47 3.38 3.38.0 01-2.51 1.02zm-5.38-6.71a.79.79.0 00.85-.66L24.73 9.24a.55.55.0 00-.18-.46.62.62.0 00-.41-.17q-.08.0-16.53 6.11a.59.59.0 00-.41.59.57.57.0 00.43.52l4 1.24 1.61 4.83a.62.62.0 00.63.43.56.56.0 00.4-.17L16.54 20l4.09 3A.9.9.0 0021.11 23.15zM13.8 20.71l-1.21-4q8.72-5.55 8.78-5.55c.15.0.23.0.23.16a.18.18.0 010 .06s-2.51 2.3-7.52 6.8z"/></svg></a></li><li><a target=_blank rel="noopener noreferrer" aria-label="share Talks on ycombinator" href="https://news.ycombinator.com/submitlink?t=Talks&u=https%3a%2f%2fyanirseroussi.com%2ftalks%2f"><svg width="30" height="30" viewBox="0 0 512 512" fill="currentcolor" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"><path d="M449.446.0C483.971.0 512 28.03 512 62.554V449.446C512 483.97 483.97 512 449.446 512H62.554C28.03 512 0 483.97.0 449.446V62.554C0 28.03 28.029.0 62.554.0H449.446zM183.8767 87.9921h-62.034L230.6673 292.4508V424.0079h50.6655V292.4508L390.1575 87.9921H328.1233L256 
238.2489z"/></svg></a></li></ul></footer></article></main><footer class=footer><span>Text and figures licensed under <a href=https://creativecommons.org/licenses/by-nc-nd/4.0/ target=_blank rel=noopener>CC BY-NC-ND 4.0</a> by <a href=https://yanirseroussi.com/about/>Yanir Seroussi</a>, except where noted otherwise  |</span>
 <span>Powered by
 <a href=https://gohugo.io/ rel="noopener noreferrer" target=_blank>Hugo</a> &
         <a href=https://github.com/adityatelange/hugo-PaperMod/ rel=noopener target=_blank>PaperMod</a></span></footer><div class=mailing-list-container><form class=mailing-list action="https://yanirseroussi.us17.list-manage.com/subscribe/post?u=3c08aa3ff27dd92978019febd&amp;id=bc3ab705af" method=post target=_blank novalidate><label for=mailing-list-email>Get new post notifications</label>