From 628f1043ee1615dd24d8e796ae6bfd8e83b7bc9d Mon Sep 17 00:00:00 2001 From: yanirs Date: Fri, 20 Dec 2024 06:47:47 +0000 Subject: [PATCH] deploy: 316b57b60fa85102271e350dfb8f3d5845117224 --- 2014/01/19/kaggle-beginner-tips/index.html | 6 ++++-- 2014/08/17/datas-hierarchy-of-needs/index.html | 6 ++++-- 2014/08/24/how-to-almost-win-kaggle-competitions/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2014/10/23/what-is-data-science/index.html | 6 ++++-- 2014/11/05/bcrecommender-traction-update/index.html | 6 ++++-- .../index.html | 6 ++++-- 2014/12/15/seo-mostly-about-showing-up/index.html | 6 ++++-- .../index.html | 6 ++++-- .../01/15/automating-parse-com-bulk-data-imports/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2015/03/22/the-long-road-to-a-lifestyle-business/index.html | 6 ++++-- 2015/04/24/my-divestment-from-fossil-fuels/index.html | 6 ++++-- .../index.html | 6 ++++-- .../06/06/hopping-on-the-deep-learning-bandwagon/index.html | 6 ++++-- .../index.html | 6 ++++-- 2015/07/31/goodbye-parse-com/index.html | 6 ++++-- 2015/08/24/you-dont-need-a-data-scientist-yet/index.html | 6 ++++-- .../the-wonderful-world-of-recommender-systems/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2015/11/23/the-hardest-parts-of-data-science/index.html | 6 ++++-- .../08/this-holiday-season-give-me-real-insights/index.html | 6 ++++-- 2016/01/24/the-joys-of-offline-data-collection/index.html | 6 ++++-- .../index.html | 6 ++++-- 2016/03/20/the-rise-of-greedy-robots/index.html | 6 ++++-- .../index.html | 6 ++++-- .../making-bayesian-ab-testing-more-accessible/index.html | 6 ++++-- 2016/08/04/is-data-scientist-a-useless-job-title/index.html | 6 ++++-- .../21/seven-ways-to-be-data-driven-off-a-cliff/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2017/09/02/state-of-bandcamp-recommender/index.html | 6 ++++-- .../index.html | 6 ++++-- 2018/07/22/defining-data-science-in-2018/index.html | 6 ++++-- .../03/reflections-on-remote-data-science-work/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2019/10/06/bootstrapping-the-right-way/index.html | 6 ++++-- .../a-day-in-the-life-of-a-remote-data-scientist/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2021/04/05/some-highlights-from-2020/index.html | 6 ++++-- 2021/10/07/my-work-with-automattic/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../12/causal-machine-learning-book-draft-review/index.html | 6 ++++-- 2022/12/11/chatgpt-is-transformative-ai/index.html | 6 ++++-- .../remaining-relevant-as-a-small-language-model/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../25/lessons-from-reluctant-data-engineering/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../avoiding-ai-complexity-first-write-no-code/index.html | 6 ++++-- 2024/03/04/two-types-of-startup-data-problems/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../04/22/assessing-a-startups-data-to-ai-health/index.html | 6 ++++-- .../mentorship-and-the-art-of-actionable-advice/index.html | 6 ++++-- 
.../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2024/06/17/ai-aint-gonna-save-you-from-bad-data/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../giving-up-on-the-minimum-viable-data-stack/index.html | 6 ++++-- 2024/08/26/ai-hype-ai-bullshit-and-the-real-deal/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- 2024/11/18/dont-build-ai-build-with-ai/index.html | 6 ++++-- .../index.html | 6 ++++-- 404.html | 2 +- about/index.html | 2 +- ...7282f7a3576dac5daaca4eefd96f9c48d6db522129863cc99c6.css} | 2 +- causal-inference-resources/index.html | 6 ++++-- consult/index.html | 2 +- contact/index.html | 2 +- data-to-ai-health-check/index.html | 2 +- deep-learning-resources/index.html | 6 ++++-- free-intro-call/index.html | 2 +- index.html | 2 +- kaggle/index.html | 6 ++++-- phd-work/index.html | 2 +- posts/index.html | 2 +- tags/analytics/index.html | 2 +- tags/artificial-intelligence/index.html | 2 +- tags/automattic/index.html | 2 +- tags/bandcamp/index.html | 2 +- tags/bcrecommender/index.html | 2 +- tags/blogging/index.html | 2 +- tags/books/index.html | 2 +- tags/bootstrapping/index.html | 2 +- tags/business/index.html | 2 +- tags/career/index.html | 2 +- tags/causal-inference/index.html | 2 +- tags/climate-change/index.html | 2 +- tags/cloudflare/index.html | 2 +- tags/confidence-intervals/index.html | 2 +- tags/consulting/index.html | 2 +- tags/data-business/index.html | 2 +- tags/data-engineering/index.html | 2 +- tags/data-hiring/index.html | 2 +- tags/data-science/index.html | 2 +- tags/data-strategy/index.html | 2 +- tags/data-visualisation/index.html | 2 +- tags/deep-learning/index.html | 2 +- tags/devops/index.html | 2 +- tags/divestment/index.html | 2 +- tags/economics/index.html | 2 +- tags/elasticsearch/index.html | 2 +- tags/energy-markets/index.html | 2 +- tags/environment/index.html | 2 +- tags/ethics/index.html | 2 +- tags/facebook/index.html | 2 +- tags/fast.ai/index.html | 2 +- tags/fossil-fuels/index.html | 2 +- tags/frequently-asked-questions/index.html | 2 +- tags/futurism/index.html | 2 +- tags/github/index.html | 2 +- tags/gradient-boosting/index.html | 2 +- tags/hackers/index.html | 2 +- tags/health/index.html | 2 +- tags/hugo/index.html | 2 +- tags/index.html | 2 +- tags/insights/index.html | 2 +- tags/javascript/index.html | 2 +- tags/kaggle-beginners/index.html | 2 +- tags/kaggle-competition/index.html | 2 +- tags/kaggle/index.html | 2 +- tags/linkedin/index.html | 2 +- tags/machine-intelligence/index.html | 2 +- tags/machine-learning/index.html | 2 +- tags/marine-science/index.html | 2 +- tags/marketing/index.html | 2 +- tags/mongodb/index.html | 2 +- tags/multi-label-classification/index.html | 2 +- tags/music-industry/index.html | 2 +- tags/music/index.html | 2 +- tags/nutrition/index.html | 2 +- tags/nutritionism/index.html | 2 +- tags/orkestra/index.html | 2 +- tags/parse.com/index.html | 2 +- tags/personal/index.html | 2 +- tags/phantomjs/index.html | 2 +- tags/politics/index.html | 2 +- tags/predictive-modelling/index.html | 2 +- tags/price-forecasting/index.html | 2 +- tags/productivity/index.html | 2 +- tags/quotes/index.html | 2 +- tags/recommender-systems/index.html | 2 +- tags/reef-life-survey/index.html | 2 +- 
tags/remote-work/index.html | 2 +- tags/science-communication/index.html | 2 +- tags/scikit-learn/index.html | 2 +- tags/scuba-diving/index.html | 2 +- tags/search-engine-optimisation/index.html | 2 +- tags/security/index.html | 2 +- tags/sentiment-analysis/index.html | 2 +- tags/software-engineering/index.html | 2 +- tags/split-testing/index.html | 2 +- tags/startups/index.html | 2 +- tags/statistics/index.html | 2 +- tags/sustainability/index.html | 2 +- tags/traction-book/index.html | 2 +- tags/video/index.html | 2 +- tags/web-development/index.html | 2 +- tags/wordpress/index.html | 2 +- talks/index.html | 2 +- til/2023/07/11/you-cant-save-time/index.html | 6 ++++-- .../making-a-til-section-with-hugo-and-papermod/index.html | 6 ++++-- til/2023/07/23/using-yubikey-for-ssh-access/index.html | 6 ++++-- .../the-rule-of-thirds-can-probably-be-ignored/index.html | 6 ++++-- .../email-notifications-on-public-github-commits/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../01/08/the-power-of-parasocial-relationships/index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../index.html | 6 ++++-- .../atomic-habits-is-full-of-actionable-advice/index.html | 6 ++++-- .../index.html | 6 ++++-- til/2024/04/11/linkedin-is-a-teachable-skill/index.html | 6 ++++-- .../05/25/adapting-to-the-economy-of-algorithms/index.html | 6 ++++-- til/2024/06/12/the-rules-of-the-passion-economy/index.html | 6 ++++-- til/2024/06/22/dealing-with-endless-data-changes/index.html | 6 ++++-- .../index.html | 6 ++++-- til/index.html | 2 +- 222 files changed, 603 insertions(+), 349 deletions(-) rename assets/css/{stylesheet.6ae6009622e3a7f627ec457944ddec46485d368a325461242eff3696affc09de.css => stylesheet.b367d98f2cecb7282f7a3576dac5daaca4eefd96f9c48d6db522129863cc99c6.css} (79%) diff --git a/2014/01/19/kaggle-beginner-tips/index.html b/2014/01/19/kaggle-beginner-tips/index.html index 80a3df44a..4b919a22e 100644 --- a/2014/01/19/kaggle-beginner-tips/index.html +++ b/2014/01/19/kaggle-beginner-tips/index.html @@ -1,10 +1,12 @@ Kaggle beginner tips | Yanir Seroussi | Data & AI for Startup Impact -

Kaggle beginner tips

These are a few points from an email I sent to members of the Data Science Sydney Meetup. I suppose other Kaggle beginners may find them useful.

My first steps when working on a new competition are:

  • Read all the instructions carefully to understand the problem. One important thing to look at is what measure is being optimised. For example, minimising the mean absolute error (MAE) may require a different approach from minimising the mean square error (MSE).
  • Read messages on the forum. Especially when joining a competition late, you can learn a lot from the problems other people had. And sometimes there’s even code to get you started (though code quality may vary and it’s not worth relying on).
  • Download the data and look at it a bit to understand it better, noting any insights you may have and things you would like to try. Even if you don’t know how to model something, knowing what you want to model is half of the solution. For example, in the DSG Hackathon (predicting air quality), we noticed that even though we had to produce hourly predictions for pollutant levels, the measured levels don’t change every hour (probably due to limitations in the measuring equipment). This led us to try a simple “model” for the first few hours, where we predicted exactly the last measured value, which proved to be one of our most valuable insights. Stupid and uninspiring, but we did finish 6th :-). The main message is: look at the data!
  • Set up a local validation environment. This will allow you to iterate quickly without making submissions, and will increase the accuracy of your model. For those with some programming experience: local validation is your private development environment, the public leaderboard is staging, and the private leaderboard is production.
    What you use for local validation depends on the type of problem. For example, for classic prediction problems you may use one of the classic cross-validation techniques. For forecasting problems, you should try to have a local setup that is as close as possible to the setup of the leaderboard. In the Yandex competition, the leaderboard is based on data from the last three days of search activity. You should use a similar split for the training data (and of course, use exactly the same local setup for all the team members so you can compare results). A minimal sketch of such a setup appears after this list.
  • Get the submission format right. Make sure that you can reproduce the baseline results locally.
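To make the local validation point concrete, here is a minimal sketch of a leaderboard-mimicking setup for a forecasting-style problem. It assumes a pandas DataFrame with a timestamp column and a target column; the column names and the cutoff are made up for illustration and aren't taken from any specific competition.

```python
# Minimal local-validation sketch for a forecasting-style problem.
# Assumes a pandas DataFrame `df` with "timestamp", feature columns, and "target"
# (all names are hypothetical -- adapt them to the competition at hand).
import pandas as pd
from sklearn.metrics import mean_absolute_error


def time_based_split(df: pd.DataFrame, cutoff: str):
    """Mimic the leaderboard by holding out everything on or after `cutoff`."""
    train = df[df["timestamp"] < cutoff]
    valid = df[df["timestamp"] >= cutoff]
    return train, valid


def local_score(model, df: pd.DataFrame, features: list, cutoff: str) -> float:
    """Fit on the past, score on the held-out recent period (lower MAE is better)."""
    train, valid = time_based_split(df, cutoff)
    model.fit(train[features], train["target"])
    return mean_absolute_error(valid["target"], model.predict(valid[features]))
```

The same two functions can be reused by all team members, which keeps everyone's local scores comparable.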

Now, the way things often work is:

  • You try many different approaches and ideas. Most of them lead to nothing. Hopefully some lead to something.
  • Create ensembles of the various approaches.
  • Repeat until you run out of time.
  • Win. Hopefully.

Note that in many competitions, the differences between the top results are not statistically significant, so winning may depend on luck. But getting one of the top results also depends to a large degree on your persistence. To avoid disappointment, I think the main goal should be to learn things, so spend time trying to understand how the methods that you’re using work. Libraries like sklearn make it really easy to try a bunch of models without understanding how they work, but you’re better off trying fewer things and developing the ability to reason about why they do or don’t work.

An analogy for programmers: while you can use an array, a linked list, a binary tree, and a hash table interchangeably in some situations, understanding when to use each one can make a world of difference in terms of performance. It’s pretty similar for predictive models (though they are often not as well-behaved as data structures).

Finally, it’s worth watching this video by Phil Brierley, who won a bunch of Kaggle competitions. It’s really good, and doesn’t require much understanding of R.

Any comments are welcome!


    Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

    Hi Yanir!

    I have a question.

    When you say “For example, minimising the mean absolute error (MAE) may require a different approach from minimising the mean square error (MSE)”, can you explain what kind of approach (or methods, or rules of thumb) you would use to minimise MAE or MSE in machine learning?

    Thanks for your time in advance!

    Regards,

    Flavio

    Hi Flavio!

    The optimisation approach depends on the data and method you’re using.

    A basic example is when you don’t have any features, only a sample of target values. In that case, if you want to minimise the MAE you should choose the sample median, and if you want to minimise the MSE you should choose the sample mean. Here’s proof why: https://www.dropbox.com/s/b1195thcqebnxyn/mae-vs-rmse.pdf
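    For reference, here is a rough sketch of why this holds when predicting a single constant c with no features (a standard result, stated without the edge cases for even sample sizes):

```latex
% Why the mean minimises MSE and the median minimises MAE for a constant prediction c.
% MSE: set the derivative to zero.
\frac{\mathrm{d}}{\mathrm{d}c}\sum_{i=1}^{n}(y_i - c)^2
  = -2\sum_{i=1}^{n}(y_i - c) = 0
  \;\Longrightarrow\; c = \frac{1}{n}\sum_{i=1}^{n} y_i \quad \text{(the sample mean)}

% MAE: away from the data points, the derivative counts how many y_i lie on each side of c,
% so the minimum is reached where half the points are below c and half above.
\frac{\mathrm{d}}{\mathrm{d}c}\sum_{i=1}^{n}\lvert y_i - c\rvert
  = \#\{i : y_i < c\} - \#\{i : y_i > c\}
  \;\Longrightarrow\; c = \operatorname{median}(y_1, \dots, y_n)
```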

    For more complex problems, if you’re using a machine learning package you can often specify the type of loss function to minimise (see https://en.wikipedia.org/wiki/Loss_function#Selecting_a_loss_function). But even if your measure isn’t directly optimised (e.g., MAE is harder to minimise than MSE because it’s not differentiable at zero), you can always do cross-validation to find the parameters that optimise it.

    I hope this helps.

    Hi Yanir!

    I appreciate your work! I need to know: should I jump directly into machine learning algorithms and programming, or first master maths and statistics? I am new to this field.


    \ No newline at end of file diff --git a/2014/08/17/datas-hierarchy-of-needs/index.html b/2014/08/17/datas-hierarchy-of-needs/index.html index b65a42179..1e8a7718c 100644 --- a/2014/08/17/datas-hierarchy-of-needs/index.html +++ b/2014/08/17/datas-hierarchy-of-needs/index.html @@ -1,5 +1,5 @@ Data’s hierarchy of needs | Yanir Seroussi | Data & AI for Startup Impact -

    Data’s hierarchy of needs

    One of my favourite blog posts in recent times is The Log: What every software engineer should know about real-time data’s unifying abstraction by Jay Kreps. That post comprehensively describes how abstracting all the data produced by LinkedIn’s various components into a single log pipeline greatly simplified their architecture and enabled advanced data-driven applications. Among the various technical details there are some beautifully-articulated business insights. My favourite one defines data’s hierarchy of needs:

    Effective use of data follows a kind of Maslow’s hierarchy of needs. The base of the pyramid involves capturing all the relevant data, being able to put it together in an applicable processing environment (be that a fancy real-time query system or just text files and python scripts). This data needs to be modeled in a uniform way to make it easy to read and process. Once these basic needs of capturing data in a uniform way are taken care of it is reasonable to work on infrastructure to process this data in various ways—MapReduce, real-time query systems, etc.

    It’s worth noting the obvious: without a reliable and complete data flow, a Hadoop cluster is little more than a very expensive and difficult to assemble space heater. Once data and processing are available, one can move concern on to more refined problems of good data models and consistent well understood semantics. Finally, concentration can shift to more sophisticated processing—better visualization, reporting, and algorithmic processing and prediction.

    In my experience, most organizations have huge holes in the base of this pyramid—they lack reliable complete data flow—but want to jump directly to advanced data modeling techniques. This is completely backwards. [emphasis mine]

    Visually, it looks something like a pyramid: reliable data collection and flow at the base, infrastructure and uniform data models in the middle, and sophisticated processing (visualisation, reporting, and prediction) at the top.


      \ No newline at end of file diff --git a/2014/08/24/how-to-almost-win-kaggle-competitions/index.html b/2014/08/24/how-to-almost-win-kaggle-competitions/index.html index 43d44aa67..e477e6604 100644 --- a/2014/08/24/how-to-almost-win-kaggle-competitions/index.html +++ b/2014/08/24/how-to-almost-win-kaggle-competitions/index.html @@ -1,12 +1,14 @@ How to (almost) win Kaggle competitions | Yanir Seroussi | Data & AI for Startup Impact -

      How to (almost) win Kaggle competitions

      Last week, I gave a talk at the Data Science Sydney Meetup group about some of the lessons I learned through almost winning five Kaggle competitions. The core of the talk was ten tips, which I think are worth putting in a post (the original slides are here). Some of these tips were covered in my beginner tips post from a few months ago. Similar advice was also recently published on the Kaggle blog – it’s great to see that my tips are in line with the thoughts of other prolific kagglers.

      Tip 1: RTFM

      It’s surprising to see how many people miss out on important details, such as remembering the final date to make the first submission. Before jumping into building models, it’s important to understand the competition timeline, be able to reproduce benchmarks, generate the correct submission format, etc.

      Tip 2: Know your measure

      A key part of doing well in a competition is understanding how the measure works. It’s often easy to obtain significant improvements in your score by using an optimisation approach that is suitable to the measure. A classic example is optimising the mean absolute error (MAE) versus the mean square error (MSE). It’s easy to show that given no other data for a set of numbers, the predictor that minimises the MAE is the median, while the predictor that minimises the MSE is the mean. Indeed, in the EMC Data Science Hackathon we fell back to the median rather than the mean when there wasn’t enough data, and that ended up working pretty well.
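      A quick numerical illustration of that claim, using a skewed synthetic sample (nothing to do with the Hackathon data):

```python
# Quick check that the median minimises MAE and the mean minimises MSE
# on a skewed sample (synthetic data, purely illustrative).
import numpy as np

rng = np.random.default_rng(0)
y = rng.lognormal(mean=0.0, sigma=1.0, size=10_000)  # skewed, so mean != median

for name, c in [("mean", y.mean()), ("median", np.median(y))]:
    mae = np.abs(y - c).mean()
    mse = ((y - c) ** 2).mean()
    print(f"{name:>6}: MAE={mae:.3f}  MSE={mse:.3f}")
# The mean achieves the lower MSE; the median achieves the lower MAE.
```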

      Tip 3: Know your data

      In Kaggle competitions, overspecialisation (without overfitting) is a good thing. This is unlike academic machine learning papers, where researchers often test their proposed method on many different datasets. This is also unlike more applied work, where you may care about data drifting and whether what you predict actually makes sense. Examples include the Hackathon, where the measures of pollutants in the air were repeated for consecutive hours (i.e., they weren’t really measured); the multi-label Greek article competition, where I found connected components of labels (doesn’t generalise well to other datasets); and the Arabic writers competition, where I used histogram kernels to deal with the features that we were given. The general lesson is that custom solutions win, and that’s why the world needs data scientists (at least until we are replaced by robots).
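      For the curious, the connected-components trick boils down to something like the sketch below. The toy label sets are made up and the original competition code looked different, but the idea is the same: link labels that co-occur on an article and take the components of that graph.

```python
# Sketch of finding connected components of co-occurring labels
# (illustrative only -- not the original competition code).
# Assumes `article_labels` is a list of label sets, one per training article.
import networkx as nx

article_labels = [{"politics", "economy"}, {"economy", "markets"}, {"sport"}]

g = nx.Graph()
for labels in article_labels:
    labels = sorted(labels)
    g.add_nodes_from(labels)
    # Connect every pair of labels that appear on the same article.
    g.add_edges_from((a, b) for i, a in enumerate(labels) for b in labels[i + 1:])

components = list(nx.connected_components(g))
print(components)  # e.g. [{'economy', 'markets', 'politics'}, {'sport'}]
```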

      Tip 4: What before how

      It’s important to know what you want to model before figuring out how to model it. It seems like many beginners tend to worry too much about which tool to use (Python or R? Logistic regression or SVMs?), when they should be worrying about understanding the data and what useful patterns they want to capture. For example, when we worked on the Yandex search personalisation competition, we spent a lot of time looking at the data and thinking what makes sense for users to be doing. In that case it was easy to come up with ideas, because we all use search engines. But the main message is that to be effective, you have to become one with the data.

      Tip 5: Do local validation

      This is a point I covered in my Kaggle beginner tips post. Having a local validation environment allows you to move faster and produce more reliable results than when relying on the leaderboard. The main scenarios in which you should skip local validation are when the data is too small (a problem I had in the Arabic writers competition), or when you run out of time (towards the end of the competition).

      Tip 6: Make fewer submissions

      In addition to making you look good, making few submissions reduces the likelihood of overfitting the leaderboard, which is a real problem. If your local validation is set up well and is consistent with the leaderboard (which you need to test by making one or two submissions), there’s really no need to make many submissions. Further, if you’re doing well, making submissions erodes your competitive advantage by showing your competitors what scores are obtainable and motivating them to work harder. Just resist the urge to submit, unless you have a really good reason.

      Tip 7: Do your research

      For any given problem, it’s likely that there are people dedicating their lives to its solution. These people (often academics) have probably published papers, benchmarks and code, which you can learn from. Unlike actually winning, which is not only dependent on you, gaining deeper knowledge and understanding is the only sure reward of a competition. This has worked well for me, as I’ve learned something new and applied it successfully in nearly every competition I’ve worked on.

      Tip 8: Apply the basics rigorously

      While playing with obscure methods can be a lot of fun, it’s often the case that the basics will get you very far. Common algorithms have good implementations in most major languages, so there’s really no reason not to try them. However, note that when you do try any methods, you must do some minimal tuning of the main parameters (e.g., number of trees in a random forest or the regularisation of a linear model). Running a method without minimal tuning is worse than not running it at all, because you may get a false negative – giving up on a method that actually works very well.
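      As a rough illustration of what “minimal tuning” means in practice, something like the following is usually enough (the grids below are arbitrary; the point is to sweep the one or two parameters that matter before judging a method):

```python
# Minimal tuning of the main parameters before accepting or rejecting a method
# (illustrative parameter grids -- adjust to the problem at hand).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)

searches = {
    "random forest": GridSearchCV(
        RandomForestClassifier(random_state=0),
        {"n_estimators": [100, 300, 1000]}, cv=5),
    "logistic regression": GridSearchCV(
        LogisticRegression(max_iter=1000),
        {"C": [0.01, 0.1, 1.0, 10.0]}, cv=5),
}
for name, search in searches.items():
    search.fit(X, y)
    print(name, search.best_params_, round(search.best_score_, 3))
```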

      An example of applying the basics rigorously is in the classic paper In defense of one-vs-all classification, where the authors showed that the simple one-vs-all (OVA) approach to multiclass classification is at least as good as approaches that are much more sophisticated. In their words: “What we find is that although a wide array of more sophisticated methods for multiclass classification exist, experimental evidence of the superiority of these methods over a simple OVA scheme is either lacking or improperly controlled or measured”. If such a failure to perform proper experiments can happen to serious machine learning researchers, it can definitely happen to the average kaggler. Don’t let it happen to you.
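      Running such an OVA baseline takes only a few lines with scikit-learn, for example (using a built-in toy dataset):

```python
# One-vs-all (OVA) baseline for a multiclass problem.
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier

X, y = load_digits(return_X_y=True)
ova = OneVsRestClassifier(LogisticRegression(max_iter=1000))
print(round(cross_val_score(ova, X, y, cv=5).mean(), 3))
```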

      Tip 9: The forum is your friend

      It’s very important to subscribe to the forum to receive notifications on issues with the data or the competition. In addition, it’s worth trying to figure out what your competitors are doing. An extreme example is the recent trend of code sharing during the competition (which I don’t really like) – while it’s not a good idea to rely on such code, it’s important to be aware of its existence. Finally, reading the post-competition summaries on the forum is a valuable way of learning from the winners and improving over time.

      Tip 10: Ensemble all the things

      Not to be confused with ensemble methods (which are also very important), the idea here is to combine models that were developed independently. In high-profile competitions, it is often the case that teams merge and gain a significant boost from combining their models. This is worth doing even when competing alone, because almost no competition is won by a single model.
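      In its simplest form, this kind of combination is just a (weighted) average of the predictions produced by separately developed models, along these lines:

```python
# The simplest form of combining independently developed models:
# a (weighted) average of their predictions for the same test set.
import numpy as np


def blend(prediction_arrays, weights=None):
    """Average per-model prediction arrays of identical shape."""
    stacked = np.stack(prediction_arrays)  # shape: (n_models, n_samples)
    return np.average(stacked, axis=0, weights=weights)


# Example: three teammates' predictions, weighted by trust in each model.
p1, p2, p3 = np.array([0.2, 0.9]), np.array([0.3, 0.8]), np.array([0.25, 0.95])
print(blend([p1, p2, p3], weights=[0.5, 0.3, 0.2]))
```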


        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

        Can you elaborate on what you mean in Tip 5 by stating “The main scenarios in which you should skip local validation are when the data is too small…”? What I experienced is that with too few observations, the leaderboard becomes very misleading, so my intuition would be to use more local validation for small datasets, not less.

        Good point. What I was referring to are scenarios where local validation is unreliable.

        For example, in the Arabic writer identification competition (http://blog.kaggle.com/2012/04/29/on-diffusion-kernels-histograms-and-arabic-writer-identification/), each of the 204 writers had only two training paragraphs (all containing the same text), while the test/leaderboard instances were a third paragraph with different content. I tried many forms of local validation but none of them yielded results that were consistent with the leaderboard, so I ended up relying on the leaderboard score.

        Ah, thanks, that clarifies what you meant. The (currently still running) Africa Soil Property contest (https://www.kaggle.com/c/afsis-soil-properties) seems a bit similar. I won’t put much more energy into that contest, but I am curious how it will work out in the end, and what things will have worked for the winners (maybe not much except pure luck).
        Could you provide some tips on #3 (‘Know your data’) with respect to best-practice visualisations for gaining insights from data, especially considering the fact that datasets often have a large number of features? Plotting feature vs. label graphs does seem to be helpful, but for a large number of features it becomes impractical. So how should one go about data analysis via visualisation?

        It really depends on the dataset. For personal use, I don’t worry too much about pretty visualisations. Often just printing some summary statistics works well.

        Most text classification problems are hard to visualise. If, for example, you use bag of words (or n-grams) as your feature set, you could just print the top words for each label, or the top words that vary between labels. Another thing to look at would be commonalities between misclassified instances – these could be dependent on the content of the texts or their length.
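        As a tiny illustration of the “top words per label” idea (the mini corpus below is made up):

```python
# Sketch: print the most frequent words per label for a text dataset
# (made-up mini corpus, purely illustrative).
from collections import Counter

docs = [
    ("sport", "goal scored in the final minute"),
    ("sport", "the striker signed a new contract"),
    ("politics", "parliament passed the budget bill"),
    ("politics", "the election results were announced"),
]

counts = {}
for label, text in docs:
    counts.setdefault(label, Counter()).update(text.lower().split())

for label, counter in counts.items():
    print(label, counter.most_common(3))
```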

        Examples:

        • In the Greek Media Monitoring competition (http://yanirseroussi.com/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/), I found that ‘Despite being manually annotated, the data isn’t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten “articles” with a single word. Five of these articles have the word 68839, but each of these five was given a different label.’ – this was discovered by just printing some summary statistics and looking at misclassified instances
        • Looking into the raw data behind one of the widely-used sentiment analysis datasets, I found an issue that was overlooked by many other people who used the dataset: http://www.cs.cornell.edu/people/pabo/movie-review-data/ (look for the comment with my name – found four years after the original dataset was published)

        I hope this helps.

        Thanks a lot. So to summarize, the following could be the 3 main approaches:

        1. Using summary statistics such as means, standard deviations, and variances, and looking out for outliers in the data
        2. Looking at misclassified instances during validation to find some sort of pattern in them
        3. Looking at label-specific raw data

        I apologize for the long overdue response, and thanks for these tips. This will surely be useful in my next Kaggle competition.
        Reblogged this on Dr. Manhattan's Diary and commented: Very good read on how to win in a Kaggle competition. Useful hints!

        I’m starting to dive into Kaggle competitions right now, and I’m having trouble with some of the simple practical considerations. For example, what IDE should I be using - IPython Notebooks? Where should I store the data? My personal computer surely doesn’t have 50GB+ space to spare. How long should I wait for a script to run before I deem it as “broken”?

        Any advice here would be greatly appreciated!

        Thanks for your comment, Derek. These are all good questions, but the answers really depend on the problem you’re working on and what you’re comfortable with.

        Personally, I find IPython Notebooks useful for playing around with the data and for documenting/storing throwaway code. However, once you have code that is more complex or code that you want to rerun, it’s better to save it in separate files. For editing these files, I use PyCharm.

        Not all problems require a large hard drive. If you are working on a large dataset, you can either purchase an external drive, or hire an instance from a cloud provider like AWS or DigitalOcean. The latter is generally cheaper than AWS, but they don’t offer GPUs. If you are working with a remote server, you can run IPython Notebook on the server and work from your browser.

        Regarding waiting for a script, for many models you can first build a simple version to test that everything works (e.g., build a random forest with just a few trees or train a neural network for a few iterations). If everything works well, you can run the full version. If the run time is long, it’s a good idea to take snapshots of the model and monitor performance on a hold-out set to ensure that you’re not wasting time overfitting.
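        One way to do the “simple version first, then monitor” routine with scikit-learn is to grow a random forest gradually with warm_start and print the hold-out score as it grows (illustrative only; the same idea applies to neural network training iterations):

```python
# Grow a model gradually and watch a hold-out score as it trains (illustrative).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

model = RandomForestClassifier(n_estimators=10, warm_start=True, random_state=0)
for n_trees in [10, 50, 100, 300]:
    model.n_estimators = n_trees  # add trees without retraining the existing ones
    model.fit(X_tr, y_tr)
    print(n_trees, round(model.score(X_val, y_val), 3))
```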

        Thanks for the quick response! I think I’ll be using Sublime Text on EC2 with S3 in the short term, and possibly move on to Amazon ML with Redshift in the future. I’ll probably take snapshots by outputting results to the console or Matplotlib every once in a while, so that’s great advice as well.



        \ No newline at end of file diff --git a/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html b/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html index f9211c290..3580d52fc 100644 --- a/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html +++ b/2014/08/30/building-a-bandcamp-recommender-system-part-1-motivation/index.html @@ -1,11 +1,13 @@ Building a Bandcamp recommender system (part 1 – motivation) | Yanir Seroussi | Data & AI for Startup Impact -

        Building a Bandcamp recommender system (part 1 – motivation)

        I’ve been a Bandcamp user for a few years now. I love the fact that they pay out a significant share of the revenue directly to the artists, unlike other services. In addition, despite the fact that fans may stream all the music for free and even easily rip it, almost $80M has been paid out to artists through Bandcamp to date (including almost $3M in the last month) – serving as strong evidence that the traditional music industry’s fight against piracy is a waste of resources and time.

        One thing I’ve been struggling with since starting to use Bandcamp is the discovery of new music. Originally (in 2011), I used the browse-by-tag feature, but it is often too broad to find music that I like. A newer feature is the Discoverinator, which is meant to emulate the experience of browsing through covers at a record store – sadly, I could never find much stuff I liked using that method. Last year, Bandcamp announced Bandcamp for fans, which includes the ability to wishlist items and discover new music by stalking/following other fans. In addition, they released a mobile app, which made the music purchased on Bandcamp much easier to access.

        All these new features definitely increased my engagement and helped me find more stuff to listen to, but I still feel that Bandcamp music discovery could be much better. Specifically, I would love to be served personalised recommendations and be able to browse music that is similar to specific tracks and albums that I like. Rather than waiting for Bandcamp to implement these features, I decided to do it myself. Visit BCRecommender – Bandcamp recommendations based on your fan account to see where this effort stands at the moment.

        While BCRecommender has already helped me discover new music to add to my collection, building it gave me many more ideas on how it can be improved, so it’s definitely a work in progress. I’ll probably tinker with the underlying algorithms as I go, so recommendations may occasionally seem weird (but this always seems to be the case with recommender systems in the real world). In subsequent posts I’ll discuss some of the technical details and where I’d like to take this project.


        It’s probably worth noting that BCRecommender is not associated with or endorsed by Bandcamp, but I doubt they would mind since it was built using publicly-available information, and is full of links to buy the music back on their site.


          Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

          Hi!

          I just found these articles a few years after their publication… I saw that BCRecommender doesn’t seem to be active anymore and that the last post is from 2015.

          Any update? I’m interested in your feedback.

          Thanks,

          Clément


          \ No newline at end of file diff --git a/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html b/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html index 69a312cab..036d6a0aa 100644 --- a/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html +++ b/2014/09/07/building-a-recommender-system-on-a-shoestring-budget/index.html @@ -1,9 +1,11 @@ Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout) | Yanir Seroussi | Data & AI for Startup Impact -

          Building a recommender system on a shoestring budget (or: BCRecommender part 2 – general system layout)

          This is the second part of a series of posts on my BCRecommender – personalised Bandcamp recommendations project. Check out the first part for the general motivation behind this project.

          BCRecommender is a hobby project whose main goal is to help me find music I like on Bandcamp. Its secondary goal is to serve as a testing ground for ideas I have and things I’d like to explore.
          One question I’ve been wondering about is: how much money does one need to spend on infrastructure for a simple web-based product before it reaches meaningful traffic?
          The answer is: not much at all. It can easily be done for less than $1 per month.
          This post discusses my exploration of this question by describing the main components of the BCRecommender system, without getting into the algorithms that drive it (which will be covered in subsequent posts).

          The general flow of BCRecommender is fairly simple: crawl publicly-available data from Bandcamp (fan collections and tracks/albums = tralbums), generate recommendations based on this data (static lists of tralbums indexed by fan for personalised recommendations and by tralbum for similarity), and present the recommendations to users in a way that’s easy to browse and explore (since we’re dealing with music it must be playable, which is easy to achieve by embedding Bandcamp’s iframes).
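          To make that flow concrete without giving away the algorithmic details (which later posts cover), here is a toy sketch of turning fan collections into a precomputed item-similarity index. The data and the cosine-style scoring below are illustrative, not the actual BCRecommender code:

```python
# Sketch: turn fan collections into a simple item-item similarity index.
# NOT the actual BCRecommender algorithm -- just an illustration of the
# "crawl -> precompute -> serve static lists" flow.
from collections import defaultdict
from itertools import combinations
import math

# fan -> set of tralbum ids they own (toy data)
collections = {
    "fan_a": {"t1", "t2", "t3"},
    "fan_b": {"t2", "t3", "t4"},
    "fan_c": {"t1", "t3"},
}

co_counts = defaultdict(int)    # (tralbum, tralbum) -> number of shared fans
item_counts = defaultdict(int)  # tralbum -> number of fans who own it
for tralbums in collections.values():
    for t in tralbums:
        item_counts[t] += 1
    for a, b in combinations(sorted(tralbums), 2):
        co_counts[(a, b)] += 1

# Cosine-style similarity, then keep the top neighbours per tralbum.
similar = defaultdict(list)
for (a, b), c in co_counts.items():
    sim = c / math.sqrt(item_counts[a] * item_counts[b])
    similar[a].append((sim, b))
    similar[b].append((sim, a))

for t, neighbours in similar.items():
    print(t, sorted(neighbours, reverse=True)[:10])
```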

          First iteration: Django & AWS

          The first iteration of the project was implemented as a Django project. Having never built a Django project from scratch, I figured this would be a good way to learn how it’s done properly. One thing I was keen on learning was using the Django ORM with an SQL database (in the past I’ve worked with Django and MongoDB). This ended up working less smoothly than I expected, perhaps because I’m too used to MongoDB, or because SQL forces you to model your data in unnatural ways, or because I insisted on using SQLite for simplicity. Whatever it was, I quickly started missing MongoDB, despite its flaws.

          I chose AWS for hosting because my personal account was under the free tier, and using a micro instance is more than enough for serving a website with no traffic. I considered Google App Engine with its indefinite free tier, but after reading the docs I realised I don’t want to jump through so many hoops to use their system – Google’s free tier was likely to cost too much in pain and time.

          While an AWS micro instance is enough for serving the recommendations, it’s not enough for generating them. Rather than paying Amazon for another instance, I figured that using spare capacity on my own laptop (quad-core with 16GB of RAM) would be good enough. So the backend worker for BCRecommender ended up being a local virtual machine using one core and 4GB of RAM.

          After some coding I had a nice setup in place:

          • AWS webserver running Django with SQLite as the database layer and a simple frontend, styled with Bootstrap
          • Local backend worker running Celery under Supervisor to collect the data (with errors reported to a dedicated Gmail account), Dropbox for backups, and Django management commands to generate the recommendations
          • Code and issue tracker hosted on Bitbucket (which provides free private repositories)
          • Fabric scripts for deployments to the AWS webserver and the local backend worker (including database sync as one big SQLite file)
          • Local virtual machine for development (provisioned with Vagrant)

          This system wasn’t going to scale, but I didn’t care. I just used it to discover new music, and it worked. I didn’t even bother registering a domain name, so it was all running for free.

          Second iteration: “Django” backend & Parse

          A few months ago, Facebook announced that Parse’s free tier will include 30 requests / second. That’s over 2.5 million requests per day, which is quite a lot – probably enough to run the majority of websites on the internet. It seemed too good to be true, so I had to try it myself.

          It took a few hours to convert the Django webserver/frontend code to Parse. This was fairly straightforward, and it had the added advantages of getting rid of some deployment scripts and having a more solid development environment. Parse supplies a command-line tool for deployment that constantly syncs the code to an app that is identical to the production app – much better than the Fabric script I had.

          The disadvantages of the move to Parse were having to rewrite some of the backend in JavaScript (= less readable than Python), and a more complex data sync command (no longer just copying a big SQLite file). However, I would definitely use it for other projects because of the generous free tier, the availability of APIs for all major platforms, and the elimination of most operational concerns.

          Current iteration: Goodbye Django, hello BCRecommender

          With the Django webserver out of the way, there was little use left for Django in the project. It took a few more hours to get rid of it, replacing the management commands with Commandr, and the SQLite database with MongoDB (wrapped with the excellent MongoEngine, which has matured a lot in recent years). MongoDB has become a more natural choice now, since it is the database used by Parse. I expect this setup of a local Python backend and a Parse frontend to work quite well (and remain virtually free) for the foreseeable future.
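For a flavour of what the new backend pieces look like, here is a minimal sketch of a MongoEngine document and a Commandr command – the field names are made up, and the Commandr usage is from memory, so treat it as an approximation rather than the real code:

```python
# Illustrative sketch: a MongoEngine document plus a Commandr command replacing
# what used to be a Django management command. Field names are hypothetical.
from mongoengine import Document, ListField, StringField, connect
from commandr import Run, command

connect("bcrecommender")  # local MongoDB instance

class Tralbum(Document):
    tralbum_id = StringField(primary_key=True)
    kind = StringField(choices=("track", "album"))
    title = StringField()
    artist = StringField()
    tags = ListField(StringField())

@command("list_tralbums")
def list_tralbums(limit=10):
    """Print a few stored tralbums (stand-in for the real recommendation commands)."""
    for tralbum in Tralbum.objects[:limit]:
        print(tralbum.title)

if __name__ == "__main__":
    Run()
```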

          The only fixed cost I now have comes from registering the bcrecommender.com domain and managing it with Route 53. This wasn’t required when I was running it only for myself, and I could have just kept it under bcrecommender.parseapp.com, but I think it would be useful for other Bandcamp users. I would also like to use it as a training lab to improve my (poor) marketing skills – not having a dedicated domain just looks bad.

          In summary, it’s definitely possible to build simple projects and host them for free. It also looks like my approach would scale way beyond the current BCRecommender volume. The next post in this series will cover some of the algorithms and general considerations of building the recommender system.


            \ No newline at end of file diff --git a/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/index.html b/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/index.html index fc758aec6..f56602118 100644 --- a/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/index.html +++ b/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/index.html @@ -1,5 +1,5 @@ Bandcamp recommendation and discovery algorithms | Yanir Seroussi | Data & AI for Startup Impact -

            Bandcamp recommendation and discovery algorithms

            This is the third part of a series of posts on my Bandcamp recommendations (BCRecommender) project. Check out the first part for the general motivation behind this project and the second part for the system architecture.

The main goal of the BCRecommender project is to help me find music I like. This post discusses the algorithmic approaches I took towards that goal. I’ve kept the descriptions at a fairly high level, without getting too much into the maths, as all recommendation algorithms essentially try to model simple intuition. Please leave a comment if you feel like something needs to be explained further.

            Data & evaluation approach

            The data was collected from publicly-indexable Bandcamp fan and track/album (aka tralbum) pages. For each fan, it consists of the tralbum IDs they bought or wishlisted. For each tralbum, the saved data includes the type (track/album), URL, title, artist name, and the tags (as assigned by the artist).

            At the moment, I have data for about 160K fans, 335K albums and 170K tracks. These fans have expressed their preference for tralbums through purchasing or wishlisting about 3.4M times. There are about 210K unique tags across the 505K tralbums, with the mean number of tags per tralbum being 7. These figures represent a fairly sparse dataset, which makes recommendation somewhat challenging. Perhaps this is why Bandcamp doesn’t do much algorithmic recommendation.

            Before moving on to describe the recommendation approaches I played with, it is worth noting that at this stage, my way of evaluating the recommendations isn’t very rigorous. If I can easily find new music that I like, I’m happy. As such, offline evaluation approaches (e.g., some form of cross-validation) are unlikely to correlate well with my goal, so I just didn’t bother with them. Having more data would allow me to perform more rigorous online evaluation to see what makes other people happy with the recommendations.

            Personalised recommendations with preferences (collaborative filtering)

My first crack at recommendation generation was using collaborative filtering. The broad idea behind collaborative filtering is to use only the preference matrix to find patterns in the data, and generate recommendations accordingly. The preference matrix is defined to have a row for each user and a column for each item. Each matrix element value indicates the level of preference by the user for the item. To keep things simple, I used unary preference values, where the element that corresponds to user/fan u and item/tralbum i is set to 1 if the fan purchased or wishlisted the tralbum, and to missing otherwise.
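In code, such a unary preference matrix is naturally stored as a sparse matrix. Here is a toy sketch of how it can be built (not the actual BCRecommender code):

```python
# Build a unary, sparse fan-by-tralbum preference matrix from (fan, tralbum) pairs.
import numpy as np
from scipy.sparse import csr_matrix

fans = ["fan_a", "fan_b", "fan_c"]
tralbums = ["t1", "t2", "t3", "t4"]
preferences = [("fan_a", "t1"), ("fan_a", "t3"), ("fan_b", "t3"), ("fan_c", "t2")]

fan_index = {f: i for i, f in enumerate(fans)}
tralbum_index = {t: j for j, t in enumerate(tralbums)}
rows = [fan_index[f] for f, _ in preferences]
cols = [tralbum_index[t] for _, t in preferences]
data = np.ones(len(preferences))  # unary: 1 = purchased or wishlisted, missing otherwise

pref_matrix = csr_matrix((data, (rows, cols)), shape=(len(fans), len(tralbums)))
print(pref_matrix.toarray())
```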

            A simple example for collaborative filtering is in the following image, which was taken from the Wikipedia article on the topic.

            A simple collaborative filtering example

            I used matrix factorisation as the collaborative filtering algorithm. This algorithm was a key part of the winning team’s solution to the Netflix competition. Unsurprisingly, it didn’t work that well. The key issue is that there are 160K * (335K + 170K) = 80.8B possible preferences in the dataset, but only 3.4M (0.004%) preferences are given. What matrix factorisation tries to do is to predict the remaining 99.996% of preferences based on the tiny percentage of given data. This just didn’t yield any music recommendations I liked, even when I made the matrix denser by dropping fans and tralbums with few preferences. Therefore, I moved on to employing an algorithm that can use more data – the tags.
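The gist of matrix factorisation is to learn a low-dimensional vector for each fan and each tralbum such that their dot products approximate the observed preferences. The following toy sketch in plain NumPy shows the idea – it is deliberately simplified, and real implementations of implicit-feedback factorisation add negative sampling or confidence weights:

```python
# Toy matrix factorisation by SGD on observed (fan, tralbum) preferences.
# With unary data and no negative sampling this drifts towards predicting 1
# everywhere -- which echoes why the approach struggled on such sparse data.
import numpy as np

rng = np.random.default_rng(0)
n_fans, n_tralbums, n_factors = 5, 8, 3
observed = [(0, 1), (0, 4), (1, 4), (2, 2), (3, 7), (4, 0)]   # preference = 1

P = rng.normal(scale=0.1, size=(n_fans, n_factors))           # fan factors
Q = rng.normal(scale=0.1, size=(n_tralbums, n_factors))       # tralbum factors

lr, reg = 0.05, 0.01
for _ in range(200):
    for u, i in observed:
        pu, qi = P[u].copy(), Q[i].copy()
        err = 1.0 - pu @ qi                 # target preference is 1
        P[u] += lr * (err * qi - reg * pu)
        Q[i] += lr * (err * pu - reg * qi)

scores = P @ Q.T                            # predicted preference for every fan/tralbum pair
print(np.round(scores[0], 2))               # fan 0's scores across all tralbums
```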

            Personalised recommendations with tags and preferences (collaborative filtering and content-based hybrid)

            Using data about the items is referred to as content-based recommendation in the literature. In the Bandcamp recommender case, the content data that is most easy to use is the tags that artists assign to their work. The idea is to build a profile for each fan based on tags for their tralbums, and recommend tralbums with tags that match the fan’s profile.

            As mentioned above, the dataset contains 210K unique tags for 505K tralbums, which means that this representation of the dataset is also rather sparse. One obvious way of making it denser is by dropping rare tags. I also “tagged” each tralbum with a fan’s username if that fan purchased or wishlisted the tralbum. In addition to yielding a richer tralbum representation, this approach makes the recommendations likely to be less obvious than those based only on tags. For example, all tralbums tagged with rock are likely to be rock albums, but tralbums tagged with yanir are somewhat more varied.

            To make the tralbum representation denser I used the latent Dirichlet allocation (LDA) implementation from the excellent gensim library. LDA assumes that there’s a fixed number of topics (distributions over tags, i.e., weighted lists of tags), and that every tralbum’s tags are generated from its topics. In practice, this magically yields clusters of tags and tralbums that can be used to generate recommendations. For example, the following word cloud presents the top tags in one cluster, which is focused on psychedelic-progressive rock. Each tralbum is assigned a probability of being generated from this cluster. This means that each tralbum is now represented as a probability distribution over a fixed number of topics – much denser than the raw tag data.
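Here is a small sketch of that step with gensim (toy documents only – the real model used far more tralbums, tags, and topics):

```python
# Each tralbum "document" is its artist-assigned tags plus the usernames of fans
# who bought or wishlisted it; LDA then yields a dense topic distribution per tralbum.
from gensim import corpora, models

tralbum_docs = [
    ["rock", "psychedelic", "progressive", "fan_yanir"],
    ["rock", "stoner", "fan_yanir", "fan_abc"],
    ["ambient", "drone", "fan_xyz"],
    ["ambient", "electronic", "fan_xyz", "fan_abc"],
]

dictionary = corpora.Dictionary(tralbum_docs)
corpus = [dictionary.doc2bow(doc) for doc in tralbum_docs]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, passes=50, random_state=0)

for doc in corpus:
    # A (topic, probability) pair per topic -- much denser than the raw tag data.
    print(lda.get_document_topics(doc, minimum_probability=0.0))
```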


              \ No newline at end of file diff --git a/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/index.html b/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/index.html index 6247b6c59..bc694243a 100644 --- a/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/index.html +++ b/2014/09/24/applying-the-traction-books-bullseye-framework-to-bcrecommender/index.html @@ -1,9 +1,11 @@ Applying the Traction Book’s Bullseye framework to BCRecommender | Yanir Seroussi | Data & AI for Startup Impact -

              Applying the Traction Book’s Bullseye framework to BCRecommender

              This is the fourth part of a series of posts on my Bandcamp recommendations (BCRecommender) project. Check out previous posts on the general motivation behind this project, the system's architecture, and the recommendation algorithms.

              Having used BCRecommender to find music I like, I’m certain that other Bandcamp fans would like it too. It could probably be extended to attract a wider audience of music lovers, but for now, just getting feedback from Bandcamp fans would be enough. There are about 200,000 fans that I know of – getting even a fraction of them to use and comment on BCRecommender would serve as a good guide to what’s worth building and improving.

              In addition to getting feedback, the personal value for me in getting BCRecommender users is learning some general lessons on traction building. Like many technical people, I like building products and playing with data, but I don’t really enjoy sales and marketing (and that’s an understatement). One of my goals in working independently is forcing myself to get better at the things I’m not good at. To that end, I recently started reading Traction: A Startup Guide to Getting Customers by Gabriel Weinberg and Justin Mares.

              The Traction book identifies 19 different channels for getting traction, and suggests a simple framework (named Bullseye) to ranking and quickly exploring the channels. They explain that many technical founders tend to focus on traction channels they’re familiar with, and that the effort invested in those channels tends to be rather small compared to the investment in building the product. The authors rightly note that “Almost every failed startup has a product. What failed startups don’t have is traction – real customer growth.” They argue that following a rigorous approach to gaining traction via their framework is likely to improve a startup’s chances of success. From personal experience, this is very likely to be true.

              The key steps in the Bullseye framework are brainstorming ideas for each traction channel, ranking the channels into tiers, prioritising the most promising ones, testing them, and focusing on the channels that work. This is not a one-off process – channel suitability changes over time, and one needs to go through the process repeatedly as the product evolves and traction grows.

              Here are the traction channels, ordered in the same order as in the book. Each traction channel is marked with a letter denoting its ranking tier from A (most appropriate) to C (unsuitable right now). A short explanation is provided for each channel.

              • [B] viral marketing: everyone wants to go viral, but at the moment I don’t have a good-enough understanding of my target audience to seriously pursue this channel.
              • [C] public relations (PR): I don’t think that PR would give me access to the kind of focused user group I need at this phase.
              • [C] unconventional PR: same as conventional PR.
              • [C] search engine marketing (SEM): may work, but I don’t want to spend money at this stage.
              • [C] social and display ads: see SEM.
              • [C] offline ads: see SEM.
              • [A] search engine optimization (SEO): this channel seems promising, as ranking highly for queries such as “bandcamp recommendations” should drive quality traffic that is likely to convert (i.e., play recommendations and sign up for updates). It doesn’t seem like “bandcamp recommendations” is a very competitive query, so it’s definitely worth doing some SEO work.
              • [A] content marketing: I think that there’s definitely potential in this channel, since I have a lot of data that can be explored and presented in interesting ways. The problem is creating content that is compelling enough to attract people. I started playing with this channel via the Spotlights feature, but it’s not good enough yet.
              • [B] email marketing: BCRecommender already has the subscription feature for retention. At this stage, this doesn’t seem like a viable acquisition channel.
              • [B] engineering as marketing: this channel sounds promising, but I don’t have good ideas for it at the moment. This may change soon, as I’m currently reading this chapter.
              • [A] targeting blogs: this approach should work for getting high-quality feedback, and help SEO as well.
              • [C] business development: there may be some promising ideas in this channel, but only worth pursuing later.
              • [C] sales: not much to sell.
              • [C] affiliate programs: I’m not going to pay affiliates as I’m not making any money.
              • [B] existing platforms: in a way, I’m already building on top of the existing Bandcamp platform. One way of utilising it for growth is by getting fans to link to BCRecommender when it leads to sales (as I’ve done on my fan page), but that would be more feasible at a later stage with more active users.
              • [C] trade shows: I find it hard to think of trade shows where there are many Bandcamp fans.
              • [C] offline events: probably easier than trade shows (think concerts/indie events), but doesn’t seem worth pursuing at this stage.
              • [C] speaking engagements: similar to offline events. I do speaking engagements, and I’m actually going to mention BCRecommender as a case study at my workshop this week, but the intersection between Bandcamp fans and people interested in data science seems rather small.
              • [C] community building: this may be possible later on, when there is a core group of loyal users. However, some aspects of community building are provided by Bandcamp and I don’t want to compete with them.

              Cool, writing everything up explicitly was actually helpful! The next step is to test the three channels that ranked the highest: SEO, content marketing and targeting blogs. I will report the results in future posts.


                \ No newline at end of file diff --git a/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html b/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html index 5c5f8bd71..50b02583e 100644 --- a/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html +++ b/2014/10/07/greek-media-monitoring-kaggle-competition-my-approach/index.html @@ -1,5 +1,5 @@ Greek Media Monitoring Kaggle competition: My approach | Yanir Seroussi | Data & AI for Startup Impact -

                Greek Media Monitoring Kaggle competition: My approach

A few months ago I participated in the Kaggle Greek Media Monitoring competition. The goal of the competition was multilabel classification of texts scanned from Greek print media. Despite not having much time due to travelling and other commitments, I managed to finish 6th (out of 120 teams). This post describes my approach to the problem.

                Data & evaluation

The data consists of articles scanned from Greek print media in May-September 2013. Due to copyright issues, the organisers didn’t make the original articles available – competitors only had access to normalised tf-idf representations of the texts. This limited the options for feature engineering and made it impossible to consider things like word order, but it also made things somewhat simpler: with no way to extract interesting new features, the focus was squarely on modelling.

                Overall, there are about 65K texts in the training set and 35K in the test set, where the split is based on chronological ordering (i.e., the training articles were published before the test articles). Each article was manually labelled with one or more labels out of a set of 203 labels. For each test article, the goal is to infer its set of labels. Submissions were ranked using the mean F1 score.
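As a quick illustration of the metric: the mean F1 score computes an F1 score per document and averages over documents, which is what scikit-learn’s “samples” averaging does on multilabel indicator matrices:

```python
# Mean F1 over documents (the competition metric) on toy multilabel predictions.
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0, 1, 0],    # rows = documents, columns = labels
                   [0, 1, 0, 0],
                   [1, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 1]])

print(f1_score(y_true, y_pred, average="samples"))  # F1 per document, then averaged
```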

                Despite being manually annotated, the data isn’t very clean. Issues include identical texts that have different labels, empty articles, and articles with very few words. For example, the training set includes ten “articles” with a single word. Five of these articles have the word 68839, but each of these five was given a different label. Such issues are not unusual in Kaggle competitions or in real life, but they do limit the general usefulness of the results since any model built on this data would fit some noise.

                Local validation setup

                As mentioned in previous posts (How to (almost) win Kaggle competitions and Kaggle beginner tips) having a solid local validation setup is very important. It ensures you don’t waste time on weak submissions, increases confidence in the models, and avoids leaking information about how well you’re doing.

                I used the first 35K training texts for local training and the following 30K texts for validation. While the article publication dates weren’t provided, I hoped that this would mimic the competition setup, where the test dataset consists of articles that were published after the articles in the training dataset. This seemed to work, as my local results were consistent with the leaderboard results. I’m pleased to report that this setup allowed me to have the lowest number of submissions of all the top-10 teams 🙂

                Things that worked

                I originally wanted to use this competition to play with deep learning through Python packages such as Theano and PyLearn2. However, as this was the first time I worked on a multilabel classification problem, I got sucked into reading a lot of papers on the topic and never got around to doing deep learning. Maybe next time…

One of my key discoveries was that if you define a graph where the vertices are labels and there’s an edge between two labels if they appear together in a document’s label set, then there are two main connected components of labels and several small ones with single labels (see figure below). It is possible to train a linear classifier that distinguishes between the components with very high accuracy (over 99%). This allowed me to improve performance by training different classifiers on each connected component.
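Here is a small sketch of that construction with networkx (toy label sets; the real graph had 203 labels):

```python
# Build the label co-occurrence graph and extract its connected components.
from itertools import combinations
import networkx as nx

doc_label_sets = [{"politics", "economy"}, {"economy", "markets"},
                  {"sports", "football"}, {"weather"}]

graph = nx.Graph()
for labels in doc_label_sets:
    graph.add_nodes_from(labels)
    graph.add_edges_from(combinations(labels, 2))

print(list(nx.connected_components(graph)))
# On the competition data this yields two large components plus small singletons,
# so a (near-perfect) linear classifier routes each document to a component, and a
# separate multilabel classifier is trained per component.
```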


                  \ No newline at end of file diff --git a/2014/10/23/what-is-data-science/index.html b/2014/10/23/what-is-data-science/index.html index 65d418413..cc09994c1 100644 --- a/2014/10/23/what-is-data-science/index.html +++ b/2014/10/23/what-is-data-science/index.html @@ -1,5 +1,5 @@ What is data science? | Yanir Seroussi | Data & AI for Startup Impact -

                  What is data science?

                  Data science has been a hot term in the past few years. Despite this fact (or perhaps because of it), it still seems like there isn't a single unifying definition of data science. This post discusses my favourite definition.

                  Data Scientist (n.): Person who is better at statistics than any software engineer and better at software engineering than any statistician.

                  — Josh Wills (@josh_wills) May 3, 2012

                  One of my reasons for doing a PhD was wanting to do something more interesting than “vanilla” software engineering. When I was in the final stages of my PhD, I started going to meetups to see what’s changed in the world outside academia. Back then, I defined myself as a “software engineer with a research background”, which didn’t mean much to most people. My first post-PhD job ended up being a data scientist at a small startup. As soon as I changed my LinkedIn title to Data Scientist, many offers started flowing. This is probably the reason why so many people call themselves data scientists these days, often diluting the term to a point where it’s so broad it becomes meaningless. This post presents my preferred data science definitions and my opinions on who should or shouldn’t call themselves a data scientist.

                  Defining data science

                  I really like the definition quoted above, of data science as the intersection of software engineering and statistics. Ofer Mendelevitch goes into more detail, drawing a continuum of professions that ranges from software engineer on the left to pure statistician (or machine learning researcher) on the right.


                    \ No newline at end of file diff --git a/2014/11/05/bcrecommender-traction-update/index.html b/2014/11/05/bcrecommender-traction-update/index.html index 20c14b7e2..6bc4721c9 100644 --- a/2014/11/05/bcrecommender-traction-update/index.html +++ b/2014/11/05/bcrecommender-traction-update/index.html @@ -1,9 +1,11 @@ BCRecommender Traction Update | Yanir Seroussi | Data & AI for Startup Impact -

                    BCRecommender Traction Update

                    This is the fifth part of a series of posts on my Bandcamp recommendations (BCRecommender) project. Check out previous posts on the general motivation behind this project, the system’s architecture, the recommendation algorithms, and initial traction planning.

                    In a previous post, I discussed my plans to apply the Bullseye framework from the Traction Book to BCRecommender, my Bandcamp recommendations project. In that post, I reviewed the 19 traction channels described in the book, and decided to focus on the three most promising ones: blogger outreach, search engine optimisation (SEO), and content marketing. This post discusses my progress to date.

                    Goals

My initial traction goals were rather modest: get some feedback from real people, build up steady nonzero traffic to the site, and then increase that traffic to 10+ unique visitors per day. It’s worth noting that I have four other main areas of focus at the moment, so BCRecommender is not getting all the attention I could potentially give it. Nonetheless, I have made good progress towards my goals (the first two have been achieved, though traffic still fluctuates), and learnt a lot in the process.

                    Things that worked

                    Blogger outreach. The most obvious people to contact are existing Bandcamp fans. It was straightforward to generate a list of prolific fans with blogs, as Bandcamp allows people to populate their profile with a short bio and links to their sites. I worked my way through part of the list, sending each fan an email introducing BCRecommender and asking for their feedback. Each email required some manual work, as the vast majority of people don’t have their email address listed on their Bandcamp profile page. I was careful not to be too spammy, which seemed to work: about 50% of the people I contacted visited BCRecommender, 20% responded with positive feedback, and 10% linked to BCRecommender in some form, with the largest volume of traffic coming from my Hypebot guest post. The problem with this approach is that it doesn’t scale, but the most valuable thing I got out of it was that people like the project and that there’s a real need for it.

                    Twitter. I’m not sure where Twitter falls as a traction channel. It’s probably somewhere between (micro)blogger outreach and content marketing. However you categorise Twitter, it has been working well as a source of traffic. Simply finding people who may be interested in BCRecommender and tweeting related content has proven to be a rather low-effort way of getting attention, which is great at this stage. I have a few ideas for driving more traffic from Twitter, which I will try as I go.

                    Things that didn’t work

                    Content marketing. I haven’t really spent time doing serious content marketing apart from the Spotlights pilot. My vision for the spotlights was to generate quality articles automatically and showcase music on Bandcamp in an engaging way that helps people discover new artists, even if they don’t have a fan account. However, full automation of the spotlight feature would require a lot of work, and I think that there are lower-hanging fruits that I should focus on first. For example, finding interesting insights in the data and presenting them in an engaging way may be a better content strategy, as it would be unique to BCRecommender. For the spotlights, partnering with bloggers to write the articles may be a better approach than automation.

                    SEO. I expected BCRecommender to rank higher for “bandcamp recommendations” by now, as a result of my blogger outreach efforts. At the moment, it’s still on the second page for this query on Google, though it’s the first result on Bing and DuckDuckGo. Obviously, “bandcamp recommendations” is not the only query worth ranking for, but it’s very relevant to BCRecommender, and not too competitive (half of the first page results are old forum posts). One encouraging outcome from the work done so far is that my Hypebot guest post does appear on the first page. Nonetheless, I’m still interested in getting more search engine traffic. Ranking higher would probably require adding more relevant content on the site and getting more quality links (basically what SEO is all about).

                    Points to improve and next steps

                    I could definitely do better work on all of the above channels. Contrary to what’s suggested by the Bullseye framework, I would like to put more effort into the channels that didn’t work well. The reason is that I think they didn’t work well because of lack of attention and weak experiments, rather than due to their unsuitability to BCRecommender.

                    As mentioned above, my main limiting factor is a lack of time to spend on the project. However, there’s no pressing need to hit certain traction milestones by a specific deadline. My stretch goals are to get all Bandcamp fans to check out the project (hundreds of thousands of people), and have a significant portion of them convert by signing up to updates (tens of thousands of people). Getting there will take time. So far I’m finding the process educational and enjoyable, which is a pleasant surprise.


                      \ No newline at end of file diff --git a/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html b/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html index 5ed4e3cf8..bcf819aeb 100644 --- a/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html +++ b/2014/11/19/fitting-noise-forecasting-the-sale-price-of-bulldozers-kaggle-competition-summary/index.html @@ -1,5 +1,5 @@ Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary) | Yanir Seroussi | Data & AI for Startup Impact -

                      Fitting noise: Forecasting the sale price of bulldozers (Kaggle competition summary)

                      Messy data, buggy software, but all in all a good learning experience...

Early last year, I had some free time on my hands, so I decided to participate in yet another Kaggle competition. Having never done any price forecasting work before, I thought it would be interesting to work on the Blue Book for Bulldozers competition, where the goal was to predict the sale price of auctioned bulldozers. I did alright, finishing 9th out of 476 teams. And the experience did turn out to be interesting, but not for the reasons I expected.

                      Data and evaluation

                      The competition dataset consists of about 425K historical records of bulldozer sales. The training subset consists of sales from the 1990s through to the end of 2011, with the validation and testing periods being January-April 2012 and May-November 2012 respectively. The goal is to predict the sale price of each bulldozer, given the sale date and venue, and the bulldozer’s features (e.g., model ID, mechanical specifications, and machine-specific data such as machine ID and manufacturing year). Submissions were scored using the RMSLE measure.
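For reference, RMSLE is just RMSE computed on log-transformed prices, which is why it penalises relative rather than absolute errors – and one reason it often makes sense to model the log of the price directly:

```python
# Root mean squared log error (the competition metric).
import numpy as np

def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

print(rmsle(np.array([50000, 80000]), np.array([55000, 60000])))
```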

                      Early in the competition (before I joined), there were many posts in the forum regarding issues with the data. The organisers responded by posting an appendix to the data, which included the “correct” information. From people’s posts after the competition ended, it seems like using the “correct” data consistently made the results worse. Luckily, I discovered this about a week before the competition ended. Reducing my reliance on the appendix made a huge difference in the performance of my models. This discovery was thanks to a forum post, which illustrates the general point on the importance of monitoring the forum in Kaggle competitions.

                      My approach: feature engineering, data splitting, and stochastic gradient boosting

                      Having read the forum discussions on data quality, I assumed that spending time on data cleanup and feature engineering would give me an edge over competitors who focused only on data modelling. It’s well-known that simple models fitted on more/better data tend to yield better results than complex models fitted on less/messy data (aka GIGO – garbage in, garbage out). However, doing data cleaning and feature engineering is less glamorous than building sophisticated models, which is why many people avoid the former.

Sadly, the data was incredibly messy, so most of my cleanup efforts resulted in no improvements. Even intuitive modifications yielded poor results, like transforming each bulldozer’s manufacturing year into its age at the time of sale. Essentially, to do well in this competition, one had to fit the noise rather than remove it. This was rather disappointing, as one of the nice things about Kaggle competitions is being able to work on relatively clean data. Anomalies in the data included bulldozers that had been running for hundreds of years and machines that were sold years before they were manufactured (impossible for second-hand bulldozers!). It is obvious that Fast Iron (the company that sponsored the competition) would have obtained more usable models from this competition if they had spent more time cleaning up the data themselves.

                      Throughout the competition I went through several iterations of modelling and data cleaning. My final submission ended up being a linear combination of four models:

                      • Gradient boosting machine (GBM) regression on the full dataset
                      • A linear model on the full dataset
                      • An ensemble of GBMs, one for each product group (rationale: different product groups represent different bulldozer classes, like track excavators and motor graders, so their prices are not really comparable)
                      • A similar ensemble, where each product group and sale year has a separate GBM, and earlier years get lower weight than more recent years

                      I ended up discarding old training data (before 2000) and the machine IDs (another surprise: even though some machines were sold multiple times, this information was useless). For the GBMs, I treated categorical features as ordinal, which sort of makes sense for many of the features (e.g., model series values are ordered). For the linear model, I just coded them as binary indicators.
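To illustrate the two encodings, here is a small sketch using pandas; the column name and values are made up rather than taken from the competition data:

import pandas as pd

df = pd.DataFrame({"product_series": ["D3", "D5", "D4", "D4"]})  # hypothetical feature

# Ordinal coding for the GBMs: each category gets an integer code.
df["product_series_ordinal"] = df["product_series"].astype("category").cat.codes

# Binary indicators (one-hot) for the linear model.
indicators = pd.get_dummies(df["product_series"], prefix="product_series")
print(df.join(indicators))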

                      The most important discovery: stochastic gradient boosting bugs

                      This was the first time I used gradient boosting. Since I was using so many different models, it was hard to reliably tune the number of trees, so I figured I’d use stochastic gradient boosting and rely on out-of-bag (OOB) samples to set the number of trees. This led to me finding a bug in scikit-learn: the OOB scores were actually calculated on in-bag samples.

I reported the issue to the maintainers of scikit-learn and made an attempt at fixing it by skipping trees to obtain the OOB samples. This yielded better results than the buggy version. In some cases, I also replaced a plain GBM with an ensemble of four stochastic GBMs with a subsample ratio of 0.5 and a different random seed for each one, averaging their outputs.
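For concreteness, here is a sketch of such an ensemble with scikit-learn’s current API, on a synthetic dataset rather than the competition data:

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1000, random_state=0)  # stand-in for the real data

# Four stochastic GBMs that differ only in their random seed; outputs are averaged.
models = [
    GradientBoostingRegressor(n_estimators=200, subsample=0.5, random_state=seed).fit(X, y)
    for seed in range(4)
]
ensemble_prediction = np.mean([model.predict(X) for model in models], axis=0)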

                      This wasn’t enough to convince the maintainers of scikit-learn to accept the pull request with my fix, as they didn’t like my idea of skipping trees. This is for a good reason — obtaining better results on a single dataset should be insufficient to convince anyone. They ended up fixing the issue by copying the implementation from R’s GBM package, which is known to underestimate the number of required trees/boosting iterations (see Section 3.3 in the GBM guide).

                      Recently, I had some time to test my tree skipping idea on the toy dataset used in the scikit-learn documentation. As the following figure shows, a smoothed variant of my tree skipping idea (TSO in the figure) yields superior results to the scikit-learn/R approach (SKO in the figure). The actual loss doesn’t matter — what matters is where it’s minimised. In this case TSO obtains the closest approximation of the number of iterations to the value that minimises the test error, which is a promising result.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                        \ No newline at end of file diff --git a/2014/12/15/seo-mostly-about-showing-up/index.html b/2014/12/15/seo-mostly-about-showing-up/index.html index 47cad3ebd..c6e271754 100644 --- a/2014/12/15/seo-mostly-about-showing-up/index.html +++ b/2014/12/15/seo-mostly-about-showing-up/index.html @@ -1,9 +1,11 @@ SEO: Mostly about showing up? | Yanir Seroussi | Data & AI for Startup Impact -

                        SEO: Mostly about showing up?

                        In previous posts about getting traction for my Bandcamp recommendations project (BCRecommender), I mentioned search engine optimisation (SEO) as one of the promising traction channels. Unfortunately, early efforts yielded negligible traffic – most new visitors came from referrals from blogs and Twitter. It turns out that the problem was not showing up for the SEO game: most of BCRecommender’s pages were blocked for crawling via robots.txt because I was worried that search engines (=Google) would penalise the website for thin/duplicate content.

                        Recently, I beefed up most of the pages, created a sitemap, and removed most pages from robots.txt. This resulted in a significant increase in traffic, as illustrated by the above graph. The number of organic impressions went up from less than ten per day to over a thousand. This is expected to go up even further, as only about 10% of pages are indexed. In addition, some traffic went to my staging site because it wasn’t blocked from crawling (I had to set up a new staging site that is password-protected and add a redirect from the old site to the production site – a bit annoying but I couldn’t find a better solution).

                        I hope Google won’t suddenly decide that BCRecommender content is not valuable or too thin. The content is automatically generated, which is “bad”, but it doesn’t “consist of paragraphs of random text that make no sense to the reader but which may contain search keywords”. As a (completely unbiased) user, I think it is valuable to find similar albums when searching for an album you like – an example that represents the majority of people that click through to BCRecommender. Judging from the main engagement measure I’m using (time spent on site), a good number of these people are happy with what they find.

                        More updates to come in the future. For now, my conclusion is: thin content is better than no content, as long as it’s relevant to what people are searching for and provides real value.

Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                          \ No newline at end of file diff --git a/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/index.html b/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/index.html index d18592e5d..6a45a7894 100644 --- a/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/index.html +++ b/2014/12/29/stochastic-gradient-boosting-choosing-the-best-number-of-iterations/index.html @@ -1,5 +1,5 @@ Stochastic Gradient Boosting: Choosing the Best Number of Iterations | Yanir Seroussi | Data & AI for Startup Impact -

                          Stochastic Gradient Boosting: Choosing the Best Number of Iterations

                          In my summary of the Kaggle bulldozer price forecasting competition, I mentioned that part of my solution was based on stochastic gradient boosting. To reduce runtime, the number of boosting iterations was set by minimising the loss on the out-of-bag (OOB) samples, skipping trees where samples are in-bag. This approach was motivated by a bug in scikit-learn, where the OOB loss estimate was calculated on the in-bag samples, meaning that it always improved (and thus was useless for the purpose of setting the number of iterations).

                          The bug in scikit-learn was fixed by porting the solution used in R’s GBM package, where the number of iterations is estimated by minimising the improvement on the OOB samples in each boosting iteration. This approach is known to underestimate the number of required iterations, which means that it’s not very useful in practice. This underestimation may be due to the fact that the GBM method is partly estimated on in-bag samples, as the OOB samples for the Nth iteration are likely to have been in-bag in previous iterations.
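As background, this is roughly how the fixed scikit-learn version is used to pick the number of iterations: take the cumulative sum of the per-iteration OOB improvements and stop where it peaks. The sketch below uses a synthetic dataset (not necessarily the toy dataset from the docs) and shows the scikit-learn/R-style estimate, not my tree-skipping variant.

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, noise=1.0, random_state=1)

model = GradientBoostingRegressor(n_estimators=500, subsample=0.5, random_state=1)
model.fit(X, y)

# oob_improvement_[i] is the OOB loss reduction at iteration i (available when subsample < 1).
cumulative_oob_improvement = np.cumsum(model.oob_improvement_)
estimated_n_iterations = int(np.argmax(cumulative_oob_improvement)) + 1
print(f"OOB-estimated number of iterations: {estimated_n_iterations}")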

                          I was curious about how my approach compares to the GBM method. Preliminary results on the toy dataset from scikit-learn’s documentation looked promising:


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                            \ No newline at end of file diff --git a/2015/01/15/automating-parse-com-bulk-data-imports/index.html b/2015/01/15/automating-parse-com-bulk-data-imports/index.html index 543ece1c5..52db457f5 100644 --- a/2015/01/15/automating-parse-com-bulk-data-imports/index.html +++ b/2015/01/15/automating-parse-com-bulk-data-imports/index.html @@ -1,5 +1,5 @@ Automating Parse.com bulk data imports | Yanir Seroussi | Data & AI for Startup Impact -

                            Automating Parse.com bulk data imports

Parse is a great backend-as-a-service (BaaS) product. It removes much of the hassle involved in backend devops with its web hosting service, SDKs for all the major mobile platforms, and a generous free tier. Parse does have its share of flaws, including various reliability issues (which seem to be getting rarer), and limitations on what you can do (which is a reasonable price to pay for working within a sandboxed environment). One such limitation is the lack of APIs to perform bulk data imports. This post introduces my workaround for this limitation (tl;dr: it’s a PhantomJS script).

                            Update: The script no longer works due to changes to Parse’s website. I won’t be fixing it since I’ve migrated my projects off the platform. If you fix it, let me know and I’ll post a link to the updated script here.

                            I use Parse for two of my projects: BCRecommender and Price Dingo. In both cases, some of the data is generated outside Parse by a Python backend. Doing all the data processing within Parse is not a viable option, so a solution for importing this data into Parse is required.

                            My original solution for data import was using the Parse REST API via ParsePy. The problem with this solution is that Parse billing is done on a requests/second basis. The free tier includes 30 requests/second, so importing BCRecommender’s ~million objects takes about nine hours when operating at maximum capacity. However, operating at maximum capacity causes other client requests to be dropped (i.e., real users suffer). Hence, some sort of rate limiting is required, which makes the sync process take even longer.
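To make the request-budget issue concrete, here is a rough sketch of a rate-limited import loop against the old Parse REST API (the hosted api.parse.com endpoint no longer exists, and the credentials below are placeholders):

import time
import requests

APP_ID, REST_KEY = "your-app-id", "your-rest-api-key"  # placeholders
HEADERS = {"X-Parse-Application-Id": APP_ID, "X-Parse-REST-API-Key": REST_KEY}

def import_objects(objects, class_name, requests_per_second=10):
    """Create objects one by one, staying well under the free-tier request quota."""
    for obj in objects:
        response = requests.post(
            f"https://api.parse.com/1/classes/{class_name}", json=obj, headers=HEADERS
        )
        response.raise_for_status()
        time.sleep(1.0 / requests_per_second)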

                            I thought that using batch requests would speed up the process, but it actually slowed it down! This is because batch requests are billed according to the number of sub-requests, so making even one successful batch request per second with the maximum number of sub-requests (50) causes more requests to be dropped. I implemented some code to retry failed requests, but the whole process was just too brittle.

                            A few months ago I discovered that Parse supports bulk data import via the web interface (with no API support). This feature comes with the caveat that existing collections can’t be updated: a new collection must be created. This is actually a good thing, as it essentially makes the collections immutable. And immutability makes many things easier.

                            BCRecommender data gets updated once a month, so I was happy with manually importing the data via the web interface. As a price comparison engine, Price Dingo’s data changes more frequently, so manual updates are out of the question. For Price Dingo to be hosted on Parse, I had to find a way to automate bulk imports. Some people suggest emulating the requests made by the web interface, but this requires relying on hardcoded cookie and CSRF token data, which may change at any time. A more robust solution would be to scriptify the manual actions, but how? PhantomJS, that’s how.

                            I ended up implementing a PhantomJS script that logs in as the user and uploads a dump to a given collection. This script is available on GitHub Gist. To run it, simply install PhantomJS and run:

                            $ phantomjs --ssl-protocol any \
                                 import-parse-class.js <configFile> <dumpFile> <collectionName>
                             

                            See the script’s source for a detailed explanation of the command-line arguments.

                            It is worth noting that the script doesn’t do any post-upload verification on the collection. This is done by an extra bit of Python code that verifies that the collection has the expected number of objects, and tries to query the collection sorted by all the keys that are supposed to be indexed (for large collections, it takes Parse a while to index all the fields, which may result in timeouts). Once these conditions are fulfilled, the Parse hosting code is updated to point to the new collection. For security, I added a bot user that has access only to the Parse app that it needs to update. Unlike the root user, this bot user can’t delete the app. As the config file contains the bot’s password, it should be encrypted and stored in a safe place (like the Parse master key).
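The verification code isn’t part of the Gist, but the idea is simple. Here is a hedged sketch, assuming the old Parse REST API (where, as I recall, adding count=1 with limit=0 to a query returned only the object count):

import requests

def verify_collection(class_name, expected_count, indexed_keys, headers):
    """Check the object count, then confirm sorted queries on indexed keys don't time out."""
    url = f"https://api.parse.com/1/classes/{class_name}"
    count_response = requests.get(url, headers=headers, params={"count": 1, "limit": 0})
    count = count_response.json()["count"]
    assert count == expected_count, f"expected {expected_count} objects, found {count}"
    for key in indexed_keys:
        # Until Parse finishes indexing a field, an ordered query on it tends to time out.
        response = requests.get(url, headers=headers, params={"order": key, "limit": 1}, timeout=30)
        response.raise_for_status()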

That’s it! I hope that other people will find this solution useful. Any suggestions/comments/issues are very welcome.


                            Image source: Parse Blog.

                            Subscribe

                              Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                              Hi, very nice trick! Trying to implement this as we speak, does this code still work? I get to the collections page, but I don’t think the upload is working. I’m new to Phantomjs. Thanks!
Hi Walter! Yeah, the code stopped working when Parse redesigned their website. I never fixed it because I ended up porting my projects away from Parse. If you fix it let me know and I’ll update this post. By the way, you may find it easier to use Selenium (or something similar) as a wrapper around PhantomJS, as it should result in cleaner code. For example, check out Python’s Selenium bindings: http://selenium.googlecode.com/svn/trunk/docs/api/py/index.html

                              This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

                              \ No newline at end of file diff --git a/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html b/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html index da9b27fe3..969888b11 100644 --- a/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html +++ b/2015/01/29/is-thinking-like-a-search-engine-possible-yandex-search-personalisation-kaggle-competition-summary-part-1/index.html @@ -1,12 +1,14 @@ Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1) | Yanir Seroussi | Data & AI for Startup Impact -

                              Is thinking like a search engine possible? (Yandex search personalisation – Kaggle competition summary – part 1)

                              About a year ago, I participated in the Yandex search personalisation Kaggle competition. I started off as a solo competitor, and then added a few Kaggle newbies to the team as part of a program I was running for the Sydney Data Science Meetup. My team hasn’t done too badly, finishing 9th out of 194 teams. As is usually the case with Kaggle competitions, the most valuable part was the lessons learned from the experience. In this case, the lessons go beyond the usual data science skills, and include some insights that are relevant to search engine optimisation (SEO) and privacy. This post describes the competition setup and covers the more general insights. A follow-up post will cover the technical side of our approach.

                              The data

                              Yandex is the leading search engine in Russia. For the competition, they supplied a dataset that consists of log data of search activity from a single large city, which represents one month of search activity (excluding popular queries). In total, the dataset contains about 21M unique queries, 700M unique urls, 6M unique users, and 35M search sessions. This is a relatively-big dataset for a Kaggle competition (the training file is about 16GB uncompressed), but it’s really rather small in comparison to Yandex’s overall search volume and tiny compared to what Google handles.

                              The data was anonymised, so a sample looks like this (see full description of the data format – the example and its description are taken from there):

                              744899 M 23 123123123
                               744899 0 Q 0 192902 4857,3847,2939 632428,2384 309585,28374 319567,38724 6547,28744 20264,2332 3094446,34535 90,21 841,231 8344,2342 119571,45767
                               744899 1403 C 0 632428
                               

                              These records describe the session (SessionID = 744899) of the user with USERID 123123123, performed on the 23rd day of the dataset. The user submitted the query with QUERYID 192902, which contains terms with TermIDs 4857,3847,2939. The URL with URLID 632428 placed on the domain DomainID 2384 is the top result on the corresponding SERP. 1403 units of time after beginning of the session the user clicked on the result with URLID 632428 (ranked first in the list).

                              While this may seem daunting at first, the data is actually quite simple. For each search session, we know the user, the queries they’ve made, which URLs and domains were returned in the SERP (search engine result page), which results they’ve clicked, and at what point in time the queries and clicks happened.
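A minimal parser for the three record types, based only on the sample format above (the field separators and positions are assumptions drawn from that description):

def parse_log_record(line):
    """Turn one log record into a dict: session metadata (M), query (Q), or click (C)."""
    fields = line.split()
    if fields[1] == "M":
        # SessionID M Day UserID
        return {"type": "M", "session": int(fields[0]), "day": int(fields[2]), "user": int(fields[3])}
    if fields[2] == "Q":
        # SessionID Time Q SERPID QueryID Terms URL,Domain URL,Domain ...
        return {
            "type": "Q", "session": int(fields[0]), "time": int(fields[1]), "serp": int(fields[3]),
            "query": int(fields[4]), "terms": [int(t) for t in fields[5].split(",")],
            "results": [tuple(int(x) for x in pair.split(",")) for pair in fields[6:]],
        }
    # SessionID Time C SERPID URLID
    return {"type": "C", "session": int(fields[0]), "time": int(fields[1]),
            "serp": int(fields[3]), "url": int(fields[4])}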

                              Goal and evaluation

                              The goal of the competition is to rerank the results in each SERP such that the highest-ranking documents are those that the user would find most relevant. As the name of the competition suggests, personalising the results is key, but non-personalised approaches were also welcome (and actually worked quite well).

                              One question that arises is how to tell from this data which results the user finds relevant. In this competition, the results were labelled as either irrelevant (0), relevant (1), or highly relevant (2). Relevance is a function of clicks and dwell time, where dwell time is the time spent on the result (determined by the time that passed until the next query or click). Irrelevant results are ones that weren’t clicked, or those for which the dwell time is less than 50 (the time unit is left unspecified). Relevant results are those that were clicked and have dwell time of 50 to 399. Highly relevant results have dwell time of at least 400, or were clicked as the last action in the session (i.e., it is assumed the user finished the session satisfied with the results rather than left because they couldn’t find what they were looking for).

                              This approach to determining relevance has some obvious flaws, but it apparently correlates well with actual user satisfaction with search results.
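The labelling rules translate directly into code; here is a small sketch (the argument names are mine, not the organisers’):

def relevance_label(clicked, dwell_time, is_last_action_in_session):
    """Map a result's click and dwell time to the 0/1/2 relevance grades defined above."""
    if not clicked:
        return 0
    if is_last_action_in_session or dwell_time >= 400:
        return 2
    return 1 if dwell_time >= 50 else 0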

                              Given the above definition of relevance, one can quantify how well a reranking method improves the relevance of the results. For this competition, the organisers chose the normalised discounted cumulative gain (NDCG) measure, which is a fancy name for a measure that, in the words of Wikipedia, encodes the assumptions that:

                              • Highly relevant documents are more useful when appearing earlier in a search engine result list (have higher ranks)
                              • Highly relevant documents are more useful than marginally relevant documents, which are in turn more useful than irrelevant documents.
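For concreteness, here is one common formulation of DCG/NDCG in a few lines of Python; the exact gain and discount convention used by the organisers is an assumption here.

import numpy as np

def dcg(relevances):
    """Discounted cumulative gain for a ranked list of relevance grades (2^rel gain, log2 discount)."""
    relevances = np.asarray(relevances, dtype=float)
    discounts = np.log2(np.arange(2, len(relevances) + 2))
    return float(np.sum((2 ** relevances - 1) / discounts))

def ndcg(ranked_relevances):
    """DCG normalised by the DCG of the ideal ordering of the same grades."""
    ideal = dcg(sorted(ranked_relevances, reverse=True))
    return dcg(ranked_relevances) / ideal if ideal > 0 else 1.0

print(ndcg([0, 2, 1, 0]))  # moving the highly relevant result to the top would push this towards 1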

                              SEO insights and other thoughts

A key insight that is relevant to SEO and privacy is that even without considering browser-based tracking and tools like Google Analytics (which may or may not be used by Google to rerank search results), search engines can infer a lot about user behaviour on other sites, just based on user interaction with the SERP. So if your users bounce quickly because your website is slow to load or ranks highly for irrelevant queries, the search engine can know that, and will probably penalise you accordingly.

                              This works both ways, though, and is evident even on search engines that don’t track personal information. Just try searching for “f” or “fa” or “fac” using DuckDuckGo, Google, Bing, Yahoo, or even Yandex. Facebook will be one of the top results (most often the first one), probably just because people tend to search for or visit Facebook after searching for one of those terms by mistake. So if your website ranks poorly for a term for which it should rank well, and your users behave accordingly (because, for example, they’re searching for your website specifically), you may magically end up with better ranking without any changes to inbound links or to your site.

                              Another thing that is demonstrated by this competition’s dataset is just how much data search engines consider when determining ranking. The dataset is just a sample of logs for one city for one month. I don’t like throwing the words “big data” around, but the full volume of data is pretty big. Too big for anyone to grasp and fully understand how exactly search engines work, and this includes the people who build them. What’s worth keeping in mind is that for all major search engines, the user is the product that they sell to advertisers, so keeping the users happy is key. Any changes made to the underlying algorithms are usually done with the end-user in mind, because not making such changes may kill the search engine (remember AltaVista?). Further, personalisation means that different users see different results for the same query. So my feeling is that it’s somewhat futile to do any SEO beyond making the website understandable by search engines, acquiring legitimate links, and just building a website that people would want to visit.

                              Next steps

                              With those thoughts out of the way, it’s time to describe the way we addressed the challenge. This is covered in the next post, Learning to rank for personalised search.

                              Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                \ No newline at end of file diff --git a/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html b/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html index 68894f128..7ee761364 100644 --- a/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html +++ b/2015/02/11/learning-to-rank-for-personalised-search-yandex-search-personalisation-kaggle-competition-summary-part-2/index.html @@ -1,11 +1,13 @@ Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2) | Yanir Seroussi | Data & AI for Startup Impact -

                                Learning to rank for personalised search (Yandex Search Personalisation – Kaggle Competition Summary – Part 2)

                                This is the second and last post summarising my team’s solution for the Yandex search personalisation Kaggle competition. See the first post for a summary of the dataset, evaluation approach, and some thoughts about search engine optimisation and privacy. This post discusses the algorithms and features we used.

                                To quickly recap the first post, Yandex released a 16GB dataset of query & click logs. The goal of the competition was to use this data to rerank query results such that the more relevant results appear before less relevant results. Relevance is determined by time spent on each clicked result (non-clicked results are deemed irrelevant), and overall performance is scored using the normalised discounted cumulative gain (NDCG) measure. No data about the content of sites or queries was given – each query in the dataset is a list of token IDs and each result is a (url ID, domain ID) pair.

                                First steps: memory-based heuristics

                                My initial approach wasn’t very exciting: it involved iterating through the data, summarising it in one way or another, and assigning new relevance scores to each (user, session, query) combination. In this early stage I also implemented an offline validation framework, which is an important part of every Kaggle competition: in this case I simply set aside the last three days of data for local testing, because the test dataset that was used for the leaderboard consisted of three days of log data.
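The split itself is trivial. Here is a sketch, assuming each session record carries the day on which it was logged (as in the parser from the previous post):

def split_by_day(sessions, last_training_day):
    """Hold out the final days of the log so local validation mimics the leaderboard's test period."""
    train = [s for s in sessions if s["day"] <= last_training_day]
    validation = [s for s in sessions if s["day"] > last_training_day]
    return train, validation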

                                Somewhat surprisingly, my heuristics worked quite well and put me in a top-10 position on the leaderboard. It seems like the barrier of entry for this competition was higher than for other Kaggle competitions due to the size of the data and the fact that it wasn’t given as preprocessed feature vectors. This was evident from questions on the forum, where people noted that they were having trouble downloading and looking at the data.

                                The heuristic models that worked well included:

                                • Reranking based on mean relevance (this just swapped positions 9 & 10, probably because users are more likely to click the last result)
                                • Reranking based on mean relevance for (query, url) and (query, domain) pairs (non-personalised improvements)
                                • Downranking urls observed previously in a session

                                Each one of the heuristic models was set to output relevance scores. The models were then ensembled by simply summing the relevance scores.

                                Then, I started playing with a collaborative-filtering-inspired matrix factorisation model for predicting relevance, which didn’t work too well. At around that time, I got too busy with other stuff and decided to quit while I’m ahead.

                                Getting more serious with some team work and LambdaMART

                                A few weeks after quitting, I somehow volunteered to organise Kaggle teams for newbies at the Sydney Data Science Meetup group. At that point I was joined by my teammates, which served as a good motivation to do more stuff.

                                The first thing we tried was another heuristic model I read about in one of the papers suggested by the organisers: just reranking based on the fact that people often repeat queries as a navigational aid (e.g., search for Facebook and click Facebook). Combined in a simple linear model with the other heuristics, this put us at #4. Too easy 🙂

                                With all the new motivation, it was time to read more papers and start doing things properly. We ended up using Ranklib’s LambdaMART implementation as one of our main models, and also used LambdaMART to combine the various models (the old heuristics still helped the overall score, as did the matrix factorisation model).

                                Using LambdaMART made it possible to directly optimise the NDCG measure, turning the key problem into feature engineering, i.e., finding good features to feed into the model. Explaining how LambdaMART works is beyond the scope of this post (see this paper for an in-depth discussion), but the basic idea (which is also shared by other learning to rank algorithms) is that rather than trying to solve the hard problem of predicting relevance (i.e., a regression problem), the algorithm tries to predict the ranking that yields the best results according to a user-chosen measure.

                                We tried many features for the LambdaMART model, but after feature selection (using a method learned from Phil Brierley’s talk) the best features turned out to be:

                                • percentage_recurrent_term_ids: percentage of term IDs from the test query that appeared previously in the session — indicates if this query refines previous queries
                                • query_mean_ndcg: historical NDCG for this query — indicates how satisfied people are with the results of this query. Interestingly, we also tried query click entropy, but it performed worse. Probably because we’re optimising the NDCG rather than click-through rate.
                                • query_num_unique_serps: how many different result pages were shown for this query
                                • query_mean_result_dwell_time: how much time on average people spend per result for this query
                                • user_mean_ndcg: like query_mean_ndcg, but for users — a low NDCG indicates that this user is likely to be dissatisfied with the results. As for query_mean_ndcg, adding this feature yielded better results than using the user’s click entropy.
                                • user_num_click_actions_with_relevance_0: over the history of this user, how many of their clicks had relevance 0 (i.e., short dwell time). Interestingly, user_num_click_actions_with_relevance_1 and user_num_click_actions_with_relevance_2 were found to be less useful.
                                • user_num_query_actions: number of queries performed by the user
                                • rank: the original rank, as assigned by Yandex
                                • previous_query_url_relevance_in_session: modelling repeated results within a session, e.g., if a (query, url) pair was already found irrelevant in this session, the user may not want to see it again
                                • previous_url_relevance_in_session: the same as previous_query_url_relevance_in_session, but for a url regardless of the query
                                • user_query_url_relevance_sum: over the entire history of the user, not just the session
                                • user_normalised_rank_relevance: how relevant does the user usually find this rank? The idea is that some people are more likely to go through all the results than others
                                • query_url_click_probability: estimated simply as num_query_url_clicks / num_query_url_occurrences (across all the users)
                                • average_time_on_page: how much time people spend on this url on average
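
To make the flavour of these features concrete, here is a rough sketch of how one of them could be derived from a session log; the DataFrame layout and column names are assumptions for the example, not the competition's actual schema.

# Rough sketch of percentage_recurrent_term_ids, assuming a hypothetical log
# with one row per query action (column names are illustrative).
import pandas as pd

log = pd.DataFrame({
    "session_id": [1, 1, 2],
    "query_terms": [{"cheap", "flights"}, {"cheap", "flights", "sydney"}, {"hotels"}],
})

def percentage_recurrent_term_ids(history, test_terms):
    """Share of the test query's term IDs that appeared earlier in the session."""
    seen = set().union(*history["query_terms"]) if len(history) else set()
    return len(test_terms & seen) / len(test_terms) if test_terms else 0.0

history = log[log["session_id"] == 1]
print(percentage_recurrent_term_ids(history, {"cheap", "flights", "melbourne"}))  # 2/3

# Aggregate features like query_num_unique_serps would be simple group-bys over
# historical data, e.g. log.groupby("query_id")["serp_id"].nunique() for a
# hypothetical log of result pages.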

Our best submission ended up placing us in 9th place (out of 194 teams), which is respectable. Things got a bit more interesting towards the end of the competition – if we had used the original heuristic model that put us at #4 early on, we would have finished 18th.

                                Conclusion

I really enjoyed this competition. The data was well-organised and well-defined, which is not something you get in every competition (or in “real life”). Its size did present some challenges, but we stuck to flat files, using some preprocessing and other tricks to speed things up (e.g., I got to use Cython for the first time). It was good to learn how learning to rank algorithms work and get some insights on search personalisation. As is often the case with Kaggle competitions, this was time well spent.

                                Subscribe

                                  Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

I do not understand how your feature set helped the model to learn anything. For example, user_num_query_actions: number of queries performed by the user.

How will it affect the order of search results for a new/test query?

                                  This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

diff --git a/2015/03/22/the-long-road-to-a-lifestyle-business/index.html b/2015/03/22/the-long-road-to-a-lifestyle-business/index.html

                                  The long road to a lifestyle business

Almost a year ago, I left my last full-time job and decided to set out on an independent path that includes data science consulting and work on my own projects. The ultimate goal is not to have to sell my time for money by generating enough passive income to live comfortably. My five main areas of focus are – in no particular order – personal branding & networking, data science contracting, Bandcamp Recommender, Price Dingo, and marine conservation. This post summarises what I’ve been doing in each of these five areas, including highlights and lowlights. So far, it’s way better than having a “real” job. I hope this post will help others who are on a similar journey (there seem to be more and more of us – I’d love to hear from you).

                                  Personal branding & networking

                                  Finding clients requires considerably more work than finding a full-time job. As with job hunting, the ideal situation is where people come to you for help, rather than you chasing them. To this end, I’ve been networking a lot, giving talks, writing up posts and working on distributing them. It may be harder than getting a full-time job, but it’s also much more interesting.

                                  Highlights: going viral in China, getting a post featured in KDNuggets
                                  Lowlights: not having enough time to write all the things and meet all the people

                                  Data science contracting

                                  My goal with contracting/consulting is to have a steady income stream while working on my own projects. As my projects are small enough to be done only by me (with optional outsourcing to contractors), this means I have infinite runway to pursue them. While this is probably not the best way of building a Silicon Valley-style startup that is going to make the world a better place, many others have applied this approach to building a so-called lifestyle business, which is what I want to achieve.

                                  Early on, I realised that doing full-on consulting would be too time consuming, as many clients expect full-time availability. In addition, constantly needing to find new clients means that not much time would be left for work on my own projects. What I really wanted was a stable part-time gig. The first one was with GetUp (who reached out to me following a workshop I gave at General Assembly), where I did some work on forecasting engagement and churn. In parallel, I went through the interview process at DuckDuckGo, which included delivering a piece of work to production. DuckDuckGo ended up wanting me to work full-time (like a few other companies), so last month I started a part-time (three days a week) contract at Commonwealth Bank. I joined a team of very strong data scientists – it looks like it’s going to be interesting.

                                  Highlights: seeing my DuckDuckGo work every time I search for a Python package, the work environment at GetUp
                                  Lowlights: chasing leads that never eventuated

                                  Bandcamp Recommender (BCRecommender)

I’ve written several posts about BCRecommender, my Bandcamp music recommendation project. While I’ve always treated it as a side-project, it’s been useful in learning how to get traction for a product. It now has thousands of monthly users, and is still growing. My goal for BCRecommender has changed from the original one of finding music for myself to growing it enough to be a noticeable source of traffic for Bandcamp, thereby helping artists and fans. Doing it in side-project mode can be a bit challenging at times (because I have so many other things to do and a long list of ideas to make the app better), but I’ve been making gradual progress and discovering a lot of great music in the process.

                                  Highlights: every time someone gives me positive feedback, every time I listen to music I found using BCRecommender
                                  Lowlights: dealing with Parse issues and random errors

                                  Price Dingo

                                  The inability to reliably compare prices for many types of products has been bothering me for a while. Unlike general web search, where the main providers rank results by relevance, most Australian price comparison engines still require merchants to pay to even have their products listed. This creates an obvious bias in the results. To address this bias, I created Price Dingo – a user-centric price comparison engine. It serves users with results they can trust by not requiring merchants to pay to have their products listed. Just like general web search engines, the main ranking factor is relevancy to the user. This relevancy is also achieved by implementing Price Dingo as a network of independent sites, each focused on a specific product category, with the first category being scuba diving gear.

                                  Implementing Price Dingo hasn’t been too hard – the main challenge has been finding the time to do it with all the other stuff I’ve been doing. There are still plenty of improvements to be made to the site, but now the main goal is to get enough traction to make ongoing time investment worthwhile. Judging by the experience of Booko’s founder, there is space in the market for niche price comparison sites and apps, so it is just a matter of execution.

                                  Highlights: being able to finally compare dive gear prices, the joys of integrating Algolia
                                  Lowlights: extracting data from messy websites – I’ve seen some horrible things…

                                  Marine conservation

                                  The first thing I did after leaving my last job was go overseas for five weeks, which included a ten-day visit to Israel (rockets!) and three weeks of conservation diving with New Heaven Dive School in Thailand. Back in Sydney, I joined the Underwater Research Group of NSW, a dive club that’s involved in many marine conservation and research activities, including Reef Life Survey (RLS) and underwater cleanups. With URG, I’ve been diving more than before, and for a change, some of my dives actually do good. I’d love to do this kind of stuff full-time, but there’s a lot less money in getting people to do less stuff (i.e., conservation and sustainability) than in consuming more. The compromise for now is that a portion of Price Dingo’s scuba revenue goes to the Australian Marine Conservation Society, and the plan is to expand this to other charities as more categories are added. Update – May 2015: I decided that this compromise isn’t good enough for me, so I shut down Price Dingo to focus on projects that are more aligned with my values.

                                  Highlights: becoming a certified RLS diver, pretty much every dive
                                  Lowlights: cutting my hand open by falling on rocks on the first day of diving in Thailand

                                  The future

                                  So far, I’m pretty happy with this not-having-a-job-doing-my-own-thing business. According to The 1000 Day Rule, I still have a long way to go until I get the lifestyle I want. It may even take longer than 1000 days given my decision to not work full-time on a single profitable project, together with my tendency to take more time off than I would if I had a “real” job. But the beauty of this path is that there are no investors breathing down my neck or the feeling of mental rot that comes with a full-time job, so there’s really no rush and I can just enjoy the ride.

                                  Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                    “What I really wanted was a stable part-time gig.”: They’re remarkably hard to find. It’s an absurdity of our time that many people are overemployed - selling more of their time than they want for more money than they need - even while many other people are underemployed - unable to sell enough of their time for enough money to live comfortably.
                                    That’s very true. The interesting thing is that it’s a problem that is not unique to this century. It was discussed by Thoreau in Walden (1854), Bertrand Russell in In Praise of Idleness (1932), and David Graeber in On the Phenomenon of Bullshit Jobs (2013), to name a few. People seem to be worried about robots taking their jobs, but the scarier thought is that robots will never take our jobs, because we’ll keep coming up with ways of staying employed rather than enjoy the affluence afforded by technological advancements.
diff --git a/2015/04/24/my-divestment-from-fossil-fuels/index.html b/2015/04/24/my-divestment-from-fossil-fuels/index.html

                                    My divestment from fossil fuels

                                    This post covers recent choices I've made to reduce my exposure to fossil fuels, including practical steps that can be taken by Australians and generally applicable lessons.

                                    I recently read Naomi Klein’s This Changes Everything, which deeply influenced me. The book describes how the world has been dragging its feet when it comes to reducing carbon emissions, and how we are coming very close to a point where climate change is likely to spin out of control. While many of the facts presented in the book can be very depressing, one ray of light is that it is still not too late to act. There are still things we can do to avoid catastrophic climate change.

                                    One such thing is divestment from fossil fuels. Fossil fuel companies have committed to extracting (and therefore burning) more than what scientists agree is the safe amount of carbon that can be pumped into the atmosphere. While governments have been rather ineffective in stopping this (the current Australian government is even embarrassingly rolling back emission-reduction measures), divesting your money from such companies can help take away the social licence of these companies to do as they please. Further, this may be a smart investment strategy because the world is moving towards renewable energy. Indeed, according to one index, investors who divested from fossil fuels have had higher returns than conventional investors over the last five years.

                                    It’s worth noting that even if you disagree with the scientific consensus that releasing billions of tonnes of greenhouse gases into the atmosphere increases the likelihood of climate change, you should agree that it’d be better to stop breathing all the pollutants that result from burning fossil fuels. Further, the environmental damage that comes with extracting fossil fuels is something worth avoiding. Examples include the Deepwater Horizon oil spill, numerous cases of poisoned water due to fracking, and the potential damage to the Great Barrier Reef due to coal mine expansion. Even climate change deniers would admit that divestment from fossil fuels and a rapid move to clean renewables will prevent such disasters.

                                    The rest of this post describes steps I’ve recently taken towards divesting from fossil fuels. These are mostly relevant to Australians, though other countries may have similar options.

                                    Superannuation

                                    In Australia, we have compulsory superannuation (commonly known as super), meaning that most working Australians have some money invested somewhere. As this money is only available at retirement, investors can afford to optimise for long-term returns. Many super funds allow investors to choose what to invest in, and switching funds is relatively straightforward. My super fund is UniSuper. Last week, I switched my plan from Balanced, which includes investments in coal miners Rio Tinto and BHP Billiton, to 75% Sustainable Balanced, which doesn’t directly invest in fossil fuels, and 25% Global Environment Opportunities, which is focused on companies with a green agenda such as Tesla. This switch was very simple – I wish I had done it earlier. If you’re interested in making a similar switch, check out Superswitch’s guide to fossil-free super options.

                                    Energy

                                    While our previous energy retailer (ClickEnergy) isn’t one of the big three retailers who are actively lobbying the government to reduce the renewable energy target for 2020, my partner and I decided to switch to Powershop, as it appears to be the greenest energy retailer in New South Wales. Powershop supports maintaining the renewable energy target in its current form and provides free carbon offsets for all non-renewable energy. In addition, Powershop allows customers to purchase 100% green power from renewables – an option that we choose to take. With the savings from moving to Powershop and the extra payment for green power, our bill is expected to be more or less the same as before. Everyone wins!

                                    Note: If you live in New South Wales or Victoria and generally support what GetUp is doing, you can sign up via the links on this page, and GetUp will be paid a referral fee by Powershop.

                                    Banking

There’s been a lot of focus recently on financing provided by the major banks to fossil fuel companies. The problem is that – unlike with super and energy – there aren’t many viable alternatives to the big banks. Reading the statements by smaller banks and credit unions, it is clear that they don’t provide financing to polluters simply because they’re too small or not focused on commercial lending. Further, some of the smaller banks invest their money with the bigger banks. If the smaller banks were to become big due to the divestment movement, they may end up financing polluters. Unfortunately, changing your bank doesn’t give you more control over how your chosen financial institution uses your money.

                                    For now, I think it makes sense to push the banks to become fossil free by putting them on notice or participating in demonstrations. With enough pressure, one of the big banks may make a strong statement against lending to polluters, and then it’ll be time to act on the notices. One thing that the big banks care about is customer satisfaction and public image. Sending a strong message about the connection between financing polluters and satisfaction may be enough to make a difference. I’ll be tracking news in this area and will possibly make a switch in the future, depending on how things evolve.

                                    Transportation

                                    My top transportation choices are cycling and public transport, followed by driving when the former two are highly inconvenient (e.g., when going scuba diving). Every bike ride means less pollution and is a vote against fossil fuels. Further, bike riding is my main form of exercise, so I don’t need to set aside time to go to the gym. Finally, it’s almost free, and it’s also the fastest way of getting to the city from where I live.

                                    Since January, I’ve been allowing people to borrow my car through Car Next Door. This service, which is currently active in Sydney and Melbourne, allows people to hire their neighbours’ cars, thereby reducing the number of cars on the road. They also carbon offset all the rides taken through the service. While making my car available has made using it slightly less convenient (because I need to book it for myself), it’s also saved me money, so far covering the cost of insurance and roadside assistance. With my car sitting idle for 95% of the time before joining Car Next Door, it’s definitely another win-win situation. If you’d like to join Car Next Door as either a borrower or an owner, you can use this link to get $15 credit.

                                    Other areas and next steps

                                    Many of the choices we make every day have the power to reduce energy demand. These choices often make our life better, as seen with the bike riding example above. There’s a lot of material online about these green choices, which I may cover from my angle in another post. In general, I’m planning to be more active in the area of environmentalism. While this may come at the cost of reduced focus on my other activities, I would rather be more a part of the solution than a part of the problem. I’ll update as I go – please subscribe to get notified when updates occur.

                                    Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                      Thanks for sharing your standpoint on this.
diff --git a/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html b/2015/05/02/first-steps-in-data-science-author-aware-sentiment-analysis/index.html

                                      First steps in data science: author-aware sentiment analysis

                                      People often ask me what’s the best way of becoming a data scientist. The way I got there was by first becoming a software engineer and then doing a PhD in what was essentially data science (before it became such a popular term). This post describes my first steps in the field with the goal of helping others who are interested in making the transition from pure software engineering to data science.

                                      While my first steps were in a PhD program, I don’t think that going through the formal PhD process is necessary if you wish to become a data scientist. Self-motivated individuals can get very far by making use of the abundance of learning resources available online. In fact, one can make progress much faster than in a PhD, because PhD programs have many overheads.

                                      This post is organised as a list of steps. Despite the sequential numbering, many steps can be done in parallel. These steps roughly recount the work I’ve done to publish my first paper, which was co-authored by Ingrid Zukerman and Fabian Bohnert. Most of the technical details are intentionally omitted. Readers who are interested in learning more are invited to read the original paper or chapter 6 in my thesis, which includes more thorough experiments and explanations.

                                      Step one: Find a problem to work on

                                      Even if you know nothing about the machine learning and statistics side of data science, it’s important to find a problem to work on. Ideally it’d be something you find personally interesting, as this helps with motivation. You could use a predefined problem such as a Kaggle competition or one of the UCI datasets. Alternatively, you could collect the data yourself to make things a bit more challenging.

                                      In my case, I was interested in natural language processing and user modelling. My supervisor was given a grant to work on sentiment analysis of opinion polls, which was my first direction of research. This quickly changed to focus on the connection between authors and the way they express their sentiments, with the application of harnessing this connection to improve the accuracy of sentiment analysis algorithms. For the purpose of this research, I collected a dataset of texts by the most prolific IMDb users. The problem was to infer the ratings these users assigned to their own reviews, with the hypothesis that methods that take author identity into account would outperform methods that ignore authorship information.

                                      Step two: Close your knowledge gaps

                                      Whatever problem you choose, you will have some knowledge gaps that require filling. Wikipedia, textbooks, and online courses will be your best guide for foundational areas like machine learning and statistics. Reading academic papers is often required to get a better understanding of recent work on the specific problem you’re trying to solve.

                                      Doing a PhD afforded me the luxury of spending about a month just reading papers. Most of the ~200 papers I read were on sentiment analysis, which gave me a good overview of what’s been done in the field. However, the best thing I’ve done was to stop reading and move on to working on the problem. This is also the best advice I can give: there’s no better way to learn than getting your hands dirty working on a problem.

                                      Step three: Get your hands dirty

                                      With a well-defined problem and the knowledge gaps more-or-less closed, it is time to come up with a plan and implement it. Due to my background in software engineering and some exposure to early collaborative filtering approaches to recommender systems, my plan was very much a part of what Leo Breiman called the algorithmic modelling culture. That is, I was more focused on developing algorithms that work than on modelling the process that generated the data. This approach is arguably more in line with the mindset that software engineers tend to have than with the approach of mathematicians and statisticians.

                                      The plan was quite simple:

                                      • Reproduce results that showed that rating inference models trained on enough texts by the target author (i.e., the author who wrote the text whose rating we want to predict) outperform models trained on texts by multiple authors
                                      • Use an approach inspired by collaborative filtering to combine multiple single-author models to infer ratings for texts by the target author, where those models are weighted by similarity to the target author
                                      • Experiment with multiple similarity measurements under various constraints on the number of texts available by the training and target authors
                                      • Iterate on these ideas until the results are publishable

                                      The rationale behind this plan was that while different people express their sentiments differently, similar people would express their sentiments similarly (e.g., use of understatements varies by culture). The key motivation was Pang and Lee’s finding that a model trained on a single author is best if we have enough texts by this author.
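
A minimal sketch of the collaborative-filtering-inspired combination from the plan is shown below; it assumes per-author models with a predict method and some author-similarity function, and is only an illustration of the idea rather than the exact formulation from the paper.

# Illustrative sketch: combine single-author rating models, weighted by the
# similarity of each training author to the target author (not the paper's
# exact formulation).
def predict_rating(text, target_author, author_models, similarity):
    weighted_sum, weight_total = 0.0, 0.0
    for author, model in author_models.items():
        weight = similarity(target_author, author)
        weighted_sum += weight * model.predict(text)
        weight_total += weight
    return weighted_sum / weight_total if weight_total else None

class ConstantModel:
    """Stand-in for a trained single-author model."""
    def __init__(self, rating):
        self.rating = rating
    def predict(self, text):
        return self.rating

def toy_similarity(author_a, author_b):
    return 0.9 if author_a == author_b else 0.3

models = {"author_a": ConstantModel(8.0), "author_b": ConstantModel(4.0)}
print(predict_rating("great film", "author_a", models, toy_similarity))  # closer to 8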

                                      The way I implemented the plan was vastly different from how I’d do it today. This was 2009, and using Java with the Weka package for the core modelling seemed like a huge improvement over the C/C++ I was used to. I relied heavily on the university grid to run experiments and wrote a bunch of code to handle experimental logic, including some Perl scripts for post-processing. It ended up being pretty messy, but it worked and I got publishable results. If I were to do the same work today, I’d use Python for everything. IPython Notebook is a great way of keeping track of experimental work, and Python packages like pandas, scikit-learn, gensim, TextBlob, etc. are mature and easy to use for data science applications.
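
As a taste of what “Python for everything” looks like, here is a tiny scikit-learn baseline for inferring ratings from review text; the reviews and ratings are made up, and a real setup would obviously use a proper dataset and evaluation.

# Toy scikit-learn baseline for rating inference from review text (made-up data).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

reviews = [
    "loved every minute, brilliant acting",
    "dull plot and wooden dialogue",
    "a masterpiece of modern cinema",
    "barely watchable, wanted my money back",
]
ratings = [9, 3, 10, 2]

model = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), Ridge())
model.fit(reviews, ratings)
print(model.predict(["brilliant acting but a dull plot"]))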

                                      Step four: Publish your results

                                      Having a deadline for publishing results can be stressful, but it has two positive outcomes. First, making your work public allows you to obtain valuable feedback. Second, hard deadlines are great in making you work towards a tangible goal. You can always keep iterating to get infinitesimal improvements, but publication deadlines force you to decide that you’ve done enough.

                                      In my case, the deadline for the UMAP 2010 conference and the promise of a free trip to Hawaii served as excellent motivators. But even if you don’t have the time or energy to get an academic paper published, you should set yourself a deadline to publish something on a blog or a forum, or even as a report to a mentor who can assess your work. Receiving continuous feedback is a key factor in improvement, so release early and release often.

                                      Step five: Improve results or move on

                                      Congratulations! You have published the results of your study. What now? You can either keep working on the same problem – try more approaches, add more data, change the constraints, etc. Or you can move on to work on other problems that interest you.

In my case, I had to go back to iterate on the results of the first paper because of things I learned later. I ended up rerunning all the experiments to make things fit together into a more-or-less coherent story for the thesis (writing a thesis is one of the main overheads that comes with doing a PhD). If I had a choice, I wouldn’t have done that. I would instead have pursued more sensible enhancements to the work presented in the paper, such as using the author as a feature, employing more robust ensemble methods, and testing different base methods than support vector machines. Nonetheless, I think that the core idea – that the identity of authors should be taken into account in sentiment analysis – is still relevant and viable today. But I’ve taken my own advice and moved on.

                                      Subscribe

                                        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                        Thanks for the stimulation. I’m still fascinated by the lure to extract sentiment from text, but it seems like so often the sentiment that the author intended never fully came to expression in the text. Maybe an interdisciplinary approach will be required to teach machines to parse the intentions implicit in text, and, like other media phenomena, a loop will have to form: perhaps the awareness that explicit intentions and sentiment are of benefit to authors in a world that (one day) automates the sorting of all its documents will cause writing styles to adapt. The effect of the best we can on what we’re doing now is one of those things you begin to see a pattern in. Here’s an API that correlates patterns of unstructured info: dev.keywordmeme.com Would love your feedback. Let me know if it’s useful to you or if you have any comments. Well done on the carbon post, btw. Glad I found your blog.

Thank you for the comment! I agree that analysing sentiment is very tricky due to the fact that people often don’t express themselves so well. If I remember correctly, inter-annotator agreement on some sentiment analysis tasks is only 70-80%, so it’s unlikely that we will ever have perfect performance by machines.

                                        dev.keywordmeme.com redirects to a github page – where is the API?

You bet. I’m fascinated to see what seems to be a real live push toward an interdisciplinary approach. That 70-80% performance might be pushed over the hump by humans with special training until such a time as the process can be formalized. It looks like auditing ML-driven processes could be a new category of employment through this next technological plateau. The human-machine relationship in a friendly old configuration! Sorry about the link. This should work: http://www.keywordmeme.com/. It makes you register, just a heads up. Hit the engineers up on github if you have any questions or if things aren’t working. Which is possible. Take care! :)

                                        Hi Yanir

Thank you very much for this post. Helpful for somebody like me seeking to be a data scientist.

I’m a software engineer, currently working as a master data architect.

                                        I’m taking MOOCs in order to fill the gaps, so let’s say I’m on a good track :)

                                        However, once problem found and hands got dirty, how to find a mentor ? afterwards, get published ?

                                        I think this would be hard via academic

                                        Finding a mentor depends on where you are. Good places to start would be your current workplace (if you work with data scientists), or local meetups (if there are any in your area). Another option would be to contribute to open source projects in the field as a way of getting to know people and getting feedback. Finally, there are courses like the one by Thinkful, where you can pay to be mentored.

                                        Regarding getting published, I agree that it’d be hard to get published in many academic venues without help from people who know how it’s done. However, you can always start your own blog and link to it from places like Reddit and DataTau. Even if you don’t get any feedback, publishing often forces you to think more deeply about the subject of your article.

At my workplace, it will be a bit hard.

I live in Paris, so meetups would be a good option.

You’re right, publishing forces you to think more deeply, and feedback from readers is also a good way to learn.




                                        Hopping on the deep learning bandwagon

I’ve been meaning to get into deep learning for the last few years. Now the stars have finally aligned, and I have the time and motivation to work on a small project that will hopefully improve my understanding of the field. This is the first in a series of posts that will document my progress on this project.

                                        As mentioned in a previous post on getting started as a data scientist, I believe that the best way of becoming proficient at solving data science problems is by getting your hands dirty. Despite being familiar with high-level terminology and having some understanding of how it all works, I don’t have any practical experience applying deep learning. The purpose of this project is to fix this experience gap by working on a real problem.

                                        The problem: Inferring genre from album covers

                                        Deep learning has been very successful at image classification. Therefore, it makes sense to work on an image classification problem for this project. Rather than using an existing dataset, I decided to make things a bit more interesting by building my own dataset. Over the last year, I’ve been running BCRecommender – a recommendation system for Bandcamp music. I’ve noticed that album covers vary by genre, though it’s hard to quantify exactly how they vary. So the question I’ll be trying to answer with this project is how accurately can genre be inferred from Bandcamp album covers?

                                        As the goal of this project is to learn about deep learning rather than make a novel contribution, I didn’t do a comprehensive search to see whether this problem has been addressed before. However, I did find a recent post by Alexandre Passant that describes his use of Clarifai’s API to tag the content of Spotify album covers (identifying elements such as men, night, dark, etc.), and then using these tags to infer the album’s genre. Another related project is Karayev et al.’s Recognizing image style paper, in which the authors classified datasets of images from Flickr and Wikipedia by style and art genre, respectively. In all these cases, the results are pretty good, supporting my intuition that the genre inference task is feasible.

                                        Data collection & splits

As I’ve already been crawling Bandcamp data for BCRecommender, creating the dataset was relatively straightforward. Currently, I have data on about 1.8 million tracks and albums. Bandcamp artists assign multiple tags to each release. To create the dataset, I selected 10 of the top tags: ambient, dubstep, folk, hiphop_rap, jazz, metal, pop, punk, rock, and soul. Then, I randomly selected 10,000 album covers that have exactly one of those tags, with 1,000 albums for each tag/genre. Each cover image is 350×350 pixels. The following image shows a sample of the dataset.
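For illustration, the sampling logic looks roughly like the sketch below. This is not the actual crawler code – the albums iterable, its field names, and the helper function are hypothetical, and it assumes at least 1,000 single-tag covers exist per genre:

import random
from collections import defaultdict

GENRES = {"ambient", "dubstep", "folk", "hiphop_rap", "jazz",
          "metal", "pop", "punk", "rock", "soul"}

def build_balanced_dataset(albums, per_genre=1000, seed=42):
    """Keep covers tagged with exactly one selected genre, then sample evenly per genre."""
    covers_by_genre = defaultdict(list)
    for album in albums:  # each album is assumed to be a dict with "tags" and "cover_url"
        matching = GENRES.intersection(album["tags"])
        if len(matching) == 1:  # skip covers with zero or multiple genre tags
            covers_by_genre[matching.pop()].append(album["cover_url"])
    rng = random.Random(seed)
    return {genre: rng.sample(covers, per_genre)
            for genre, covers in covers_by_genre.items()}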


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.



                                          Learning about deep learning through album cover classification

                                          In the past month, I’ve spent some time on my album cover classification project. The goal of this project is for me to learn about deep learning by working on an actual problem. This post covers my progress so far, highlighting lessons that would be useful to others who are getting started with deep learning.

                                          Initial steps summary

                                          The following points were discussed in detail in the previous post on this project.

                                          • The problem I chose to work on is classifying Bandcamp album covers by genre, using a balanced dataset of 10,000 images from 10 different genres.
                                          • The experimental code is based on Lasagne, and is available on GitHub.
                                          • Having set up the environment for running experiments on a GPU, the plan was to get Lasagne’s examples working on my dataset, and then iteratively read tutorials/papers/books, implement ideas, play with parameters, and visualise parts of the network until I’m satisfied with the results.

                                          Preliminary experiments and learning resources

                                          I hit several issues when adapting Lasagne’s example code to my dataset. The key issue is that the example code is based on the MNIST digits dataset. That dataset’s images are 28×28 grayscale, and my dataset’s images are 350×350 RGB. This difference led to the training loss quickly diverging when running the example code without any changes. It turns out that simply lowering the learning rate resolves this issue, though the initial results I got were still not much better than random. In general, it appears that everything works on the MNIST digits dataset, so choosing to work on my own dataset made things more challenging (which is a good thing).

                                          The main learning resource I used is the excellent notes for the Stanford course Convolutional Neural Networks for Visual Recognition. The notes are very clear, contain up-to-date information from recent publications, and include many practical tips for successful training of convolutional networks (convnets). In addition, I read some other tutorials and a few papers. These are summarised in a separate page.

                                          The first step after getting the MNIST examples working on my dataset was to extend the code to enable more flexible architectures. My main focus was on vanilla convnets, i.e., networks with several convolutional layers, where each convolutional layer is optionally followed by a max-pooling layer, and the convolutional layers are followed by multiple dense/fully-connected layers and dropout layers. To allow for easy experimentation, the specification of the network can be done from the command line. For example, to train an AlexNet architecture:

                                          $ python manage.py run_experiment \
                                               --dataset-path /path/to/dataset \
                                               --model-architecture ConvNet \
                                               --model-params num_conv_layers=5:num_dense_layers=2:lc0_num_filters=48:lc0_filter_size=11:lc0_stride=4:lc0_mp=True:lm0_pool_size=3:lm0_stride=2:lc1_num_filters=128:lc1_filter_size=5:lc1_mp=True:lm1_pool_size=3:lm1_stride=2:lc2_num_filters=192:lc2_filter_size=3:lc3_num_filters=192:lc3_filter_size=3:lc4_num_filters=128:lc4_filter_size=3:lc4_mp=True:lm4_pool_size=3:lm4_stride=2:ld0_num_units=2048:ld1_num_units=2048
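For readers unfamiliar with Lasagne, the specification above maps onto layers roughly as follows. This is an illustrative sketch rather than the project’s actual code (which is on GitHub); the layer sizes mirror the --model-params example, but the function and its defaults are mine:

from lasagne.layers import (InputLayer, Conv2DLayer, MaxPool2DLayer,
                            DenseLayer, DropoutLayer)
from lasagne.nonlinearities import rectify, softmax

def build_alexnet_like(num_classes=10, input_shape=(None, 3, 350, 350)):
    """Vanilla convnet: conv layers (some max-pooled), then dense layers with dropout."""
    net = InputLayer(shape=input_shape)
    net = Conv2DLayer(net, num_filters=48, filter_size=11, stride=4, nonlinearity=rectify)
    net = MaxPool2DLayer(net, pool_size=3, stride=2)
    net = Conv2DLayer(net, num_filters=128, filter_size=5, nonlinearity=rectify)
    net = MaxPool2DLayer(net, pool_size=3, stride=2)
    net = Conv2DLayer(net, num_filters=192, filter_size=3, nonlinearity=rectify)
    net = Conv2DLayer(net, num_filters=192, filter_size=3, nonlinearity=rectify)
    net = Conv2DLayer(net, num_filters=128, filter_size=3, nonlinearity=rectify)
    net = MaxPool2DLayer(net, pool_size=3, stride=2)
    net = DropoutLayer(DenseLayer(net, num_units=2048, nonlinearity=rectify), p=0.5)
    net = DropoutLayer(DenseLayer(net, num_units=2048, nonlinearity=rectify), p=0.5)
    return DenseLayer(net, num_units=num_classes, nonlinearity=softmax)  # one output per genre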

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                            Goodbye, Parse.com

Over the past year, I’ve been using Parse’s free backend-as-a-service and web hosting to serve BCRecommender (music recommendation service) and Price Dingo (now-closed shopping comparison engine). The main lesson: You get what you pay for. Despite some improvements, Parse remains very unreliable, and any time saved by using their APIs and SDKs tends to be offset by having to work around the restrictions of their sandboxed environment. This post details some of the issues I faced and the transition away from the service.

                                            What’s so bad about Parse?

                                            In one word: reliability. The service is simply unreliable, with many latency spikes and random errors. I reported this issue six months ago, and it’s still being investigated. Reliability has been a known issue for years (see Stack Overflow and Hacker News discussions). Parse’s acquisition by Facebook over two years ago gave some hope that these issues would be resolved quickly, but this is just not the case.

                                            It is worth noting that the way I used Parse was probably somewhat uncommon. For both Price Dingo and BCRecommender, data was scraped and processed outside Parse, and then imported in bulk into Parse. As bulk imports are not supported by the API, automating the process required reliance on the web interface, which made things somewhat fragile. Further, a few months ago Parse inexplicably dropped support for uploading zipped files, making imports much slower. Finally, when importing large collections, I found that it takes ages for the data to get indexed. The final straw was with the last BCRecommender update, where even after days of waiting the data was still not fully indexed.

                                            Price Dingo’s transition

                                            Price Dingo was a shopping comparison engine with a web interface. The idea was to focus on user needs in specialised product categories, as opposed to the traditional model that requires merchants to pay to be listed. I decided to shut down the service a few months ago to focus on other things, but before the shutdown, I almost completed the transition away from Parse. The first step was replacing the persistence layer with Algolia – search engine as a service. Algolia is super-fast, its advanced search capabilities are way better than Parse’s search options, and as a paid service their customer support was excellent. If I hadn’t shut Price Dingo down, the second step would have been replacing Parse hosting with a more reliable service, as I have recently done for BCRecommender.

                                            BCRecommender’s transition

                                            The Parse-hosted part of BCRecommender was a fairly simple express.js backend that rendered Jade templates. The fastest transition would probably have been to set up a standalone express.js backend and replace the Parse API calls with calls to the database. But as I much prefer coding in Python (the recommendation-generating backend is in Python), I decided to completely rewrite the web backend using Flask.

                                            For hosting, I decided to go with DigitalOcean (signing up with this link gives you US$10 credit), because it has a good reputation, and it compares favourably with other infrastructure-as-a-service providers. For US$10/month you get a server with 1GB of memory, 30GB of SSD storage, and 2TB of data transfers, which should be more than enough for BCRecommender’s modest traffic (200 daily users + ~2 bot requests per second).

                                            Setting up the BCRecommender webapp stack is a bit more involved than getting started with Parse, but fortunately I was already familiar with all parts of the stack. It ended up being almost identical to the stack used in Charlie Huang’s blog post Deploy a MongoDB powered Flask app in 5 minutes: an Ubuntu server running MongoDB as the persistence layer, Nginx as the webserver, Gunicorn as the WSGI proxy, Supervisor for daemon management, and Fabric for managing deployments.
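As a rough example of what the Fabric side of such a stack can look like, here is a minimal Fabric 1.x sketch – the host, user, paths, and Supervisor program name are hypothetical, not the actual BCRecommender deployment scripts:

# fabfile.py
from fabric.api import cd, env, run, sudo

env.hosts = ["bcrecommender.example.com"]  # hypothetical droplet hostname
env.user = "deploy"                        # hypothetical deployment user

def deploy():
    """Pull the latest code, update dependencies, and restart the Gunicorn workers."""
    with cd("/srv/bcrecommender"):                       # hypothetical app directory
        run("git pull origin master")
        run("venv/bin/pip install -r requirements.txt")
    sudo("supervisorctl restart bcrecommender")          # Supervisor manages Gunicorn

Running fab deploy from the local machine then performs the whole rollout in one step.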

                                            Before deploying to DigitalOcean, I used Vagrant to set up a local development environment, which is almost identical to the production environment. Deployment scripts are one thing that you don’t have to worry about when using Parse, as they provide their own build tools. However, it’s not too hard to implement your own scripts, so within a few hours I had the environment and the deployment scripts up and ready for translating the webapp code from express.js to Flask.

                                            The translation process was pretty straightforward and actually enjoyable. The Python code ended up being much cleaner and shorter than the JavaScript code (line count reduced to 284 from 378). This was partly thanks to the newly-found freedom of being able to install any package I wanted, and partly due to the reduction in callbacks, which made the code less nested and easier to understand.

                                            I was hoping to use PyJade to obviate the need for translating the page templates to Jinja. However, I ran into a bunch of issues and subtle bugs that made me decide to use PyJade for one-off translation to Jinja, followed by a manual process of ensuring that each template was converted correctly. Some of the issues were:

                                            • Using PyJade’s Flask extension compiles the templates to Jinja on the fly, so debugging issues is hard because the line numbers in the generated Jinja templates don’t match the line numbers in the original Jade files.
                                            • Jade allows the use of arbitrary JavaScript code, which PyJade doesn’t translate to Python (makes sense – it’d be too hard and messy). This caused many of my templates to simply not work because, e.g., I used the ternary operator or called a built-in JavaScript function. Worse than that, some cases failed silently, e.g., calling arr.length where arr is an array works fine in pure Jade, but is undefined in Python because arrays don’t have a length attribute.
                                            • Hyphenated block names are fine in Jade, but don’t compile in Jinja.

                                            The conversion to Jinja pretty much offset the cleanliness gained in the Python code, with a growth in template line count from 403 to 464 lines, and much clutter with unnecessary closing tags. Jade, I will miss you, but I guess I can’t have it all.

                                            The good news is that latency immediately dropped as I deployed the new environment. The graph below almost says it all. What’s missing is the much more massive spikes (5-60 seconds) and timeouts that happen pretty frequently with Parse hosting.


                                            Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

I think it’s all about what you expect. We used Parse for prototypes and it’s been working great for us so far. I actually think we still have one of the prototypes running on it which we haven’t touched in almost a year (a mobile/web strategy game now only available on FB - https://apps.facebook.com/foresttribes/). It’s been great since it also saved us the need to develop a backend admin tool to manage/balance the game or add additional content.

Overall, I’ve never had a real live public product running on Parse, so I can’t comment on that experience, but for prototypes I’m perfectly happy with the service.

Agreed, it’s perfectly fine for prototypes, but a bit too unreliable for public-facing live products. If Parse were more robust, it’d be perfect for many use cases.



                                            You don’t need a data scientist (yet)

                                            The hype around big data has caused many organisations to hire data scientists without giving much thought to what these data scientists are going to do and whether they’re actually needed. This is a source of frustration for all parties involved. This post discusses some questions you should ask yourself before deciding to hire your first data scientist.

                                            Q1: Do you know what data scientists do?

                                            Somewhat surprisingly, there are quite a few companies that hire data scientists without having a clear idea of what data scientists actually do. People seem to have a fear of missing out on the big data hype, and think of hiring data scientists as the solution. A common misconception is that a data scientist’s role includes telling you what to do with your data. While this may sometimes happen in practice, the ideal scenario is where the business has problems that can be solved using data science (more on this under Q3 below). If you don’t know what your data scientist is going to do, you probably don’t need one.

So what do data scientists do? When you think about it, adding the word “data” to “science” is a bit redundant, as all science is based on data. Following from this, anyone who does any kind of data analysis is a data scientist. While this may be true, such a broad definition is not very helpful. As discussed in a previous post, it’s more useful to define data scientists as individuals who combine expertise in statistics and machine learning with strong software engineering skills.

                                            Q2: Do you have enough data available?

                                            It’s not uncommon to see products that suffer from over-engineering and premature investment in advanced analytics capabilities. In the early stages, it’s important to focus on creating a minimum viable product and getting it to market quickly. Data science starts to shine once the product is generating enough data, as most of the power of advanced analytics is in optimising and automating existing processes.

                                            Not having a data scientist in the early stages doesn’t mean the data is being ignored – it just means that it doesn’t require the attention of a full-time data scientist. If your product is at an early stage and you are still concerned, you’re better off hiring a data science consultant for a few days to help lay out the long-term vision for data-driven capabilities. This would be cheaper and less time-consuming than hiring a full-timer. The exception to this rule is when the product itself is built around advanced analytics (e.g., AlchemyAPI or Enlitic). Building such products without data scientists is far from ideal, or just impossible.

                                            Even if your product is mature and generating a lot of data, it doesn’t mean it’s ready for data science. Advanced analytics capabilities are at the top of data’s hierarchy of needs: If your product is buggy, or if your data is scattered everywhere and your platform lacks centralised reporting, you need to first invest in fixing your data plumbing. This is the job of data engineers. Getting data scientists involved when the data is hardly available due to infrastructure issues is likely to lead to frustration. In addition, setting up centralised reporting and dashboarding is likely to give you ideas for problems that data scientists can solve.

                                            Q3: Do you have a specific problem to solve?

If the problem you’re trying to solve is “everyone is doing smart things with data, we should be doing stuff with data too”, you don’t have a specific problem that can be solved by bringing a data scientist on board. Defining the problem often ends up occupying a lot of the data scientist’s time, so you are likely to obtain better results if you have more than just a vague idea along the lines of “doing something with data, because Hadoop”. Ideally you want to optimise an existing process that is currently being solved with heuristics, make an existing model better, implement a new data-driven feature, or something along these lines. Common examples include reducing churn, increasing conversions, and replacing manual processes with automated data-driven systems. Again, getting advice from experienced data scientists before committing to hiring one may be your best first step.

                                            Q4: Can you get away with heuristics, intuition, and/or manual processes?

                                            Some data scientists would passionately claim that you must deploy only models that are theoretically justified and well-tested. However, in many cases you can get away with using simple heuristics, intuition, and/or manual processes. These can be orders of magnitude cheaper than building sophisticated predictive models and the infrastructure to support them. For many businesses, there are more pressing needs than doing everything in a theoretically sound way. Despite what many technical people like to think, customers don’t tend to care how things are implemented, as long as their needs are fulfilled.

                                            For example, I spent some time with a client whose product includes a semi-manual part where structured data is extracted from documents. Their process included sending some of the documents to a trained team in the Philippines for manual analysis. The client was interested in replacing that manual work with a machine learning algorithm. As is often the case with machine learning, it was unknown whether the resultant model would be accurate enough to completely replace the manual workers. This generally depends on data quality and the feasibility of solving the problem. Assessing the feasibility would have taken some time and money, so the client decided to park the idea and focus on other areas of their business.

                                            Every business has resource constraints. Situations where the best investment you can make is hiring a full-time data scientist are rarer than what the hype may make you think. It’s often the case that functions that would be the responsibility of a data scientist are adequately performed by existing employees, such as software engineers, business/data analysts, and marketers.

                                            Q5: Are you committed to being data-driven?

                                            I have seen more than one case where data scientists are hired only to be blocked or ignored. This is more prevalent in the corporate world, where managers are often incentivised to prioritise doing things that look good over things that make financial sense. But even if recruitment is done with the best intentions, progress may be blocked by employees who feel threatened because they would be replaced by automated data-driven algorithms. Successful data science projects require support from senior leadership, as discussed by Greta Roberts, Radim Řehůřek, Alec Smith, and many others. Without such support and a strong commitment to making data-driven decisions, everyone is just wasting their time.

                                            Closing thoughts

                                            While data science is currently over-hyped, many organisations still have much to gain from hiring data scientists. I hope that this post has helped you decide whether you need a data scientist right now. If you’re unsure, please don’t hesitate to contact me. And to any data scientists reading this: Be very wary of potential employers who do not have good answers to the above questions. At this point in time you can afford to be picky, at least until the hype is over.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                              I enjoyed the post - though I offer some contrary points to consider:

I have learned that if it is clear that you will need a data scientist (as judged by someone who knows what they do), then you should get them as soon as possible. Don’t wait. Data scientists work best when they have full context for the problem they are here to solve. Getting them in early allows them to help frame the problem. This framing is critical. If the framing is off, it takes a very long time (sometimes never) to get it back on track. A late-to-the-game data scientist can be too influenced by the existing framing they are given. They tend to think within that box, when in reality, the box was never the right way to approach the problem. Even if they do see outside of it, it can be very difficult to convince the original framers that there is a better way to do things (people can get quite attached to their vision).

It also can be wise to NOT WAIT till there is data to analyze. Too often, data is an afterthought. It’s important for the data scientist to get in early on the initiative so he or she can help define the needed instrumentation and data acquisition strategy. They can even guide the needs of the data warehouse and other repositories where the newly captured data will reside.

                                              Further, it is often the case that it is the data scientist that identifies the specific problem to solve. At my company, I estimate that over half of the ideas for new data products, features, and services come from the data science team – not the business. This is intuitive as the data scientists are the folks that are most intimate with the data and are least constrained by what is possible to do with data. Give them business context and they will come up with problems/solutions that no one has thought of.

Finally, I find heuristics to be dangerous. At best they are suboptimal, and more often than not, they are just plain wrong (those with extensive A/B testing experience can attest to the fact that our intuition fails us again and again). Undoing a bad heuristic can be very painful – in the technical work, the coordination work, and in the resetting of expectations. It’s hard to get people to not walk on a paved path … even if that path is the long way or a dead end.

                                              I totally agree with “Q5: Are you committed to being data-driven?”. This comes down to business model and culture. Is your business model one where data science can be the source of strategic differentiation? Is your culture able to support empiricism? The answer to both of these has to be ‘yes’ in order to commit to being data-driven.

                                              Thank you for your thoughtful comments, Eric!

I generally agree that it can be beneficial to involve data scientists early on and to avoid thoughtless heuristics, but it all depends on having a supportive data-driven environment and on resource constraints. As mentioned under Q2, getting advice from a data scientist in the early stages of the product is worthwhile, so it may be smart to pay for a few days of consulting, but not necessarily a good idea to hire a full-timer. A lot of it depends on the general product vision.

                                              Another note regarding heuristics and intuition: While some may be dangerous, you can view many modelling decisions as heuristics. For example, when building a predictive model, you have to make some intuition-driven choices around features (no model uses all the knowledge in the world), learning algorithms and their hyperparameters. You just can’t test everything, so there’s a need for compromises if you aim to ever deliver anything.


                                              The wonderful world of recommender systems

                                              I recently gave a talk about recommender systems at the Data Science Sydney meetup (the slides are available here). This post roughly follows the outline of the talk, expanding on some of the key points in non-slide form (i.e., complete sentences and paragraphs!). The first few sections give a broad overview of the field and the common recommendation paradigms, while the final part is dedicated to debunking five common myths about recommender systems.

                                              Motivation: Why should we care about recommender systems?

                                              The key reason why many people seem to care about recommender systems is money. For companies such as Amazon, Netflix, and Spotify, recommender systems drive significant engagement and revenue. But this is the more cynical view of things. The reason these companies (and others) see increased revenue is because they deliver actual value to their customers – recommender systems provide a scalable way of personalising content for users in scenarios with many items.

                                              Another reason why data scientists specifically should care about recommender systems is that it is a true data science problem. That is, at least according to my favourite definition of data science as the intersection between software engineering, machine learning, and statistics. As we will see, building successful recommender systems requires all of these skills (and more).

                                              Defining recommender systems

When trying to define anything, a reasonable first step is to ask Wikipedia. Unfortunately, as of the day of this post’s publication, Wikipedia defines recommender systems too narrowly, as “a subclass of information filtering system that seek to predict the ‘rating’ or ‘preference’ that a user would give to an item” (I should probably fix it, but this wrong definition helped my talk flow better – let me know if you fix it and I’ll update this paragraph).

                                              The problem with Wikipedia’s definition is that there’s so much more to recommender systems than rating prediction. First, recommender is a misnomer – calling it a discovery assistant is better, as the so-called recommendations are far from binding. Second, system means that elements like presentation are important, which is part of what makes recommendation such an interesting data science problem.

                                              My definition is simply:

                                              Recommender systems are systems that help users discover items they may like.

                                              Recommendation paradigms

                                              Depending on who you ask, there are between two and twenty different recommendation paradigms. The usual classification is by the type of data that is used to generate recommendations. The distinction between approaches is more academic than practical, as it is often a good idea to use hybrids/ensembles to address each method’s limitations. Nonetheless, it is worthwhile discussing the different paradigms. The way I see it, if you ignore trivial approaches that often work surprisingly well (e.g., popular items, and “watch it again”), there are four main paradigms: collaborative filtering, content-based, social/demographic, and contextual recommendation.

                                              Collaborative filtering is perhaps the most famous approach to recommendation, to the point that it is sometimes seen as synonymous with the field. The main idea is that you’re given a matrix of preferences by users for items, and these are used to predict missing preferences and recommend items with high predictions. One of the key advantages of this approach is that there has been a huge amount of research into collaborative filtering, making it pretty well-understood, with existing libraries that make implementation fairly straightforward. Another important advantage is that collaborative filtering is independent of item properties. All you need to get started is user and item IDs, and some notion of preference by users for items (ratings, views, etc.).
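To make the collaborative filtering idea a bit more concrete, here is a minimal matrix-factorisation sketch in plain NumPy, assuming a tiny made-up preference matrix where zeros stand for unobserved preferences. A real system would use sparse matrices and an off-the-shelf library rather than hand-rolled gradient descent.

import numpy as np

# Toy preference matrix: rows are users, columns are items,
# zeros mark unobserved preferences (made-up numbers for illustration).
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [0, 0, 5, 4],
], dtype=float)

n_users, n_items, k = R.shape[0], R.shape[1], 2
rng = np.random.default_rng(0)
P = rng.normal(scale=0.1, size=(n_users, k))  # user latent factors
Q = rng.normal(scale=0.1, size=(n_items, k))  # item latent factors

observed = [(u, i) for u in range(n_users) for i in range(n_items) if R[u, i] > 0]
lr, reg = 0.01, 0.05
for _ in range(2000):  # stochastic gradient descent on observed entries only
    for u, i in observed:
        err = R[u, i] - P[u] @ Q[i]
        p_u = P[u].copy()
        P[u] += lr * (err * Q[i] - reg * p_u)
        Q[i] += lr * (err * p_u - reg * Q[i])

# Filled-in matrix: high predicted values for unobserved entries become the recommendations.
print(np.round(P @ Q.T, 1))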

                                              The major limitation of collaborative filtering is its reliance on preferences. In a cold-start scenario, where there are no preferences at all, it can’t generate any recommendations. However, cold starts can also occur when there are millions of available preferences, because pure collaborative recommendation doesn’t work for items or users with no ratings, and often performs pretty poorly when there are only a few ratings. Further, the underlying collaborative model may yield disappointing results when the preference matrix is sparse. In fact, this has been my experience in nearly every situation where I deployed collaborative filtering. It always requires tweaking, and never simply works out of the box.

                                              Content-based algorithms are given user preferences for items, and recommend similar items based on a domain-specific notion of item content. The main advantage of content-based recommendation over collaborative filtering is that it doesn’t require as much user feedback to get going. Even one known user preference can yield many good recommendations (which can lead to the collection of preferences to enable collaborative recommendation). In many scenarios, content-based recommendation is the most natural approach. For example, when recommending news articles or blog posts, it’s natural to compare the textual content of the items. This approach also extends naturally to cases where item metadata is available (e.g., movie stars, book authors, and music genres).
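As a rough sketch of the content-based approach, the snippet below uses scikit-learn's TF-IDF vectoriser and cosine similarity over made-up item texts; in practice the "content" could be article text, tags, or concatenated metadata.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical items and their textual content (e.g., blog posts or tag lists).
items = {
    "post-1": "deep learning for image recognition with convolutional networks",
    "post-2": "convolutional neural networks applied to medical imaging",
    "post-3": "slow cooker recipes for hearty winter soups and stews",
    "post-4": "a beginner guide to training neural networks",
}
ids = list(items)

vectors = TfidfVectorizer(stop_words="english").fit_transform([items[i] for i in ids])

# Recommend the items most similar to a single known preference.
liked = "post-1"
scores = cosine_similarity(vectors[ids.index(liked)], vectors).ravel()
recommendations = sorted(
    (item for item in ids if item != liked),
    key=lambda item: -scores[ids.index(item)],
)
print(recommendations)  # similar posts first, the soup recipes last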

                                              One problem with deploying content-based recommendations arises when item similarity is not so easily defined. However, even when it is natural to measure similarity, content-based recommendations may end up being too homogeneous to be useful. Such recommendations may also be too static over time, thereby failing to adjust to changes in individual user tastes and other shifts in the underlying data.

                                              Social and demographic recommenders suggest items that are liked by friends, friends of friends, and demographically-similar people. Such recommenders don’t need any preferences by the user to whom recommendations are made, making them very powerful. In my experience, even trivially-implemented approaches can be depressingly accurate. For example, just summing the number of Facebook likes by a person’s close friends can often be enough to paint a pretty accurate picture of what that person likes.
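A deliberately naive sketch of the "sum your close friends' likes" idea, with made-up friends and likes:

from collections import Counter

# Hypothetical likes by a user's close friends.
friend_likes = {
    "alice": ["Radiohead", "Portishead", "Massive Attack"],
    "bob": ["Radiohead", "Aphex Twin"],
    "carol": ["Massive Attack", "Radiohead", "Bonobo"],
}

# Count how many close friends like each item and recommend the most-liked ones.
scores = Counter(item for likes in friend_likes.values() for item in likes)
print(scores.most_common(3))  # [('Radiohead', 3), ('Massive Attack', 2), ('Portishead', 1)]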

                                              Given this power of social and demographic recommenders, it isn’t surprising that social networks don’t easily give their data away. This means that for many practitioners, employing social/demographic recommendation algorithms is simply impossible. However, even when such data is available, it is not always easy to use without creeping users out. Further, privacy concerns need to be carefully addressed to ensure that users are comfortable with using the system.

                                              Contextual recommendation algorithms recommend items that match the user’s current context. This allows them to be more flexible and adaptive to current user needs than methods that ignore context (essentially giving the same weight to all of the user’s history). Hence, contextual algorithms are more likely to elicit a response than approaches that are based only on historical data.

                                              The key limitations of contextual recommenders are similar to those of social and demographic recommenders – contextual data may not always be available, and there’s a risk of creeping out the user. For example, ad retargeting can be seen as a form of contextual recommendation that follows users around the web and across devices, without having the explicit consent of the users to being tracked in this manner.

                                              Five common myths about recommender systems

                                              There are some common myths and misconceptions surrounding recommender systems. I’ve picked five to address in this post. If you disagree, agree, or have more to add, I would love to hear from you either privately or in the comment section.

The accuracy myth: Offline optimisation of an accuracy measure is sufficient for creating a successful recommender.
Reality: Users don't really care about accuracy.

                                              This is perhaps the most prevalent myth of all, as evidenced by Wikipedia’s definition of recommender systems. It’s somewhat surprising that it still persists, as it’s been almost ten years since McNee et al.’s influential paper on the damage the focus on accuracy measures has done to the field.

                                              It is therefore worth asking where this myth came from. My theory is that it is a feedback loop between academia and industry. In academia it is pretty easy to publish papers with infinitesimal improvements to arbitrary accuracy measures on offline datasets (I’m also guilty of doing just that), while it’s relatively hard to run experiments on live systems. However, one of the moves that significantly increased focus on offline predictive accuracy came from industry, in the form of the $1M Netflix prize, where the goal was to improve the accuracy of Netflix’s rating prediction algorithm by 10%.

                                              Notably, most of the algorithms that came out of the three-year competition were never integrated into Netflix. As discussed on the Netflix blog:

                                              You might be wondering what happened with the final Grand Prize ensemble that won the $1M two years later… We evaluated some of the new methods offline but the additional accuracy gains that we measured did not seem to justify the engineering effort needed to bring them into a production environment.

                                              Our business objective is to maximize member satisfaction and month-to-month subscription retention… Now it is clear that the Netflix Prize objective, accurate prediction of a movie’s rating, is just one of the many components of an effective recommendation system that optimizes our members’ enjoyment.

                                              The following chart says it all (taken from the second part of the blog post quoted above):

                                              flower
                                              2016-01-26 18:47:02

It's not clear to me why accuracy is not important in recommenders and search??

                                              It is important, but its importance tends to be exaggerated to the exclusion of all other metrics. As I said in the post, things like the way you present your results (UI/UX) and novelty/serendipity are also very important. In addition, the goal of the system is often to optimise a different goal from offline accuracy, such as revenue or engagement. In such cases it is best to focus on what you want to improve rather than offline accuracy.

By the way, I attended a talk by Ted Dunning a few months ago, where he said that one of the most important tweaks in real-life recommender systems is adding random recommendations (essentially decreasing offline accuracy). This allows the system to learn from user feedback on a wider range of items, improving performance in the long run.
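For what it's worth, the kind of tweak Ted Dunning describes might look like the sketch below, where a fraction of a ranked recommendation list is replaced with random catalogue items to gather feedback on a wider range of items (the names and the 20% exploration rate are arbitrary).

import random

def mix_in_random(ranked_items, catalogue, n=10, exploration_rate=0.2, seed=None):
    """Return n recommendations with a fraction replaced by random catalogue items."""
    rng = random.Random(seed)
    n_random = int(n * exploration_rate)
    picks = list(ranked_items[:n - n_random])
    pool = [item for item in catalogue if item not in picks]
    picks += rng.sample(pool, min(n_random, len(pool)))
    rng.shuffle(picks)  # don't always show the exploratory items last
    return picks

catalogue = [f"item-{i}" for i in range(100)]
ranked = catalogue[:20]  # pretend these are the model's top-ranked items
print(mix_in_random(ranked, catalogue, seed=42))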

Thank you very much for your fast response. Would you please send me the ted talk’s title so I can watch it?
It’s a talk by Ted Dunning, not a TED talk :) Anyway, the video is here: https://www.youtube.com/watch?v=FKAFe2iN_Yw (both talks in the video are excellent)

Two points that were not discussed but that I see recurring, at least in e-commerce: people underestimate the effort necessary to build an online real-time recommender system and overestimate its impact – offline recommendations are enough for the majority of cases. Second, there is an obsession with novel algorithms, when often bread-and-butter ALS plus domain knowledge reflected in business rules brings better impact.

                                              Overall, really good overview, and I fully agree with all points!

                                              This was an excellent read - would highly recommend ;) - thank you.
                                              Thanks a lot for giving an awesome overview of recommender systems. Have you considered writing a textbook on the topic?
Nah, there are already many books out there and it’s too much work to write another one :)
Hi! Thank you for the article. It helps me quite a bit with my project. However, can you suggest a book or link that shows how to implement the algorithms in code? I guess it’s a little bit difficult. I haven’t found any articles on implementing the algorithms, especially in Java, which I’m currently using to develop my recommender app. I’m stuck at the algorithm. Hope you or anyone can help. Thank you!
                                              Thank you for your comment. You can check out Mahout in Action (https://www.manning.com/books/mahout-in-action) or Oryx (http://oryx.io/) for some examples.
Hi Yanir, thanks a lot for a comprehensive article on recommender systems. What I’m particularly interested in is recommender systems for subscription box businesses, as in the case of BarkBox, Birchbox, Blue Apron, Harry’s, OwlCrate, Trunk Club, and Winc. What type of recommender system do they use, and why is it beneficial?
                                              Thanks for your comment. Unfortunately, I’m not familiar with those businesses so I don’t really know what recommender systems they use…

                                              This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

                                              \ No newline at end of file diff --git a/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html b/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html index 98a851639..3b4821637 100644 --- a/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html +++ b/2015/10/19/nutritionism-and-the-need-for-complex-models-to-explain-complex-phenomena/index.html @@ -1,12 +1,14 @@ Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling | Yanir Seroussi | Data & AI for Startup Impact -

                                              Miscommunicating science: Simplistic models, nutritionism, and the art of storytelling

                                              I recently finished reading the book In Defense of Food: An Eater’s Manifesto by Michael Pollan. The book criticises nutritionism – the idea that one should eat according to the sum of measured nutrients while ignoring the food that contains these nutrients. The key argument of the book is that since the knowledge derived using food science is still very limited, completely relying on the partial findings and tools provided by this science is likely to lead to health issues. Instead, the author says we should “Eat food. Not too much. Mostly plants.” One of the reasons I found the book interesting is that nutritionism is a special case of misinterpretation and miscommunication of scientific results. This is something many data scientists encounter in their everyday work – finding the balance between simple and complex models, the need to “sell” models and their results to non-technical stakeholders, and the requirement for well-performing models. This post explores these issues through the example of predicting human health based on diet.

                                              As an aside, I generally agree with the book’s message, which is backed by fairly thorough research (though it is a bit dated, as the book was released in 2008). There are many commercial interests invested in persuading us to eat things that may be edible, but shouldn’t really be considered food. These food-like products tend to rely on health claims that dumb down the science. A common example can be found in various fat-free products, where healthy fat is replaced with unhealthy amounts of sugar to compensate for the loss of flavour. These products are then marketed as healthy due to their lack of fat. The book is full of such examples, and is definitely worth reading, especially if you live in the US or in a country that’s heavily influenced by American food culture.

                                              Running example: Predicting a person’s health based on their diet

                                              Predicting health based on diet isn’t an easy problem. First, how do you quantify and measure health? You could use proxies like longevity and occurrence/duration of disease, but these are imperfect measures because you can have a long unhealthy life (thanks to modern medicine) and some diseases are more unbearable than others. Another issue is that there are many factors other than diet that contribute to health, such as genetics, age, lifestyle, access to healthcare, etc. Finally, even if you could reliably study the effect of diet in isolation from other factors, there’s the question of measuring the diet. Do you measure each nutrient separately or do you look at foods and consumption patterns? Do you group foods by time (e.g., looking at overall daily or monthly patterns)? If you just looked at the raw data of foods and nutrients consumed at certain points in time, every studied subject is likely to be an outlier (due to the curse of dimensionality). The raw data on foods consumed by individuals has to be grouped in some way to build a generalisable model, but groupings necessitate removal of some data.

                                              Modelling real-world data is rarely straightforward. Many assumptions are embedded in the measurements and models. Good scientific papers are explicit about the shortcomings and limitations of the presented work. However, by the time scientific studies make it to the real world, shortcomings and limitations are removed to present palatable (and often wrong) conclusions to a general audience. This is illustrated nicely by the following comic:

                                              PHD Comics: Science News Cycle

                                              Selling your model with simple explanations

                                              People like simple explanations for complex phenomena. If you work as a data scientist, or if you are planning to become/hire one, you’ve probably seen storytelling listed as one of the key skills that data scientists should have. Unlike “real” scientists that work in academia and have to explain their results mostly to peers who can handle technical complexities, data scientists in industry have to deal with non-technical stakeholders who want to understand how the models work. However, these stakeholders rarely have the time or patience to understand how things truly work. What they want is a simple hand-wavy explanation to make them feel as if they understand the matter – they want a story, not a technical report (an aside: don’t feel too smug, there is a lot of knowledge out there and in matters that fall outside of our main interests we are all non-technical stakeholders who get fed simple stories).

                                              One of the simplest stories that most people can understand is the story of correlation. Going back to the running example of predicting health based on diet, it is well-known that excessive consumption of certain fats under certain conditions is correlated with an increase in likelihood of certain diseases. This is simplified in some stories to “consuming more fat increases your chance of disease”, which leads to the conclusion that consuming no fat at all decreases the chance of disease to zero. While this may sound ridiculous, it’s the sad reality. According to a recent survey, while the image of fat has improved over the past few years, 42% of Americans still try to limit or avoid all fats.

                                              A slightly more involved story is that of linear models – looking at the effect of the most important factors, rather than presenting a single factor’s contribution. This storytelling technique is commonly used even with non-linear models, where the most important features are identified using various techniques. The problem is that people still tend to interpret this form of presentation as a simple linear relationship. Expanding on the previous example, this approach goes from a single-minded focus on fat to the need to consume less fat and sugar, but more calcium, protein and vitamin D. Unfortunately, even linear models with tens of variables are hard for people to use and follow. In the case of nutrition, few people really track the intake of all the nutrients covered by recommended daily intakes.

                                              Few interesting relationships are linear

                                              Complex phenomena tend to be explained by complex non-linear models. For example, it’s not enough to consume the “right” amount of calcium – you also need vitamin D to absorb it, but popping a few vitamin D pills isn’t going to work well if you don’t consume them with fat, though over-consumption of certain fats is likely to lead to health issues. This list of human-friendly rules can go on and on, but reality is much more complex. It is naive to think that it is possible to predict something as complex as human health with a simple linear model that is based on daily nutrient intake. That being said, some relationships do lend themselves to simple rules of thumb. For example, if you don’t have enough vitamin C, you’re very likely to get scurvy, and people who don’t consume enough vitamin B1 may contract beriberi. However, when it comes to cancers and other diseases that take years to develop, linear models are inadequate.
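The point about linearity can be illustrated with a toy simulation: below, a made-up "health" outcome improves only when two nutrients are present together, an interaction that a plain linear model cannot express but a non-linear model picks up easily (the data and effect sizes are entirely fabricated).

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Fabricated data: "health" improves only when both calcium and vitamin D
# intakes are above a threshold (a made-up interaction effect).
rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(2000, 2))  # columns: calcium, vitamin D
y = 10 * ((X[:, 0] > 0.5) & (X[:, 1] > 0.5)) + rng.normal(scale=0.5, size=2000)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
linear = LinearRegression().fit(X_train, y_train)
forest = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_train, y_train)

# The linear model misses most of the interaction; the forest captures it.
print("linear R^2:", round(r2_score(y_test, linear.predict(X_test)), 2))
print("forest R^2:", round(r2_score(y_test, forest.predict(X_test)), 2))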

                                              An accurate model to predict human health based on diet would be based on thousands to millions of variables, and would consider many non-linear relationships. It is fairly safe to assume that there is no magic bullet that simply explains how diet affects our health, and no superfood is going to save us from the complexity of our nutritional needs. It is likely that even if we had such a model, it would not be completely accurate. All models are wrong, but some models are useful. For example, the vitamin C versus scurvy model is very useful, but it is often wrong when it comes to predicting overall health. Predictions made by useful complex models can be very hard to reason about and explain, but it doesn’t mean we shouldn’t use them.

                                              The ongoing quest for sellable complex models

                                              All of the above should be pretty obvious to any modern data scientist. The culture of preferring complex models with high predictive accuracy to simplistic models with questionable predictive power is now prevalent (see Leo Breiman’s 2001 paper for a discussion of these two cultures of statistical modelling). This is illustrated by the focus of many Kaggle competitions on producing accurate models and the recent successes of deep learning for computer vision. Especially with deep learning for vision, no one expects a handful of variables (pixels) to be predictive, so traditional explanations of variable importance are useless. This does lead to a general suspicion of such models, as they are too complex for us to reason about or fully explain. However, it is very hard to argue with the empirical success of accurate modelling techniques.

                                              Nonetheless, many data scientists still work in environments that require simple explanations. This may lead some data scientists to settle for simple models that are easier to sell. In my opinion, it is better to make up a simple explanation for an accurate complex model than settle for a simple model that doesn’t really work. That being said, some situations do call for simple or inflexible models due to a lack of data or the need to enforce strong prior assumptions. In Albert Einstein’s words, “it can scarcely be denied that the supreme goal of all theory is to make the irreducible basic elements as simple and as few as possible without having to surrender the adequate representation of a single datum of experience”. Make things as simple as possible, but not simpler, and always consider the interests of people who try to sell you simplistic (or unnecessarily complex) explanations.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                \ No newline at end of file diff --git a/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html b/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html index 1bd6a5800..f22983619 100644 --- a/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html +++ b/2015/11/04/migrating-a-simple-web-application-from-mongodb-to-elasticsearch/index.html @@ -1,5 +1,5 @@ Migrating a simple web application from MongoDB to Elasticsearch | Yanir Seroussi | Data & AI for Startup Impact -

                                                Migrating a simple web application from MongoDB to Elasticsearch

                                                Bandcamp Recommender (BCRecommender) is a web application that serves music recommendations from Bandcamp. I recently switched BCRecommender’s data store from MongoDB to Elasticsearch. This has made it possible to offer a richer search experience to users at a similar cost. This post describes the migration process and discusses some of the advantages and disadvantages of using Elasticsearch instead of MongoDB.

                                                Motivation: Why swap MongoDB for Elasticsearch?

                                                I’ve written a few posts in the past on BCRecommender’s design and implementation. It is a fairly simple application with two main components: the backend worker that crawls data and generates recommendations in batch, and the webapp that serves the recommendations. Importantly, each of these components has its own data store, with the recommendations synced up from the worker to the webapp, and data like events and subscriptions synced down from the webapp to the worker. Recently, I migrated the webapp component from Parse to DigitalOcean, replacing Parse’s data store with MongoDB. Choosing MongoDB was meant to simplify the transition – Parse uses MongoDB behind the scenes, as does the backend worker. However, moving out of Parse’s sandboxed environment freed me to choose any data store, and Elasticsearch seemed like a good candidate that would make it possible to expose advanced search capabilities to end users.

                                                Advanced search means different things to different people. In BCRecommender’s case what I had in mind was rather modest, at least for the initial stages. BCRecommender presents recommendations for two types of entities: fans and tralbums (tracks/albums). In both cases, the recommended items are tralbums. When the key is a fan, the recommendations are tralbums that they may like, and when the key is a tralbum, the recommendations are similar tralbums. Each tralbum has a title, an artist name, and a list of tags. Each fan has its Bandcamp username as a primary key, and a list of tags that is derived from the tralbums in the fan’s collection. Originally, “searching” required users to either enter the exact username of a Bandcamp fan, or the exact Bandcamp link of a tralbum – not the best user experience! Indeed, I was tracking the search terms and found that many people were unsuccessfully trying to use unstructured queries. My idea of advanced search was to move away from the original key-value approach to full-text search that considers tags, titles, artists, and other fields that may get added later.

                                                It was clear that while it may be possible to provide advanced search with MongoDB, it wouldn’t be a smooth ride. While recent versions of MongoDB include support for full-text search, it isn’t as feature-rich as Elasticsearch. For example, MongoDB text indices do not store phrases or information about the proximity of words in the documents, making phrase queries run slowly unless the entire collection fits in memory. The names really say it all: MongoDB is a database with some search capabilities, and Elasticsearch is a search engine with some database capabilities. It seems pretty common to use MongoDB (or another database) as a data store and supply search through Elasticsearch, so I figured it isn’t a bad idea to apply this pattern to BCRecommender.

                                                It is worth noting that if BCRecommender were a for-profit project, I would probably use Algolia rather than Elasticsearch. My experience with Algolia on a different project has been excellent – they make it easy for you to get started, have great customer service, and deliver good and fast results with minimal development and operational effort. The two main disadvantages of Algolia are its price and the fact that it’s a closed-source solution (see further discussion on Quora). At over two million records, the monthly cost of running Algolia for BCRecommender would be around US$649, which is more than what I’m willing to spend on this project. However, for a business this may be a reasonable cost because deploying and maintaining an Elasticsearch cluster may end up costing more. Nonetheless, many businesses use Elasticsearch successfully, which is why I have no doubt that it’s a great choice for my use case – it just requires more work than Algolia to get up and running.

                                                Executing the migration plan

                                                The plan for migrating the webapp from MongoDB to Elasticsearch was pretty simple:

                                                1. Read the Elasticsearch manual to ensure it suits my needs
                                                2. Replace MongoDB with Elasticsearch without making any user-facing changes
                                                3. Expose full-text search to BCRecommender users
                                                4. Improve search performance based on user behaviour
                                                5. Implement more search features

Reading the manual is not something I do for every piece of technology I use (there are just too many tools out there these days), but for Elasticsearch it seemed to be worth the effort. I’m not done reading yet, but covering the material in the Getting Started and Search in Depth sections gave me enough information to complete steps 2 & 3. The main things I was worried about were Elasticsearch’s performance as a database and how memory-hungry it’d be. Reading the manual allowed me to avoid some memory-use pitfalls and gave me insights on the way MongoDB and Elasticsearch compare (see details below).

                                                Switching from MongoDB to Elasticsearch as a simple database was pretty straightforward. Both are document-based, so there were no changes required to the data models, but I did use the opportunity to fix some issues. For example, I changed the sitemap generation process from dynamic to static to avoid having to scroll through the entire dataset to fetch deep sitemap pages. To support BCRecommender’s feature of browsing through random fans, I replaced MongoDB’s somewhat-hacky approach of returning random results with Elasticsearch’s cleaner method. As the webapp is implemented in Python, I originally used the elasticsearch-dsl package, but found it too hard to debug queries (e.g., figuring out how to rank results randomly was a bit of a nightmare). Instead, I ended up using the elasticsearch-py package, which is only a thin wrapper around the Elasticsearch API. This approach yields code that doesn’t look very Pythonic – rather than following the Zen of Python’s flat is better than nested aphorism, the API follows the more Java-esque belief of you can never have enough nesting (see image below for example). However, I prefer overly-nested structures that I can debug to flat code that doesn’t work. I may try using the DSL again in the future, once I’ve gained more experience with Elasticsearch.

                                                elasticsearch is nesty
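                                                To give a sense of the nesting, here is a minimal sketch of what a random-ranking query might look like through elasticsearch-py, using a function_score query with random_score (Elasticsearch’s cleaner method mentioned above). The index name, field names, and seed are illustrative assumptions, not BCRecommender’s actual schema or query.

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local single-node cluster

# Score all fans randomly so each page load can surface different profiles.
# Index and field names are made up for illustration.
random_fans_query = {
    "query": {
        "function_score": {
            "query": {"match_all": {}},
            "random_score": {"seed": 42},  # a fixed seed keeps pagination consistent
        }
    },
    "size": 10,
}

response = es.search(index="fans", body=random_fans_query)
usernames = [hit["_source"]["username"] for hit in response["hits"]["hits"]]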

                                                As mentioned, one of my worries was that I would have to increase the amount of memory allocated to the machine where Elasticsearch runs. Since BCRecommender is a fairly low-budget project, I’m willing to sacrifice high availability to save a bit on operational costs. Therefore, the webapp and its data store run on the same DigitalOcean instance, which is enough to happily serve the current amount of traffic (around one request per second). By default, Elasticsearch indexes all the fields, and even includes an extra indexed _all field that is a concatenation of all string fields in a document. While indexing everything may be convenient, it wasn’t necessary for the first stage. Choosing the minimal index settings allowed me to keep using the same instance size as before (1GB RAM and 30GB SSD). In fact, due to the switch to static sitemaps and the removal of MongoDB’s random attribute hack, fewer indexes were required after the change.
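                                                As a rough illustration of what minimal index settings can mean, the sketch below (in the Elasticsearch 1.x mapping syntax that was current at the time) disables the _all field, turns off dynamic field indexing, and indexes only a lookup key. The index, type, and field names are assumptions for illustration, not BCRecommender’s actual mapping.

from elasticsearch import Elasticsearch

es = Elasticsearch()

# Minimal mapping for the "plain database" stage: no _all field, no dynamic
# indexing of unknown fields, and only the lookup key indexed as an exact value.
minimal_settings = {
    "mappings": {
        "tralbum": {
            "_all": {"enabled": False},
            "dynamic": False,  # unmapped fields are kept in _source but not indexed
            "properties": {
                "url": {"type": "string", "index": "not_analyzed"},
            },
        }
    }
}

es.indices.create(index="bcrecommender", body=minimal_settings)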

                                                Once I had all the code converted and working on my local Vagrant environment, it was time to deploy. The deployment was fairly straightforward and required no downtime, as I simply provisioned a new instance and switched over the floating IP once it was all tested and ready to go. I monitored response time and memory use closely and everything seemed to be working just fine – similarly to MongoDB. After a week of monitoring, it was time to take the next step and enable advanced search.

                                                Enabling full-text search is where things got interesting. This phase required adding a search result page (previously users were redirected to the queried page if it was found), and reindexing the data. For this phase, I tried to keep things as simple as possible, and just indexed the string fields (tags, artist, and title) using the standard analyser. I did some manual testing of search results based on common queries, and played a bit with improving precision and recall. Perhaps the most important tweak was allowing an item’s activity level to influence the ranking. For each tralbum, the activity level is the number of fans that have the tralbum in their collection, and for each fan, it is the size of the collection. For example, when searching for amanda, the top result is the fan with username amanda, followed by tralbums by the popular Amanda Palmer. Before I added the consideration of activity level, all tralbums and fans that contained the word amanda had the same ranking.
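                                                For instance, a query along the following lines lets an item’s activity level boost its relevance score via function_score with field_value_factor. The field names follow the description above, but the overall structure, the log1p modifier, and the index name are a hedged sketch rather than the production query.

from elasticsearch import Elasticsearch

es = Elasticsearch()

# Full-text match on the string fields, with the activity level folded into
# the score (log1p dampens the effect of very popular items).
search_query = {
    "query": {
        "function_score": {
            "query": {
                "multi_match": {
                    "query": "amanda",
                    "fields": ["tags", "artist", "title"],
                }
            },
            "field_value_factor": {
                "field": "activity_level",
                "modifier": "log1p",
            },
        }
    },
    "size": 10,
}

results = es.search(index="bcrecommender", body=search_query)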



                                                  \ No newline at end of file diff --git a/2015/11/23/the-hardest-parts-of-data-science/index.html b/2015/11/23/the-hardest-parts-of-data-science/index.html index b5e429a6a..efab9e429 100644 --- a/2015/11/23/the-hardest-parts-of-data-science/index.html +++ b/2015/11/23/the-hardest-parts-of-data-science/index.html @@ -1,5 +1,5 @@ The hardest parts of data science | Yanir Seroussi | Data & AI for Startup Impact -

                                                  The hardest parts of data science

                                                  Contrary to common belief, the hardest part of data science isn’t building an accurate model or obtaining good, clean data. It is much harder to define feasible problems and come up with reasonable ways of measuring solutions. This post discusses some examples of these issues and how they can be addressed.

                                                  The not-so-hard parts

                                                  Before discussing the hardest parts of data science, it’s worth quickly addressing the two main contenders: model fitting and data collection/cleaning.

                                                  Model fitting is seen by some as particularly hard, or as real data science. This belief is fuelled in part by the success of Kaggle, which calls itself the home of data science. Most Kaggle competitions are focused on model fitting: Participants are given a well-defined problem, a dataset, and a measure to optimise, and they compete to produce the most accurate model. Coupling Kaggle’s excellent marketing with their competition setup leads many people to believe that data science is all about fitting models. In reality, building reasonably-accurate models is not that hard, because many model-building phases can easily be automated. Indeed, there are many companies that offer model fitting as a service (e.g., Microsoft, Amazon, Google and others). Even Ben Hamner, CTO of Kaggle, has said that he is “surprised at the number of ‘black box machine learning in the cloud’ services emerging: model fitting is easy. Problem definition and data collection are not.”

                                                  Ben Hamner tweet on black box ML in the cloud

                                                  Data collection/cleaning is the essential part that everyone loves to hate. DJ Patil (US Chief Data Scientist) is quoted as saying that “the hardest part of data science is getting good, clean data. Cleaning data is often 80% of the work.” While I agree that collecting data and cleaning it can be a lot of work, I don’t think of this part as particularly hard. It’s definitely important and may require careful planning, but in many cases it just isn’t very challenging. In addition, it is often the case that the data is already given, or is collected using previously-developed methods.

                                                  Problem definition is hard

                                                  There are many reasons why problem definition can be hard. It is sometimes due to stakeholders who don’t know what they want, and expect data scientists to solve all their data problems (either real or imagined). This type of situation is summarised by the following Dilbert strip. It is best handled by cleverly managing stakeholder expectations, while steering them towards better-defined problems.

                                                  Yanir Seroussi
                                                  2015-12-08 00:55:43

                                                  Thank you for your comment, Arthur.

                                                  I don’t think that replacing “a government” with Paris and pyramid with “Eiffel Tower” makes Shostak’s argument any less convincing. The point still stands that the GDP measure is flawed because it can easily be manipulated through activities that don’t contribute positively to society. Do you think that GDP growth is worth focusing on?

                                                  As to the Berkun quote, I suggest you read more of his writings before dismissing it. He is anything but superstitious and an anti-intellectual. Drawing conclusions from data, pondering the evidence, challenging the null hypothesis, and taking a reasoned position based on the best available evidence doesn’t contradict depending on your intuition for deciding how to interpret and then apply the data. Intuition is often based on experience and data that hasn’t been formally captured. For example, in Bayesian modelling you’re free to use your intuition when setting priors, but the weight of these priors becomes smaller as evidence accumulates.

                                                  Thank you Yansir! I found this post extremely enlightening. I was starting to question to what extent the value of a data scientist would be affected by the emergence of automated data cleaning and model building tools. It makes far more sense that the true value of the role is in the experimental design as opposed to applying algorithms.



                                                  \ No newline at end of file diff --git a/2015/12/08/this-holiday-season-give-me-real-insights/index.html b/2015/12/08/this-holiday-season-give-me-real-insights/index.html index f7781deeb..79c624c45 100644 --- a/2015/12/08/this-holiday-season-give-me-real-insights/index.html +++ b/2015/12/08/this-holiday-season-give-me-real-insights/index.html @@ -1,5 +1,5 @@ This holiday season, give me real insights | Yanir Seroussi | Data & AI for Startup Impact -


                                                  This holiday season, give me real insights

                                                  Merriam-Webster defines an insight as an understanding of the true nature of something. Many companies seem to define an insight as any piece of data or information, which I would call a pseudo-insight. This post surveys some examples of pseudo-insights, and discusses how these can be built upon to provide real insights.

                                                  Exhibit A: WordPress stats

                                                  This website is hosted on wordpress.com. I’m generally happy with WordPress – though it’s not as exciting and shiny as newer competitors, it is rock-solid and very feature-rich. An example of a great WordPress feature is the new stats area (available under wordpress.com/stats if you have a WordPress website). This area includes an insights page, which is full of prime examples of pseudo-insights.

                                                  At the top of the insights page, there is a visualisation of posting activity. As the image below shows, this isn’t very interesting for websites like mine. I already know that I post irregularly, because writing a blog post is time-consuming. I suspect that this visualisation isn’t very useful even for more active multi-author blogs, as it is essentially just a different way of displaying the raw data of post dates. Without joining this data with other information, we won’t gain a better understanding of how the blog is performing and why it performs the way it does.

                                                  WordPress insights: posting activity

                                                  An attempt to extract more meaningful insights from posting times appears further down the page, in the form of a widget that tells you the most popular day and hour. The help text says: “This is the day and hour when you have been getting the most Views on average. The best timing for publishing a post may be around this period.” Unfortunately, I’m pretty certain that this isn’t true in my case. Monday happens to be the most popular day because that’s when I published two of my most popular posts, and I usually try to spread the word about a new post as soon as I publish it. Further, blog posts can become popular a long time after publication, so it is unlikely that the best timing for publishing a post is around Monday 3pm.
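                                                  To make the pseudo-insight concrete, here is a hypothetical reconstruction of how such a widget might arrive at its number from a raw view log. The timestamps are made up and this is not WordPress’s actual implementation – the point is that a couple of viral Monday-afternoon posts are enough to dominate the average.

import pandas as pd

# Hypothetical per-view log; in reality this would come from the analytics store.
views = pd.DataFrame({
    "timestamp": pd.to_datetime([
        "2015-11-23 15:05", "2015-11-23 15:20", "2015-11-23 15:40",  # one viral Monday post
        "2015-11-30 15:10", "2015-12-01 09:00", "2015-12-03 21:30",
    ])
})

views["day"] = views["timestamp"].dt.day_name()
views["hour"] = views["timestamp"].dt.hour

# Average views per (day, hour) slot: total views in the slot divided by the
# number of weeks observed, which is roughly what "most Views on average" implies.
weeks_observed = views["timestamp"].dt.to_period("W").nunique()
avg_views = views.groupby(["day", "hour"]).size() / weeks_observed
print(avg_views.sort_values(ascending=False).head(1))  # Monday, 15:00 dominates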



                                                    Nice post. Mostly agree. Automated insights are hard to automate though, but we (the WordPress.com Data Team) are working on it.

                                                    Some of the things we’ve found that have the biggest impact on successful blogging are:

                                                    • Turning on publicize so your posts are pushed out to various social media channels
                                                    • Regularly publishing. Doesn’t have to be daily, but does need to be regular. We still don’t understand how the periodicity plays into this.
                                                    • Images in posts are correlated with more traffic.

                                                    There’s still a lot to learn here. Interested in helping? https://automattic.com/work-with-us/data-wrangler/ :)

                                                    Thanks Greg! All those factors make sense. Personally, I prefer sharing posts manually to turning on Publicize, but I suppose it has the same effect. My guess is that one of the reasons why images are important is that having at least one image makes posts stick out when shared on social media.

                                                    By the way, I did apply for the data wrangler position a couple of months ago but never heard back. It’s probably too late now, as I have a different position (and a few other options) lined up when I get home from vacation next month :)

                                                    Hey Yanir, that’s embarrassing. :)

                                                    Sorry we haven’t gotten back to you yet. I do see you in our queue. It’s been a busy two months so we’re a bit backed up, but getting back on track in the next week or two. Certainly understand if that doesn’t fit into your own timeline. Sorry if that ends up being the case.

                                                    \ No newline at end of file diff --git a/2016/01/24/the-joys-of-offline-data-collection/index.html b/2016/01/24/the-joys-of-offline-data-collection/index.html index f4aafe813..5189b303a 100644 --- a/2016/01/24/the-joys-of-offline-data-collection/index.html +++ b/2016/01/24/the-joys-of-offline-data-collection/index.html @@ -1,5 +1,5 @@ The joys of offline data collection | Yanir Seroussi | Data & AI for Startup Impact -

                                                    The joys of offline data collection

                                                    Many modern data scientists don’t get to experience data collection in the offline world. Recently, I spent a month sailing down the northern Great Barrier Reef, collecting data for the Reef Life Survey project. In addition to being a great diving experience, the trip helped me obtain general insights on data collection and machine learning, which are shared in this article.

                                                    The Reef Life Survey project

                                                    Reef Life Survey (RLS) is a citizen scientist project, led by a team from the University of Tasmania. The data collected by RLS volunteers is freely available on the RLS website, and has been used for producing various reports and scientific publications. An RLS survey is performed along a 50 metre tape, which is laid at a constant depth following a reef’s contour. After laying the tape, one diver takes photos of the bottom at 2.5 metre intervals along the transect line. These photos are automatically analysed to classify the type of substrate or growth (e.g., hard coral or sand). Divers then complete two swims along each side of the transect. On the first swim (method 1), divers record all the fish species and large swimming animals found in a 5 metre corridor from the line. The second swim (method 2) requires keeping closer to the bottom and looking under ledges and vegetation in a 1 metre corridor from the line, targeting invertebrates and cryptic animals. The RLS manual includes all the details on how surveys are performed.

                                                    Performing RLS surveys is not a trivial task. In the tropics, it is not uncommon to record around 100 fish species on method 1. The scientists running the project are very conscious of the importance of obtaining high-quality data, so training to become an RLS volunteer takes considerable effort and dedication. The process generally consists of doing surveys together with an experienced RLS diver, and comparing the data after each dive. Once the trainee’s data matches that of the experienced RLSer, they are considered good enough to perform surveys independently. However, retraining is often required when surveying new ecoregions (e.g., an RLSer trained in Sydney needs further training to survey the Great Barrier Reef).

                                                    RLS requires a lot of hard work, but there are many reasons why it’s worth the effort. As someone who cares about marine conservation, I like the fact that RLS dives yield useful data that is used to drive environmental management decisions. As a scuba diver, I enjoy the opportunity to dive places that are rarely dived and the enhanced knowledge of the marine environment – doing surveys makes me notice things that I would otherwise overlook. Finally, as a data scientist, I find the exposure to the work of marine scientists very educational.

                                                    Pre-training and thoughts on supervised learning

                                                    Doing surveys in the tropics is a completely different story from surveying temperate reefs, due to the substantially higher diversity and abundance of marine creatures. Producing high-quality results requires being able to identify most creatures underwater, while doing the survey. It is possible to write down descriptions and take photos of unidentified species, but doing this for a large number of species is impractical.

                                                    Training the neural network in my head to classify tropical fish by species was an interesting experience. The approach that worked best was making flashcards using reveal.js, photos scraped from various sources, and past survey data. As the image below shows, each flashcard consists of a single photo, and pressing the down arrow reveals the name of the creature. With some basic JavaScript, I made the presentation select a different subset of photos on each load. Originally, I tried to learn all the 1000+ species that were previously recorded in the northern Great Barrier Reef, but this proved to be too hard – I realised that a better strategy was needed. The strategy that I chose was to focus on the most frequently-recorded species: I started by memorising the most frequent ones (e.g., those recorded on more than 50% of surveys), and gradually made it more challenging by decreasing the frequency threshold (e.g., to 25% in 5% steps). This proved to be pretty effective – by the time I started diving I could identify about 50-100 species underwater, even though I had mostly been using static images. It’d be interesting to know whether this kind of approach would be effective in training neural networks (or other batch-trained models) in certain scenarios – spend a few epochs training with instances from a subset of the classes, and gradually increase the number of considered classes. This may be effective when errors on certain classes are more important than others, and may yield different results from simply weighting classes or instances. Please let me know if you know of anyone who has experimented with this idea (update: gwern from Reddit pointed me to the paper Curriculum Learning by Bengio et al., which discusses this idea).
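                                                     For concreteness, the selection strategy sketched below mirrors the frequency-threshold schedule described above. The species-to-frequency mapping and the deck size are illustrative assumptions – the real numbers come from past RLS survey data and cover 1000+ species.

import random

# Fraction of past surveys on which each species was recorded (illustrative values).
species_frequency = {
    "Chromis viridis": 0.62,
    "Labroides dimidiatus": 0.71,
    "Amphiprion akindynos": 0.45,
    "Chaetodon auriga": 0.33,
}

def flashcard_deck(min_frequency, deck_size=20):
    """Pick a random study deck from species recorded on at least min_frequency of surveys."""
    eligible = [name for name, freq in species_frequency.items() if freq >= min_frequency]
    random.shuffle(eligible)
    return eligible[:deck_size]

# Start with the most commonly recorded species, then lower the bar in 5% steps.
for threshold in (0.50, 0.45, 0.40, 0.35, 0.30, 0.25):
    deck = flashcard_deck(threshold)
    # ...feed the deck into the flashcard presentation and study it...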




                                                      \ No newline at end of file diff --git a/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html b/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html index 8b91b34e5..2789338b3 100644 --- a/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html +++ b/2016/02/14/why-you-should-stop-worrying-about-deep-learning-and-deepen-your-understanding-of-causality-instead/index.html @@ -1,5 +1,5 @@ Why you should stop worrying about deep learning and deepen your understanding of causality instead | Yanir Seroussi | Data & AI for Startup Impact -

                                                      Why you should stop worrying about deep learning and deepen your understanding of causality instead

                                                      Everywhere you go these days, you hear about deep learning’s impressive advancements. New deep learning libraries, tools, and products get announced on a regular basis, making the average data scientist feel like they’re missing out if they don’t hop on the deep learning bandwagon. However, as Kamil Bartocha put it in his post The Inconvenient Truth About Data Science, 95% of tasks do not require deep learning. This is obviously a made up number, but it’s probably an accurate representation of the everyday reality of many data scientists. This post discusses an often-overlooked area of study that is of much higher relevance to most data scientists than deep learning: causality.

                                                      Causality is everywhere

                                                      An understanding of cause and effect is something that is not unique to humans. For example, the many videos of cats knocking things off tables appear to exemplify experimentation by animals. If you are not familiar with such videos, it can easily be fixed. The thing to notice is that cats appear genuinely curious about what happens when they push an object. And they tend to repeat the experiment to verify that if you push something off, it falls to the ground.

                                                      Humans rely on much more complex causal analysis than that done by cats – an understanding of the long-term effects of one’s actions is crucial to survival. Science, as defined by Wikipedia, is a systematic enterprise that creates, builds and organizes knowledge in the form of testable explanations and predictions about the universe. Causal analysis is key to producing explanations and predictions that are valid and sound, which is why understanding causality is so important to data scientists, traditional scientists, and all humans.

                                                      What is causality?

                                                      It is surprisingly hard to define causality. Just like cats, we all have an intuitive sense of what causality is, but things get complicated on deeper inspection. For example, few people would disagree with the statement that smoking causes cancer. But does it cause cancer immediately? Would smoking a few cigarettes today and never again cause cancer? Do all smokers develop cancer eventually? What about light smokers who live in areas with heavy air pollution?

                                                      Samantha Kleinberg summarises it very well in her book, Why: A Guide to Finding and Using Causes:

                                                      While most definitions of causality are based on Hume’s work, none of the ones we can come up with cover all possible cases and each one has counterexamples another does not. For instance, a medication may lead to side effects in only a small fraction of users (so we can’t assume that a cause will always produce an effect), and seat belts normally prevent death but can cause it in some car accidents (so we need to allow for factors that can have mixed producer/preventer roles depending on context).

                                                      The question often boils down to whether we should see causes as a fundamental building block or force of the world (that can’t be further reduced to any other laws), or if this structure is something we impose. As with nearly every facet of causality, there is disagreement on this point (and even disagreement about whether particular theories are compatible with this notion, which is called causal realism). Some have felt that causes are so hard to find as for the search to be hopeless and, further, that once we have some physical laws, those are more useful than causes anyway. That is, “causes” may be a mere shorthand for things like triggers, pushes, repels, prevents, and so on, rather than a fundamental notion.

                                                      It is somewhat surprising, given how central the idea of causality is to our daily lives, but there is simply no unified philosophical theory of what causes are, and no single foolproof computational method for finding them with absolute certainty. What makes this even more challenging is that, depending on one’s definition of causality, different factors may be identified as causes in the same situation, and it may not be clear what the ground truth is.

                                                      Why study causality now?

                                                      While it’s hard to conclusively prove, it seems to me like interest in formal causal analysis has increased in recent years. My hypothesis is that it’s just a natural progression along the levels of data’s hierarchy of needs. At the start of the big data boom, people were mostly concerned with storing and processing large amounts of data (e.g., using Hadoop, Elasticsearch, or your favourite NoSQL database). Just having your data flowing through pipelines is nice, but not very useful, so the focus switched to reporting and visualisation to extract insights about what happened (commonly known as business intelligence). While having a good picture of what happened is great, it isn’t enough – you can make better decisions if you can predict what’s going to happen, so the focus switched again to predictive analytics. Those who are familiar with predictive analytics know that models often end up relying on correlations between the features and the predicted labels. Using such models without considering the meaning of the variables can lead us to erroneous conclusions, and potentially harmful interventions. For example, based on the following graph we may make a recommendation that the US government decrease its spending on science to reduce the number of suicides by hanging.


                                                      brandonrohrer
                                                      2016-08-02 00:23:52

                                                      Hi Yanir, Thanks for addressing this huge gap in how we interpret data! Models that handle causality well will take us from finding cats in images to solving more subtle problems in robotics and adaptive systems. The topic deserves more attention than it gets. Thanks for keeping it in the spotlight. Brandon
                                                      My understanding is that causality is always the central focus of science. Machine learning/data mining is a relatively more recent thing, and its greatest benefit lies in solving complex prediction problems. But I think causality study and data mining can help each other. For the purpose of understanding causality, data mining can be used in an exploratory way that helps scientists to generate theories (I think it is possible to study the features (i.e., hidden units) extracted by deep learning networks), then experiments, longitudinal studies and traditional stats can be used to test the theories. For the purpose of solving practical prediction problems, theories developed from causality studies can help identify useful features as input to the machine learning algorithms. In fact, this was done all the times especially before deep learning became popular. I agree that scientists should improve their understanding of causality, but picking up new technologies that take advantage of modern computers and large data won’t really hurt.

                                                      “According to Nicolas Malebranche and other seventeenth century Cartesian occasionalists, what we actually call causes are really no more than -occasions on which, in accordance with his own laws, God acts to bring about the effect. If one were to replace the notion of God in the occasionalism doctrine by the notion of a mechanism, then a modern (ormechanical) occasionalist could assert that what we actually call causes are no more than occasions on which a mechanism acts to bring about the effect. "

                                                      More info here:http://fqxi.org/community/forum/topic/846 and here http://www.digitalcosmology.com/Blog/beyond-intelligent-design/

                                                      I really enjoyed this post. I’m obviously familiar with the causation vs correlation argument but never stopped to think about what would define causation. I like the example about the seat belt (an argument I often have with my wife). Thanks for the post!

                                                      Hi Yanir,

                                                      Great post. If you are serious about exploring causality further: check out Transfer Entropy and related information theory inspired measures: I assure you, you will be amazed.

                                                      Thanks, David! I’ll check it out.


                                                      \ No newline at end of file diff --git a/2016/03/20/the-rise-of-greedy-robots/index.html b/2016/03/20/the-rise-of-greedy-robots/index.html index 45fdc3225..caf71d776 100644 --- a/2016/03/20/the-rise-of-greedy-robots/index.html +++ b/2016/03/20/the-rise-of-greedy-robots/index.html @@ -1,9 +1,11 @@ The rise of greedy robots | Yanir Seroussi | Data & AI for Startup Impact -

                                                      The rise of greedy robots

                                                      Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. Some have even called for regulation of machine intelligence before it’s too late. My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.

                                                      Machine intelligence or artificial intelligence?

                                                      The problem with talking about artificial intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won’t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently can’t even fully simulate C. elegans, a 1mm worm with 302 neurons. However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as the ability to learn or understand things or to deal with new or difficult situations. Unlike artificial intelligence, there is no doubt that machine intelligence already exists.

                                                      Airplanes provide a famous example: we don’t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can’t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.

                                                      Who are those greedy robots?

                                                      Greed is defined as a selfish desire to have more of something (especially money). It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. Greedy organisations “live” among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.

                                                      Humans running for-profit organisations have little choice but to play by their rules. For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.

                                                      The relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the words of Jeffrey Hammerbacher: The best minds of my generation are thinking about how to make people click ads. Hammerbacher, an early Facebook employee, was referring to Facebook’s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook’s revenue. Indeed, Facebook has hired Yann LeCun (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun’s appointment will undoubtedly result in general research advancements, Facebook’s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.

                                                      Greedy robots are all around us. Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. Retail companies like Amazon use machine intelligence to mine through people’s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn’t take much of a leap to imagine a world where greedy robots are even more dominant. Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.

                                                      Will robots take your job?

                                                      The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, which are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video Humans Need Not Apply. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.

                                                      Despite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn’t necessarily benefit society. This isn’t a new insight, as the following quotes demonstrate:

                                                      • “Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. […] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.” – Henry David Thoreau, Walden (1854)
                                                      • “I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. […] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. […] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.” – Bertrand Russell, In Praise of Idleness (1932)
                                                      • “In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century’s end that countries like Great Britain or the United States would achieve a 15-hour work week. There’s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. Yet virtually no one talks about it.” – David Graeber, On the Phenomenon of Bullshit Jobs (2013)

                                                      This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking summarised it well recently:

                                                      If machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.

                                                      Where to from here?

                                                      Many people believe that the existence of powerful greedy entities is good for society. Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. Clearly, some regulation of greedy entities is required to obtain the best results for society.

                                                      With machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. The collective choice is ours to make, but is far from simple.

                                                      Subscribe +

                                                      The rise of greedy robots

                                                      Given the impressive advancement of machine intelligence in recent years, many people have been speculating on what the future holds when it comes to the power and roles of robots in our society. Some have even called for regulation of machine intelligence before it’s too late. My take on this issue is that there is no need to speculate – machine intelligence is already here, with greedy robots already dominating our lives.

                                                      Machine intelligence or artificial intelligence?

                                                      The problem with talking about artificial intelligence is that it creates an inflated expectation of machines that would be completely human-like – we won’t have true artificial intelligence until we can create machines that are indistinguishable from humans. While the goal of mimicking human intelligence is certainly interesting, it is clear that we are very far from achieving it. We currently can’t even fully simulate C. elegans, a 1mm worm with 302 neurons. However, we do have machines that can perform tasks that require intelligence, where intelligence is defined as the ability to learn or understand things or to deal with new or difficult situations. Unlike artificial intelligence, there is no doubt that machine intelligence already exists.

                                                      Airplanes provide a famous example: we don’t commonly think of them as performing artificial flight – they are machines that fly faster than any bird. Likewise, computers are super-intelligent machines. They can perform calculations that humans can’t, store and recall enormous amounts of information, translate text, play Go, drive cars, and much more – all without requiring rest or food. The robots are here, and they are becoming increasingly useful and powerful.

                                                      Who are those greedy robots?

                                                      Greed is defined as a selfish desire to have more of something (especially money). It is generally seen as a negative trait in humans. However, we have been cultivating an environment where greedy entities – for-profit organisations – thrive. The primary goal of for-profit organisations is to generate profit for their shareholders. If these organisations were human, they would be seen as the embodiment of greed, as they are focused on making money and little else. Greedy organisations “live” among us and have been enjoying a plethora of legal rights and protections for hundreds of years. These entities, which were formed and shaped by humans, now form and shape human lives.

                                                      Humans running for-profit organisations have little choice but to play by their rules. For example, many people acknowledge that corporate tax avoidance is morally wrong, as revenue from taxes supports the infrastructure and society that enable corporate profits. However, any executive of a public company who refuses to do everything they legally can to minimise their tax bill is likely to lose their job. Despite being separate from the greedy organisations we run, humans have to act greedily to effectively serve their employers.

                                                      The relationship between greedy organisations and greedy robots is clear. Much of the funding that goes into machine intelligence research comes from for-profit organisations, with the end goal of producing profit for these entities. In the words of Jeffrey Hammerbacher: The best minds of my generation are thinking about how to make people click ads. Hammerbacher, an early Facebook employee, was referring to Facebook’s business model, where considerable resources are dedicated to getting people to engage with advertising – the main driver of Facebook’s revenue. Indeed, Facebook has hired Yann LeCun (a prominent machine intelligence researcher) to head its artificial intelligence research efforts. While LeCun’s appointment will undoubtedly result in general research advancements, Facebook’s motivation is clear – they see machine intelligence as a key driver of future profits. They, and other companies, use machine intelligence to build greedy robots, whose sole goal is to increase profits.

                                                      Greedy robots are all around us. Advertising-driven companies like Facebook and Google use sophisticated algorithms to get people to click on ads. Retail companies like Amazon use machine intelligence to mine through people’s shopping history and generate product recommendations. Banks and mutual funds utilise algorithmic trading to drive their investments. None of this is science fiction, and it doesn’t take much of a leap to imagine a world where greedy robots are even more dominant. Just like we have allowed greedy legal entities to dominate our world and shape our lives, we are allowing greedy robots to do the same, just more efficiently and pervasively.

                                                      Will robots take your job?

The growing range of machine intelligence capabilities gives rise to the question of whether robots are going to take over human jobs. One salient example is that of self-driving cars, which are projected to render millions of professional drivers obsolete in the next few decades. The potential impact of machine intelligence on jobs was summarised very well by CGP Grey in his video Humans Need Not Apply. The main message of the video is that machines will soon be able to perform any job better or more cost-effectively than any human, thereby making humans unemployable for economic reasons. The video ends with a call to society to consider how to deal with a future where there are simply no jobs for a large part of the population.

                                                      Despite all the technological advancements since the start of the industrial revolution, the prevailing mode of wealth distribution remains paid labour, i.e., jobs. The implication of this is that much of the work we do is unnecessary or harmful – people work because they have no other option, but their work doesn’t necessarily benefit society. This isn’t a new insight, as the following quotes demonstrate:

                                                      • “Most men appear never to have considered what a house is, and are actually though needlessly poor all their lives because they think that they must have such a one as their neighbors have. […] For more than five years I maintained myself thus solely by the labor of my hands, and I found that, by working about six weeks in a year, I could meet all the expenses of living.” – Henry David Thoreau, Walden (1854)
                                                      • “I think that there is far too much work done in the world, that immense harm is caused by the belief that work is virtuous, and that what needs to be preached in modern industrial countries is quite different from what always has been preached. […] Modern technique has made it possible to diminish enormously the amount of labor required to secure the necessaries of life for everyone. […] If, at the end of the war, the scientific organization, which had been created in order to liberate men for fighting and munition work, had been preserved, and the hours of the week had been cut down to four, all would have been well. Instead of that the old chaos was restored, those whose work was demanded were made to work long hours, and the rest were left to starve as unemployed.” – Bertrand Russell, In Praise of Idleness (1932)
                                                      • “In the year 1930, John Maynard Keynes predicted that technology would have advanced sufficiently by century’s end that countries like Great Britain or the United States would achieve a 15-hour work week. There’s every reason to believe he was right. In technological terms, we are quite capable of this. And yet it didn’t happen. Instead, technology has been marshaled, if anything, to figure out ways to make us all work more. In order to achieve this, jobs have had to be created that are, effectively, pointless. Huge swathes of people, in Europe and North America in particular, spend their entire working lives performing tasks they secretly believe do not really need to be performed. The moral and spiritual damage that comes from this situation is profound. It is a scar across our collective soul. Yet virtually no one talks about it.” – David Graeber, On the Phenomenon of Bullshit Jobs (2013)

                                                      This leads to the conclusion that we are unlikely to experience the utopian future in which intelligent machines do all our work, leaving us ample time for leisure. Yes, people will lose their jobs. But it is not unlikely that new unnecessary jobs will be invented to keep people busy, or worse, many people will simply be unemployed and will not get to enjoy the wealth provided by technology. Stephen Hawking summarised it well recently:

                                                      If machines produce everything we need, the outcome will depend on how things are distributed. Everyone can enjoy a life of luxurious leisure if the machine-produced wealth is shared, or most people can end up miserably poor if the machine-owners successfully lobby against wealth redistribution. So far, the trend seems to be toward the second option, with technology driving ever-increasing inequality.

                                                      Where to from here?

                                                      Many people believe that the existence of powerful greedy entities is good for society. Indeed, there is no doubt that we owe many beneficial technological breakthroughs to competition between for-profit companies. However, a single-minded focus on profit means that in many cases companies do what they can to reduce their responsibility for harmful side-effects of their activities. Examples include environmental pollution, multinational tax evasion, and health effects of products like tobacco and junk food. As history shows us, in truly unregulated markets, companies would happily utilise slavery and child labour to reduce their costs. Clearly, some regulation of greedy entities is required to obtain the best results for society.

                                                      With machine intelligence becoming increasingly powerful every day, some people think that to produce the best outcomes, we just need to wait for robots to be intelligent enough to completely run our lives. However, as anyone who has actually built intelligent systems knows, the outputs of such systems are strongly dependent on the inputs and goals set by system designers. Machine intelligence is just a tool – a very powerful tool. Like nuclear energy, we can use it to improve our lives, or we can use it to obliterate everything around us. The collective choice is ours to make, but is far from simple.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

Yes, the world has always been greedy. This reminds me of Dijkstra’s greedy algorithm, which is used to find the shortest route. There are a lot of “steps” for an organization to become profitable, and greediness tries to find the most cost-efficient way to achieve the goal of being profitable. Let us assume that each road is a railway and trains traverse it to their destinations. Each decision path will sacrifice other trains waiting to reach their destination. If human stupidity does not prevail again, our scarce resources will ultimately be reduced by economics to one element only: time. What do we want humans to spend their time on?

Greediness will always thrive in the sense that society sees it as a trait of growth. War, which today’s society ultimately condemns, was viewed in the past as one way for a nation to gain growth. Growth was limited to the domain of a specific country, and the rest were treated as enemies. The end of war was enforced by the right not to interfere with another’s property, so people had to find other means to gain growth. Thus, the concept of greediness took hold. Greediness is using emotional appeal to steer other people’s habits towards a specific domain.

The problem with greediness is whether people evaluate the emotional appeal as something positive or negative for themselves and society. Getting that right depends on two things:

1. The effort people put into acquiring multi-disciplinary knowledge across domains
2. The effort people put into applying that knowledge to their daily decisions.

Most consumers are passive on the above two points due to societal constraints. More specifically, if people focus on learning from other domains, they risk underperforming in their main domain, putting them at a competitive disadvantage in their career prospects. In Bayesian terms, this limited domain knowledge leaves people with low confidence on many topics, allowing others to influence their decision making. I think that low confidence is the main cause of the rising trend of people’s decisions relying on push messages rather than pull messages. To this day I haven’t seen sophisticated push messages where the user has a choice of what to see, except in the onboarding phase of a product. In addition, the onboarding phase only gives reasons why you should use the product; there is never a phase covering reasons not to use it. Imagine if a product could say to the user, based on the personalized information it gathered: “Hey, you shouldn’t be using me in this situation. Use Bob instead, it will make your life easier” (this will become possible as data evolves). The problem is that a product will never point to alternative domains that can solve a user’s individual problem better, because there is no commission fee for recommending a user to another domain with qualitative information. This means a domain, which consists of a set of employees, has no interest in researching alternative domains that can solve a specific problem better, because the current system offers no platform to reward it with a commission fee. Instead, the only way for a specific domain to thrive is by copying others’ ideas or owning them through acquisitions. This greatly demotivates innovation. So far, it is only conscientious people – such as start-up entrepreneurs who leave old positions, and people who contribute to open source – who go the extra mile to innovate. My whole hypothesis is that our natural instincts are those of a machine learner, and our only task is to make progress on everything, even our own personal lives.

If those two points happen, the rule of greediness will be overruled. People will consciously evaluate whether an emotional appeal makes sense in the big picture, because their jobs will force them to connect their domain with alternatives to earn a commission fee. That will give them more robust interdisciplinary domain knowledge, making them more confident in pulling information about other domains they get to know, rather than having it pushed at them. Passive consumers will become less passive. Value was once based on war, then on greediness; later it will be all about evaluation.

Your point about people doing less work implies an even more passive society than we already have. I do not propose that, as it would make our situation worse. The problem is the type of tasks people do, not the tasks themselves. People need to do tasks that advance our society instead of being passive, as in the game of Civilization. It is the only way to be happy and have a purpose. Just as the machine learning instances we create have an end goal, we humans are machine learners with a purpose: to handle any situation that becomes a problem. Our starter pack of problems to solve was human suffering, hunger, and death. Now that these press on us less, we have to find motivation beyond extrinsic rewards.


                                                        Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions

                                                        Background: I have previously written about the need for real insights that address the why behind events, not only the what and how. This was followed by a fairly popular post on causality, which was heavily influenced by Samantha Kleinberg's book Why: A Guide to Finding and Using Causes. This post continues my exploration of the field, and is primarily based on Kleinberg's previous book: Causality, Probability, and Time.

                                                        The study of causality and causal inference is central to science in general and data science in particular. Being able to distinguish between correlation and causation is key to designing effective interventions in business, public policy, medicine, and many other fields. There are quite a few approaches to inferring causal relationships from data. In this post, I discuss some aspects of Judea Pearl’s graphical modelling approach, and how its limitations are addressed in recent work by Samantha Kleinberg. I then finish with a brief survey of the Bradford Hill criteria and their applicability to a key limitation of all causal inference methods: The need for untested assumptions.

Judea Pearl

                                                        Overcoming my Pearl bias

                                                        First, I must disclose that I have a personal bias in favour of Pearl’s work. While I’ve never met him, Pearl is my academic grandfather – he was the PhD advisor of my main PhD supervisor (Ingrid Zukerman). My first serious exposure to his work was through a Sydney reading group, where we discussed parts of Pearl’s approach to causal inference. Recently, I refreshed my knowledge of Pearl causality by reading Causal inference in statistics: An overview. I am by no means an expert in Pearl’s huge body of work, but I think I understand enough of it to write something of use.

                                                        Pearl’s theory of causality employs Bayesian networks to represent causal structures. These are directed acyclic graphs, where each vertex represents a variable, and an edge from X to Y implies that X causes Y. Pearl also introduces the do(X) operator, which simulates interventions by removing all the causes of X, setting it to a constant. There is much more to this theory, but two of its main contributions are the formalisation of causal concepts that are often given only a verbal treatment, and the explicit encoding of causal assumptions. These assumptions must be made by the modeller based on background knowledge, and are encoded in the graph’s structure – a missing edge between two vertices indicates that there is no direct causal relationship between the two variables.
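As a toy illustration of why conditioning on a variable differs from intervening on it with do(X), here is a minimal simulation sketch (plain numpy, made-up probabilities, illustrative variable names – it is not Pearl’s code or notation, just a three-variable model with a confounder U):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 1_000_000

# Toy structural causal model with a confounder U:
#   U -> X, U -> Y, X -> Y (all probabilities are made up for illustration).
u = rng.random(n) < 0.5
x = rng.random(n) < np.where(u, 0.8, 0.2)    # X is more likely when U is true
y = rng.random(n) < 0.3 + 0.3 * x + 0.3 * u  # Y depends on both X and U

# Conditioning: P(Y=1 | X=1) mixes the effect of X with the confounding path via U.
p_y_given_x1 = y[x].mean()

# Intervening: do(X=1) removes the U -> X edge by setting X to 1 for everyone,
# then Y is generated from the same structural equation.
y_do = rng.random(n) < 0.3 + 0.3 * 1 + 0.3 * u
p_y_do_x1 = y_do.mean()

print(f"P(Y=1 | X=1)     ~ {p_y_given_x1:.3f}")  # inflated by confounding
print(f"P(Y=1 | do(X=1)) ~ {p_y_do_x1:.3f}")     # the causal effect of setting X
```

In this sketch the conditional probability comes out higher than the interventional one, because observing X = 1 also makes U = 1 more likely; the do-operator severs that back-door path.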

                                                        My main issue with Pearl’s treatment of causality is that he doesn’t explicitly handle time. While time can be encoded into Pearl’s models (e.g., via dynamic Bayesian networks), there is nothing that prevents creation of models where the future causes changes in the past. A closely-related issue is that Pearl’s causal models must be directed acyclic graphs, making it hard to model feedback loops. For example, Pearl says that “mud does not cause rain”, but this isn’t true – water from mud evaporates, causing rain (which causes mud). What’s true is that “mud now doesn’t cause rain now” or something along these lines, which is something that must be accounted for by adding temporal information to the models.

                                                        Nonetheless, Pearl’s theory is an important step forward in the study of causality. In his words, “in the bulk of the statistical literature before 2000, causal claims rarely appear in the mathematics. They surface only in the verbal interpretation that investigators occasionally attach to certain associations, and in the verbal description with which investigators justify assumptions.” The importance of formal causal analysis cannot be overstated, as it underlies many decisions that affect our lives. However, it seems to me like there’s still plenty of work to be done before causal analysis becomes as established as other statistical tools.

                                                        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                        Interesting point on the causal significance. How does this work when you have confounders in x? I’d have thought that x must contain the set of prima facie causes for which we have true exogenous variation.

                                                        Also, how does it work when you have bad controls in x (where x includes post-treatment causes that are plausibly varied by c)?

                                                        Good questions :)

                                                        To be honest, I’m not completely sure it works in all these cases, as there is always a need for interpretation to decide whether the identified causes are genuine. I tried playing a bit with the toy data from Pearl’s report on Simpson’s Paradox, but the results are not entirely convincing. However, I’m also not fully convinced that Pearl’s solution fully resolves Simpson’s Paradox, and Kleinberg does go through a few scenarios where her approach doesn’t work in her book, so I’d say that there are still quite a few open problems in the area.

                                                        Post-treatment causes are partly addressed by the definition in Huang and Kleinberg (2015), where significance is weighted by the number of timepoints where e follows c. Again, that definition doesn’t handle all cases, but I think it’s an interesting line of research. I would definitely like to see their results reproduced by other researchers and expanded to other datasets, though.

                                                        Excellent article! It has been very useful to understand what the topic of causality is about and triggered my interest to continue learning more!
                                                        Thanks for this post! I share your troubles over Pearl/time/feedback loops!
Nice post. Have you had a chance to apply these methods to real datasets? Please share those results.
Great post. I did not know about Kleinberg and Hill’s work. I knew a similar list of criteria from this article, which is much more recent: https://doi.org/10.1177%2F0951629805050859
Regarding Kleinberg: Adding time certainly is valuable, but doesn’t the smoking example change the research question from whether smoking causes lung cancer to when it causes lung cancer? The latter question is more informative and implies the former, but I’d say it is fine to ask the first question when one is not interested in the time of occurrence of cancer.
                                                        Thank you! I agree that the latter question is more informative, but I now think that saying that “smoking causes cancer” isn’t particularly meaningful, as it ignores both timing and dosage. A good summary of the case for well-defined interventions was provided by Miguel Hernán in this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5207342/
The limits of Pearl’s theory on feedback loops bother me too. However, have you studied much Control Theory? Or dynamical systems in general? It explicitly deals with feedback loops. I’d be keen to get your thoughts on the comparison of Control Theory vs Pearl’s Causal Inference.
                                                        Thanks for the comment! No, I haven’t studied Control Theory. Maybe I’ll look into it one day. :)



                                                        Making Bayesian A/B testing more accessible

                                                        Much has been written in recent years on the pitfalls of using traditional hypothesis testing with online A/B tests. A key issue is that you’re likely to end up with many false positives if you repeatedly check your results and stop as soon as you reach statistical significance. One way of dealing with this issue is by following a Bayesian approach to deciding when the experiment should be stopped. While I find the Bayesian view of statistics much more intuitive than the frequentist view, it can be quite challenging to explain Bayesian concepts to laypeople. Hence, I decided to build a new Bayesian A/B testing calculator, which aims to make these concepts clear to any user. This post discusses the general problem and existing solutions, followed by a review of the new tool and how it can be improved further.

                                                        The problem

                                                        The classic A/B testing problem is as follows. Suppose we run an experiment where we have a control group and a test group. Participants (typically website visitors) are allocated to groups randomly, and each group is presented with a different variant of the website or page (e.g., variant A is assigned to the control group and variant B is assigned to the test group). Our aim is to increase the overall number of binary successes, where success can be defined as clicking a button or opening a new account. Hence, we track the number of trials in each group together with the number of successes. For a given group, the number of successes divided by number of trials is the group’s raw success rate.

                                                        Given the results of an experiment (trials and successes for each group), there are a few questions we would typically like to answer:

                                                        1. Should we choose variant A or variant B to maximise our success rate?
                                                        2. How much would our success rate change if we chose one variant over the other?
                                                        3. Do we have enough data or should we keep experimenting?

                                                        It’s important to note some points that might be obvious, but are often overlooked. First, we run an experiment because we assume that it will help us uncover a causal link, where something about A or B is hypothesised to cause people to behave differently, thereby affecting the overall success rate. Second, we want to make a decision and choose either A or B, rather than maintain multiple variants and present the best variant depending on a participant’s features (a problem that’s addressed by contextual bandits, for example). Third, online A/B testing is different from traditional experiments in a lab, because we often have little control over the characteristics of our participants, and when, where, and how they choose to interact with our experiment. This is an important point, because it means that we may need to wait a long time until we get a representative sample of the population. In addition, the raw numbers of trials and successes can’t tell us whether the sample is representative.

                                                        Bayesian solutions

Many blog posts have been written on how to use Bayesian statistics to answer the above questions, so I won’t get into too much detail here (see the posts by David Robinson, Maciej Kula, Chris Stucchio, and Evan Miller if you need more background). The general idea is that we assume that the success rates for the control and test variants are drawn from Beta(α_A, β_A) and Beta(α_B, β_B), respectively, where Beta(α, β) is the beta distribution with shape parameters α and β (which yields values in the [0, 1] interval). As the experiment runs, we update the parameters of the distributions – each success gets added to the group’s α, and each unsuccessful trial gets added to the group’s β. It is often reasonable to assume that the prior (i.e., initial) values of α and β are the same for both variants. If we denote the prior values of the parameters with α and β, and the number of successes and trials for group x with S_x and T_x respectively, we get that the success rates are distributed according to Beta(α + S_A, β + T_A – S_A) for control and Beta(α + S_B, β + T_B – S_B) for test.

For example, if α = β = 1, T_A = 200, S_A = 120, T_B = 200, and S_B = 100, plotting the probability density functions yields the following chart (A – blue, B – red):

                                                        Beta distributions examples
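As a rough illustration of how a chart like this could be produced, here is a minimal sketch using scipy and matplotlib with the example’s parameter values (it is not the code behind the calculator discussed in the post):

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Prior Beta(1, 1) updated with the example data: 120/200 successes for A, 100/200 for B.
alpha_prior, beta_prior = 1, 1
trials_a, successes_a = 200, 120
trials_b, successes_b = 200, 100

posterior_a = stats.beta(alpha_prior + successes_a, beta_prior + trials_a - successes_a)
posterior_b = stats.beta(alpha_prior + successes_b, beta_prior + trials_b - successes_b)

rates = np.linspace(0, 1, 1000)
plt.plot(rates, posterior_a.pdf(rates), color="blue", label="A (control): Beta(121, 81)")
plt.plot(rates, posterior_b.pdf(rates), color="red", label="B (test): Beta(101, 101)")
plt.xlabel("Success rate")
plt.ylabel("Probability density")
plt.legend()
plt.show()
```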

Given these distributions, we can calculate the most probable range for the success rate of each variant, and estimate the difference in success rate between the variants. These can be calculated by deriving closed formulas, or by drawing samples from each distribution. In addition, it is important to note that the distributions change as we gather more data, even if the raw success rates don’t. For example, multiplying each count by 10 to obtain T_A = 2000, S_A = 1200, T_B = 2000, and S_B = 1000 doesn’t change the success rates, but it does change the distributions – they become much narrower:
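To make the sampling-based estimation concrete, here is a minimal sketch assuming the same example posteriors as above (Beta(121, 81) for A and Beta(101, 101) for B); the variable names are illustrative:

```python
import numpy as np

rng = np.random.default_rng(42)
n_samples = 100_000

# Posterior samples for the example data: Beta(121, 81) for A and Beta(101, 101) for B.
samples_a = rng.beta(121, 81, size=n_samples)
samples_b = rng.beta(101, 101, size=n_samples)

# Estimate the probability that B outperforms A and a 95% credible interval
# for the difference in success rates.
diff = samples_b - samples_a
print(f"P(B > A) ~ {(diff > 0).mean():.3f}")
lo, hi = np.percentile(diff, [2.5, 97.5])
print(f"95% credible interval for B - A: [{lo:.3f}, {hi:.3f}]")
```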


                                                        The source code for the calculation is here: https://github.com/yanirs/yanirs.github.io/blob/master/tools/split-test-calculator/src/bayes.coffee#L139 – it shouldn’t be too hard to translate to R.

                                                        Thanks for the post!

                                                        I’m more of a business stakeholder simply trying to improve our testing practices, rather than a data scientist who understands the theories at a detailed level.

                                                        I’m a bit confused why, if I enter the default example in your calculator (5000 trials each, 100 successes vs 130), the recommendation is to implement EITHER variant.

                                                        Whereas, using a tool such as the following suggests a 97.8% chance the variant with 130 successes will outperform the control: https://abtestguide.com/bayesian/

This calculator also seems to suggest the 130 successes variant should be chosen, not EITHER, as there is 95% confidence the result is not due to chance: https://abtestguide.com/calc/

A secondary question is, if there is no predetermined sample size with the Bayesian approach, how do you plan how long to run the test for? Mainly to deal with stakeholder communication & project planning, but also to avoid peeking.

                                                        Many thanks, Sam

                                                        Thanks for your comment, Sam!

                                                        There are many approaches to Bayesian A/B testing. It looks like the other one you linked to doesn’t allow you to specify your prior knowledge about the conversion rate and the decision criterion. Which values did you use for those fields in my calculator?

                                                        As mentioned in the last paragraph of the post, you should really aim for a representative sample of users – size is only one factor. That being said, you can play with the number of trials in the calculator to get an idea of the required sample size based on the minimum effect criterion.

                                                        Great stuff!

I agree that Peak Conversion makes it easy, but not being able to change the priors makes it limited. Lyst is a very good tool, but your use of “success rate” rather than specifying an alpha and beta makes yours much more intuitive.

                                                        I just wanted to confirm my assumption on the “Minimum effect” field - you said it’s absolute, so if I wanted to detect a 10% difference in a 5% success rate, that means I would have to input the value as 0.5?

                                                        Thanks David!

                                                        That’s correct. If you set the minimum effect field to 0.5, it means that changes in the success rate that are lower than 0.5% are considered equivalent to zero change. In your example, if the success rate changes from 5% to anything between 4.5% and 5.5%, the change is considered to be insubstantial or unimportant. Note that this is different from statistical significance: A change is significant in the statistical sense if it is unlikely to be due to random variation. The decision whether a change is substantial or important depends on your application, which is where the minimum effect threshold comes in.
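For readers who want to see how a minimum effect threshold can feed into a stopping decision, here is a rough sketch of a Kruschke-style interval-plus-ROPE rule, using an equal-tailed credible interval as a stand-in for the HDI. It illustrates the general idea only – the function name, defaults, and example numbers are made up, and the calculator’s actual logic may differ:

```python
import numpy as np

def rope_decision(successes_a, trials_a, successes_b, trials_b,
                  min_effect=0.005, prior_alpha=1, prior_beta=1,
                  n_samples=100_000, seed=0):
    """Compare the 95% credible interval of the difference in success rates
    against a region of practical equivalence (ROPE) of width +/- min_effect."""
    rng = np.random.default_rng(seed)
    samples_a = rng.beta(prior_alpha + successes_a,
                         prior_beta + trials_a - successes_a, size=n_samples)
    samples_b = rng.beta(prior_alpha + successes_b,
                         prior_beta + trials_b - successes_b, size=n_samples)
    lo, hi = np.percentile(samples_b - samples_a, [2.5, 97.5])
    if lo > min_effect:
        return "implement B"
    if hi < -min_effect:
        return "implement A"
    if lo > -min_effect and hi < min_effect:
        return "implement either variant"  # difference is practically negligible
    return "keep experimenting"

# Example: a roughly 5% baseline with a minimum effect of 0.5 percentage points (0.005).
print(rope_decision(successes_a=250, trials_a=5000, successes_b=280, trials_b=5000))
```

A rule like this is one way a calculator could arrive at an “implement either variant” recommendation: the interval can sit entirely inside the ROPE even when one variant’s raw rate looks higher.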

                                                        Hi Yanir,

                                                        The calculator is simple enough, and easy to use, though it lacks some basic things such as testing more than one variant against a control, which is very often done in practice.

                                                        I, and hopefully others, am interested to learn how information about the stopping rule used enters into the calculations, if at all? After all, ignoring the stopping rule would be like discarding data that can have a critical impact on the decision made. I make an argument about it here: http://blog.analytics-toolkit.com/2017/bayesian-ab-testing-not-immune-to-optional-stopping-issues/ , with lots of references. Another take on the issue I’ve recently found is here http://srmart.in/bayes-optional-stopping/ .

                                                        Best, Georgi

                                                        Thanks Georgi. I’m not sure I understand your question. The stopping rule is explained under “Building the new calculator”. It’s worth reading Kruschke’s articles for further explanations. Also, as noted in the final paragraph of the post, no stopping rule can tell you whether your sample is representative of the general population, so results on synthetic data are of limited use.

                                                        Hi Yanir,

I’ve read the explanation provided, and it left me feeling like the calculator ignores the stopping rule used up to the point when data enters your calculator. The same is confirmed by the calculator interface, where I see no option to specify a stopping rule or the number and timing of prior looks I’ve taken at the data I enter. While for the first part one can assume that the user would use your calculator for each look, so the stopping rule is then given and the same, the second part clearly means it has no input on the number and timing of peeks. Given that the stopping rule is data-based, it is certain to introduce heavy bias in the resulting statistics, if my impression that you’re not adjusting the calculations to account for peeking/optional stopping in any way is correct. Am I wrong in my assertion?

Thanks,
Georgi

                                                        Thanks for the clarification, Georgi. You’re right, the calculator doesn’t take the number of peeks into account. As noted in the final paragraph of the post, I don’t think it matters as much as other factors, so I’m not planning on changing this. As discussed in a separate post, statistical significance doesn’t imply causality, which is what we really care about. It is just one factor that should be taken into account when deciding to implement changes.
                                                        This recent paper on the limited use of statistical significance testing is worth reading: http://www.stat.columbia.edu/~gelman/research/unpublished/abandon.pdf
Yanir hi! Thanks for sharing this content. I have a question regarding something said in the Lyst blog post that I haven’t seen mentioned here: “Both of these problems (repeated testing and low base rate) are much less pronounced when we use an informative prior.” Could you please explain how and why this occurs? Or share a paper about it? Otherwise I can’t see how prior selection affects Type 1 errors (since you are comparing it to the problems of repeated testing). Also, as I understand it, Bayesian inference does not control for Type 1 errors – it’s not in its interest to do so – so why would someone choose a prior conditioned on its possible effects on Type 1 errors?

I’m not entirely sure about the statements by the Lyst post authors, but I think it might be because the prior can make it less likely that one would make a call before there is enough data, which is when there is a higher risk of false positives. However, this is also addressed by using a stopping rule like Kruschke’s.

                                                        By the way, Kruschke’s 2018 paper Rejecting or accepting parameter values in Bayesian estimation is a good recent read on the topic.

Would you happen to have a Bayesian test duration calculator, or know of one you can point me to?
                                                        A quick search yields some results, but I’m not sure about the reliability of the estimates. Maybe I’ll add it as a feature one day… :)



                                                        Is Data Scientist a useless job title?

                                                        Data science can be defined as either the intersection or union of software engineering and statistics. In recent years, the field seems to be gravitating towards the broader unifying definition, where everyone who touches data in some way can call themselves a data scientist. Hence, while many people whose job title is Data Scientist do very useful work, the title itself has become fairly useless as an indication of what the title holder actually does. This post briefly discusses how we got to this point, where I think the field is likely to go, and what data scientists can do to remain relevant.

                                                        The many definitions of data science

                                                        About two years ago, I published a post discussing the definition of data scientist by Josh Wills, as a person who is better at statistics than any software engineer and better at software engineering than any statistician. I still quite like this definition, because it describes me well, as someone with education and experience in both areas. However, to be better at statistics than any software engineer and better at software engineering than any statistician, you have to be truly proficient in both areas, as some software engineers are comfortable running complex experiments, and some statisticians are capable of building solid software. Quite a few people who don’t meet Wills’s criteria have decided they wanted to be data scientists too, expanding the definition to be something along the lines of someone who is better at statistics than some software engineers (who’ve never done anything fancier than calculating a sample mean) and better at software engineering than some statisticians (who can’t code).

                                                        In addition to software engineering and statistics, data scientists are expected to deeply understand the domain in which they operate, and be excellent communicators. This leads to the proliferation of increasingly ridiculous Venn diagrams, such as the one by Stephan Kolassa:

                                                        Perfect data scientist Venn diagram

                                                        The perfect data scientist from Kolassa’s Venn diagram is a mythical sexy unicorn ninja rockstar who can transform a business just by thinking about its problems. A more realistic (and less exciting) view of data scientists is offered by Rob Hyndman:

                                                        I take the broad inclusive view. I am a data scientist because I do data analysis, and I do research on the methodology of data analysis. The way I would express it is that I’m a data scientist with a statistical perspective and training. Other data scientists will have different perspectives and different training.

                                                        We are comfortable with having medical specialists, and we will go to a GP, endocrinologist, physiotherapist, etc., when we have medical problems. We also need to take a team perspective on data science.

                                                        None of us can realistically cover the whole field, and so we specialise on certain problems and techniques. It is crazy to think that a doctor must know everything, and it is just as crazy to think a data scientist should be an expert in statistics, mathematics, computing, programming, the application discipline, etc. Instead, we need teams of data scientists with different skills, with each being aware of the boundary of their expertise, and who to call in for help when required.

                                                        Indeed, data science is too broad for any data scientist to fully master all areas of expertise. Despite the misleading name of the field, it encompasses both science and engineering, which is why data scientists can be categorised into two types, as suggested by Michael Hochster:

                                                        • Type A (analyst): focused on static data analysis. Essentially a statistician with coding skills.
                                                        • Type B (builder): focused on building data products. Essentially a software engineer with knowledge in machine learning and statistics.

                                                        Type A is more of a scientist, and Type B is more of an engineer. Many people end up doing both, but it is pretty rare to have an even 50-50 split between the science and engineering sides, as they require different mindsets. This is illustrated by the following diagram, showing the information flow in science and engineering (source).

                                                        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

I think exactly the same, but for the moment the title is somewhat needed. I’ll give my own example: I’m a data scientist because my company wants to differentiate between regular data analysts (who can’t code but are learning with my help) and backend software engineers who can code better than me, but lack the business knowledge and have a tendency to throw fancy algorithms at numbers without thinking about method and usefulness for the business.

Eventually we will have new job titles, but for now we are stuck with “data scientists”. As soon as the hype fades, we’ll see people moving to new titles.

Great article - and really the ambiguity surrounding the Data Scientist title hurts everyone - Data Scientists are frustrated that they’re expected to do everything, and others are frustrated that their Data Scientists can’t do everything that they’ve heard data scientists can do. I think this will change over time as the roles of data scientists (or whatever they will be called) get further defined.

Good article - I’ve always had a bit of a problem with the term “Data Scientist” in that it implies that the person with such a title is somehow involved in research of data for scientific purposes or has a deep academic background, neither of which is usually true.

                                                        Now, what does everyone think about the term “Data Architect” - I could do a rant on that one. Suffice it to say, you are a DATABASE Architect, not a DATA architect. Data is data, it is just raw numbers. You don’t design data, you design a data model which eventually gets translated into a database. Sorry, I guess that was a bit of a rant …

                                                        Yeah, people like the word data more than the word database these days. There are also the various places where you can put data. You can drown it in a lake, for example…

                                                        Great article, and couldn’t agree more - there’s deep irony in “Data Science” as a job title.

                                                        I’ve started to use the term “Entrepreneurial Analyst” to be more precise about the focus on the outcome and also to allow latitude for hypotheses, exploration and discovery.

Reblogged this on codefying and commented: Especially like the antiparallel structure of scientific inquiry and engineering design.
I find it interesting that, as you said, data analysis is very useful if done by effective hires. It’s important to understand the data that you are provided with and what context it fits in. Otherwise, you could come to conclusions that miss the mark. It is important to have those who are properly qualified analyze data accurately.
Great article! I also feel that anybody with five or more years of experience in Data Science can be considered for a Data Scientist role. Professionals with less experience can always have their roles as Data Analysts or Data Engineers. There are many certification programs that can provide you with relevant skill-sets, such as Hortonworks certifications, Cloudera certifications, Data Science Council of America (DASCA) certifications, etc.

                                                        \ No newline at end of file diff --git a/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html b/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html index 6c338ee71..0303156c0 100644 --- a/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html +++ b/2016/08/21/seven-ways-to-be-data-driven-off-a-cliff/index.html @@ -1,5 +1,5 @@ If you don’t pay attention, data can drive you off a cliff | Yanir Seroussi | Data & AI for Startup Impact -

                                                        If you don’t pay attention, data can drive you off a cliff

                                                        You’re a hotshot manager. You love your dashboards and you keep your finger on the beating pulse of the business. You take pride in using data to drive your decisions rather than shooting from the hip like one of those old-school 1950s bosses. This is the 21st century, and data is king. You even hired a sexy statistician or data scientist, though you don’t really understand what they do. Never mind, you can proudly tell all your friends that you are leading a modern data-driven team. Nothing can go wrong, right? Incorrect. If you don’t pay attention, data can drive you off a cliff. This article discusses seven of the ways this can happen. Read on to ensure it doesn’t happen to you.

                                                        1. Pretending uncertainty doesn’t exist

                                                        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

Thanks Yanir for this post! Once again, you hit the nail on the head! We’re probably all guilty of making any number of those mistakes at one point or another in our careers. And it wouldn’t surprise me if a lot of companies are making all of those mistakes at the same time. I especially liked #6. Instead of stupidity, I would suggest that ego is responsible for it.
                                                        Yeah, I think that Bertrand Russell was a bit too harsh – it’s really ignorance that often causes overconfidence rather than stupidity. And yes, I have made this mistake as well. Many things often look misleadingly simple if you don’t get into the fine details.
Reblogged this on QA-notes and commented: All common sense, but as with many things, having it written down focusses the mind :-)

                                                        \ No newline at end of file diff --git a/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html b/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html index 0d24a673d..c1d9625f7 100644 --- a/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html +++ b/2016/09/19/ask-why-finding-motives-causes-and-purpose-in-data-science/index.html @@ -1,9 +1,11 @@ Ask Why! Finding motives, causes, and purpose in data science | Yanir Seroussi | Data & AI for Startup Impact -

                                                        Ask Why! Finding motives, causes, and purpose in data science

                                                        Some people equate predictive modelling with data science, thinking that mastering various machine learning techniques is the key that unlocks the mysteries of the field. However, there is much more to data science than the What and How of predictive modelling. I recently gave a talk where I argued the importance of asking Why, touching on three different topics: stakeholder motives, cause-and-effect relationships, and finding a sense of purpose. A video of the talk is available below. Unfortunately, the videographer mostly focused on me pacing rather than on the screen, but you can check out the slides here (note that you need to use both the left/right and up/down arrows to see all the slides).

                                                        If you’re interested in the topics covered in the talk, here are a few posts you should read.

                                                        Stakeholders and their motives

                                                        Causality and experimentation

                                                        Purpose, ethics, and my personal path

                                                        Cover image: Why by Ksayer

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                          \ No newline at end of file diff --git a/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html b/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html index 409d984f4..9a33077da 100644 --- a/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html +++ b/2017/01/08/customer-lifetime-value-and-the-proliferation-of-misinformation-on-the-internet/index.html @@ -1,5 +1,5 @@ Customer lifetime value and the proliferation of misinformation on the internet | Yanir Seroussi | Data & AI for Startup Impact -

                                                          Customer lifetime value and the proliferation of misinformation on the internet

                                                          Suppose you work for a business that has paying customers. You want to know how much money your customers are likely to spend to inform decisions on customer acquisition and retention budgets. You’ve done a bit of research, and discovered that the figure you want to calculate is commonly called the customer lifetime value. You google the term, and end up on a page with ten results (and probably some ads). How many of those results contain useful, non-misleading information? As of early 2017, fewer than half. Why is that? How can it be that after nearly 20 years of existence, Google still surfaces misleading information for common search terms? And how can you calculate your customer lifetime value correctly, avoiding the traps set up by clever search engine marketers? Read on to find out!

                                                          Background: Misleading search results and fake news

                                                          While Google tries to filter obvious spam from its index, it still relies to a great extent on popularity to rank search results. Popularity is a function of inbound links (weighted by site credibility), and of user interaction with the presented results (e.g., time spent on a result page before moving on to the next result or search). There are two obvious problems with this approach. First, there are no guarantees that wrong, misleading, or inaccurate pages won’t be popular, and therefore earn high rankings. Second, given Google’s near-monopoly of the search market, if a page ranks highly for popular search terms, it is likely to become more popular and be seen as credible. Hence, when searching for the truth, it’d be wise to follow Abraham Lincoln’s famous warning not to trust everything you read on the internet.

                                                          Abraham Lincoln internet quote

                                                          Google is not alone in helping spread misinformation. Following Donald Trump’s recent victory in the US presidential election, many people have blamed Facebook for allowing so-called fake news to be widely shared. Indeed, any popular media outlet or website may end up spreading misinformation, especially if – like Facebook and Google – it mainly aggregates and amplifies user-generated content. However, as noted by John Herrman, the problem is much deeper than clearly-fabricated news stories. It is hard to draw the lines between malicious spread of misinformation, slight inaccuracies, and plain ignorance. For example, how would one classify Trump’s claims that climate change is a hoax invented by the Chinese? Should Twitter block his account for knowingly spreading outright lies?

                                                          Wrong customer value calculation by example

                                                          Fortunately, when it comes to customer lifetime value, I doubt that any of the top results returned by Google is intentionally misleading. This is a case where inaccuracies and misinformation result from ignorance rather than from malice. However, relying on such resources without digging further is just as risky as relying on pure fabrications. For example, see this infographic by Kissmetrics, which suggests three different formulas for calculating the average lifetime value of a Starbucks customer. Those three formulas yield very different values ($5,489, $11,535, and $25,272), which the authors then say should be averaged to yield the final lifetime value figure. All formulas are based on numbers that the authors call constants, despite the fact that numbers such as the average customer lifespan or retention rate are clearly not constant in this context (since they’re estimated from the data and used as projections into the future). Indeed, several people have commented on the flaws in Kissmetrics’ approach, which is reminiscent of the Dilbert strip where the pointy-haired boss asks Dilbert to average and multiply wrong data.
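To make the problem concrete, here is a minimal hypothetical sketch (my own illustration with made-up numbers, not the Kissmetrics formulas and not part of the original analysis). When customer behaviour is heterogeneous, treating a measured retention rate as a constant and plugging it into a closed formula can be off by a factor of two or more compared with simply measuring what a past cohort actually spent:

import numpy as np

rng = np.random.default_rng(42)

# Simulated cohort: 80% casual customers who churn quickly, 20% loyal ones.
n = 100_000
is_loyal = rng.random(n) < 0.2
churn_p = np.where(is_loyal, 0.02, 0.30)   # monthly churn probability
lifetime_months = rng.geometric(churn_p)   # months until churn
horizon = 36                               # three-year planning horizon
monthly_spend = 30.0                       # assumed average spend per active month

# "Constants" approach: measure overall month-one retention, pretend it is
# a constant, and use expected lifespan = 1 / (1 - retention).
month1_retention = (lifetime_months > 1).mean()
formula_ltv = monthly_spend / (1 - month1_retention)

# Cohort approach: average the revenue the cohort actually generated within
# the horizon (what you would read off historical transaction data).
cohort_ltv = monthly_spend * np.minimum(lifetime_months, horizon).mean()

print(f"Formula LTV (constant retention): ${formula_ltv:,.0f}")
print(f"Cohort LTV (measured):            ${cohort_ltv:,.0f}")

The exact numbers don’t matter; the point is that any closed formula should be checked against what your own transaction data says before it informs acquisition or retention budgets.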

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                            When I started doing data science in a business setting (after years of doing quantitative genetics in academic settings), I was puzzled by the talk of “customer lifetime value”, partly due to the issues you’ve mentioned. Even with appropriate clarifications - it’s over, say, five years, not forever, etc. - it’s a peculiar quantity, in that as typically calculated, at least by the business-y people in my vicinity, it isn’t an average over a single population of customers. Instead, it’s average, say, first-month net present value over customers who’ve been around for at least one month (or maybe over all such customers who’ve also been around for at most one year, to reduce the influence of customer behavior farther in the past, when the product catalog, marketing strategies, etc. were different), plus average second-month net present value over customers who’ve been around for at least two months, etc., that is, it’s a sum of averages over a sequence of populations of customers (which may not even be nested). And there can be further subtleties. For example, in the context of a “freemium” service such as the one that is my primary client at present, sometimes people want to measure time from when a customer signs up for an account, whereas other times people want to measure time from when a customer first buys something, which may be much later. Altogether, I’ve found that “customer lifetime value” generally requires a good deal of explanation.

                                                            “no amount of cheap demagoguery and misinformation can alter the objective reality of our world.”: Alas, that isn’t quite true. Next week, the objective reality of how the USA is governed will be altered substantially, partly due to blatant demagoguery and misinformation.

                                                            Great analysis Yanir!
                                                            Thanks Ralph! I meant the last sentence in the sense of “and yet it moves”. People’s actions and choices are definitely affected by demagoguery and misinformation, but the spread of misinformation doesn’t change reality by itself. For example, Trump et al.’s climate science denialism isn’t going to alter the reality of anthropogenic climate change, though their actions are probably going to accelerate it.

                                                            This is why Investment Banking and Venture Capital firms should hire Data Scientists.

I think your post and the links you share might make it into the Google search results as well in the near future :)

                                                            Great post.

There’s also the BTYD package in R that I’ve seen used for CLV calculations, although I don’t know if it could be used for anything industrial. All credit for this knowledge goes to Dan McCarthy, who just put out some great research on using CLV in non-contractual settings.

                                                            Hi Yanir!

                                                            Nice post.

How can the models you mentioned be altered in the case of a subscription-based business in order to calculate the lifetime value of the customers?

                                                            Thanks Eleni! I think that in the case of subscription-based products, you’re better off using different models, as churn is observed and can be predicted (e.g., using a package like lifelines). Once you have an estimate of when a customer is going to churn, it’s easy to estimate their LTV (assuming constant recurring revenue). In any case, the general principle of not using closed formulas without testing their accuracy on your data still applies here.
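To illustrate, here’s a rough sketch of that approach (my own illustration, with made-up numbers and column names, a single survival curve shared by all customers, constant recurring revenue, and no discounting):

import pandas as pd
from lifelines import KaplanMeierFitter

# Each row is a subscriber: how many months they have paid so far, and whether
# they have churned (0 means still active, i.e. the observation is censored).
subs = pd.DataFrame({
    "months_subscribed": [1, 3, 5, 12, 24, 24, 36, 2, 7, 18],
    "churned":           [1, 1, 1,  1,  0,  1,  0, 1, 1,  0],
})
monthly_revenue = 20.0
horizon = 36  # months

kmf = KaplanMeierFitter()
kmf.fit(subs["months_subscribed"], event_observed=subs["churned"])

# The expected number of paying months within the horizon is approximately
# the area under the survival curve: the sum over months of the probability
# of still being subscribed at that point.
survival = kmf.predict(list(range(horizon)))
expected_paying_months = float(survival.sum())
ltv = monthly_revenue * expected_paying_months
print(f"Expected LTV over {horizon} months: ${ltv:,.2f}")

In practice you’d want survival curves per segment (or per customer, via a regression model), some discounting, and validation of the projections against cohorts whose full history is already known.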
Thanks for the article, Yanir! I am a huge proponent of using generic formulas for CLV only as a starting point, when similar historical models aren’t available. When a good historical financial model is available, it becomes much more useful than the generic formula. I was just speaking with a service vendor who was trying to convince us to allow his company to perform exhaustive FMEAs on all of our equipment, when we had years of failure data with which to shape a maintenance strategy. Only rely on the theoretical when the empirical isn’t an option.
                                                            \ No newline at end of file diff --git a/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html b/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html index 7a924b5ea..c263d0a1b 100644 --- a/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html +++ b/2017/06/03/exploring-and-visualising-reef-life-survey-data/index.html @@ -1,5 +1,5 @@ Exploring and visualising Reef Life Survey data | Yanir Seroussi | Data & AI for Startup Impact -

                                                            Exploring and visualising Reef Life Survey data

                                                            Last year, I wrote about the Reef Life Survey (RLS) project and my experience with offline data collection on the Great Barrier Reef. I found that using auto-generated flashcards with an increasing level of difficulty is a good way to memorise marine species. Since publishing that post, I have improved the flashcards and built a tool for exploring the aggregate survey data. Both tools are now publicly available on the RLS website. This post describes the tools and their implementation, and outlines possible directions for future work.

                                                            The tools

                                                            Each tool is fairly simple and focused on helping users achieve a small set of tasks. The best way to get familiar with the tools is to play with them by following the links below. If you’re only interested in using the tools, you can stop reading after this section. The rest of this post describes the data behind the tools, and some technical implementation details.
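If you’re wondering what “an increasing level of difficulty” can look like in practice, here’s a hypothetical sketch (my illustration only, not the actual RLS flashcards implementation), assuming that difficulty is driven by how rarely a species is recorded in the surveys:

from collections import Counter
import random

# Made-up sightings: one entry per survey record in which the species appears.
sightings = ["Chromis viridis", "Thalassoma lunare", "Chromis viridis",
             "Pomacentrus moluccensis", "Thalassoma lunare", "Chromis viridis"]

# Frequently sighted species come first: they are the "easy" cards.
species_by_difficulty = [species for species, _ in Counter(sightings).most_common()]

def build_deck(level, deck_size=20):
    """Return a shuffled flashcard deck; higher levels unlock rarer species."""
    pool = species_by_difficulty[:deck_size * level]
    return random.sample(pool, min(deck_size, len(pool)))

print(build_deck(level=1, deck_size=2))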

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                              \ No newline at end of file diff --git a/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html b/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html index 538dbfe0c..b7a433dd7 100644 --- a/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html +++ b/2017/07/29/my-10-step-path-to-becoming-a-remote-data-scientist-with-automattic/index.html @@ -1,10 +1,12 @@ My 10-step path to becoming a remote data scientist with Automattic | Yanir Seroussi | Data & AI for Startup Impact -

                                                              My 10-step path to becoming a remote data scientist with Automattic

                                                              About two years ago, I read the book The Year without Pants, which describes the author’s experience leading a team at Automattic (the company behind WordPress.com, among other products). Automattic is a fully-distributed company, which means that all of its employees work remotely (hence pants are optional). While the book discusses some of the challenges of working remotely, the author’s general experience was very positive. A few months after reading the book, I decided to look for a full-time position after a period of independent work. Ideally, I wanted a well-paid data science-y remote job with an established distributed tech company that offers a good life balance and makes products I care about. Automattic seemed to tick all my boxes, so I decided to apply for a job with them. This post describes my application steps, which ultimately led to me becoming a data scientist with Automattic.

                                                              Before jumping in, it’s worth noting that this post describes my personal experience. If you apply for a job with Automattic, your experience is likely to be different, as the process varies across teams, and evolves over time.

                                                              📧 Step 1: Do background research and apply

I decided to apply for a data wrangler position with Automattic in October 2015. While data wrangler may sound less sexy than data scientist, reading the job ad led me to believe that the position may involve interesting data science work. This impression was strengthened by some LinkedIn stalking, which included finding current data wranglers and reading through their profiles and websites. I later found out that all the people in the data division start out as data wranglers, and then they may pick their own title. Some data wranglers do data science work, while others are more focused on data engineering, and there are some projects that require a broad range of skills. As the usefulness of the term data scientist is questionable, I’m not too fussed about fancy job titles. It’s more important to do interesting work in a supportive environment.

                                                              Applying for the job was fairly straightforward. I simply followed the instructions from the ad:

                                                              Does this sound interesting? If yes, please send a short email to jobs @ this domain telling us about yourself and attach a resumé. Let us know what you can contribute to the team. Include the title of the position you’re applying for and your name in the subject. Proofread! Make sure you spell and capitalize WordPress and Automattic correctly. We are lucky to receive hundreds of applications for every position, so try to make your application stand out. If you apply for multiple positions or send multiple emails there will be one reply.

Having been on the receiving side of job applications, I find it surprising that many people don’t bother writing a cover letter, addressing the selection criteria in the ad, or even applying for a job they’re qualified to do. Hence, my cover letter was fairly short, comprising several bullet points that highlighted the similarities between the job requirements and my experience. It was nothing fancy, but simple cover letters have worked well for me in the past.

                                                              ⏳ Step 2: Wait patiently

The initial application was followed by a long wait. From my research, this is the typical scenario. This is unsurprising, as Automattic is a fairly small company with a large footprint, which is both distributed and known as a great place to work (e.g., its Glassdoor rating is 4.9). Therefore, it attracts many applicants from all over the world, whose applications take a while to process. In addition, Matt Mullenweg (Automattic’s CEO) reviews job applications before passing them on to the team leads.

As I didn’t know that Matt reviewed job applications, I decided to try to shorten the wait by getting introduced to someone in the data division. My first attempt was via a second-degree LinkedIn connection who works for Automattic. He responded quickly when I reached out to him, saying that his experience working with the company was in line with the Glassdoor reviews – it was the best job he’d had in his 15-year career. However, he couldn’t help me with an intro, because there is no simple way around Automattic’s internal processes. Nonetheless, he reassured me that it was worth waiting patiently, as the strict process means that you end up working with great people.

I wasn’t in a huge rush to find a job, but in December 2015 I decided to accept an offer to become the head of data science at Car Next Door. This was a good decision at the time, as I believe in the company’s original vision of reducing the number of cars on the road through car sharing, and it seemed like there would be many interesting projects for me to work on. The position wasn’t completely remote, but as the company was already spread across several cities, I was able to work from home for a day or two every week. In addition, it was a pleasant commute by bike from my Sydney home to the office, so putting the fully-remote job search on hold didn’t seem like a major sacrifice. As I hadn’t heard anything from Automattic at that stage, it seemed unwise to reject a good offer, so I started working full-time with Car Next Door in January 2016.

I successfully attracted Automattic’s attention with a post I published on the misuse of the word insights by many tech companies, which included an example from WordPress.com. Greg Ichneumon Brown, one of the data wranglers, commented on the post, and invited me to apply to join Automattic and help them address the issues I raised. This happened after I had accepted the offer from Car Next Door, and it didn’t result in any speed-up of the process, so I just gave up on Automattic and carried on with my life.

                                                              💬 Step 3: Chat with the data lead

I finally heard back from Automattic in February 2016 (four months after my initial application and a month into my employment with Car Next Door). Martin Remy, who leads the data division, emailed me to enquire whether I was still interested in the position. I informed him that I was no longer looking for a job, but we agreed to have an informal chat, as I had been waiting for such a long time.

                                                              As is often the case with Automattic interviews, the chat with Martin was completely text-based. Working with a distributed team means that voice and video calls can be hard to schedule. Hence, Automattic relies heavily on textual channels, and text-based interviews allow the company to test the written communication skills of candidates. The chat revolved around my past work experience, and Martin also took the time to answer my questions about the company and the data division. At the conclusion of the chat, Martin suggested I contact him directly if I was ever interested in continuing the application process. While I was happy with my position at the time, the chat strengthened my positive impression of Automattic, and I decided that I would reapply if I were to look for a full-time position again.

                                                              My next job search started earlier than I had anticipated. In October 2016, I decided to leave Car Next Door due to disagreements with the founders over the general direction of the company. In addition, I had more flexibility in choosing where to live, as my personal circumstances had changed. As I’ve always been curious about life outside the capital cities of Australia, I wanted to move away from Sydney. While I could have probably continued working remotely with Car Next Door, I felt that it would be better to find a job with a fully-distributed team. Therefore, I messaged Martin and we scheduled another chat.

The second chat with Martin took place in early November. Similarly to the first chat, it was conducted via Skype text messages, and revolved around my work in the time that had passed since the first chat. This time, as I was keen on continuing with the process, I asked more specific questions about what kind of work I was likely to end up doing and what the next steps would be. The answers were that I’d be joining the data science team, and that the next steps were a pre-trial test, a paid trial, and a final interview with Matt. While this sounded straightforward, it took another six months until I finally became an Automattic employee (but I wasn’t in a rush).

                                                              ☑️ Step 4: Pass the pre-trial test

The pre-trial test consisted of a data analysis task, where Carly Stambaugh, the data science lead, gave me a dataset and a set of questions to answer. The goal of the test is to evaluate the candidate’s approach to a problem, and to assess organisational and communication skills. As such, the focus isn’t on obtaining a specific result, so candidates are given a choice of several potential avenues to explore. The open-ended nature of the task is reminiscent of many real-world data science projects, where you don’t always have a clear idea of what you’re going to discover. While some people may find this kind of uncertainty daunting, I find it interesting, as it is one of the things that makes data science a science.

                                                              I spent a few days analysing the data and preparing a report, which was submitted as a Jupyter Notebook. After submitting my initial report, there were a few follow-up questions, which I answered by email. The report was reviewed by Carly and Martin, and as they were satisfied with my work, I was invited to proceed to the next stage: A paid trial project.

                                                              👨‍💻 Step 5: Do the trial project

                                                              The main part of the application process with Automattic is the paid trial project. The rationale behind doing paid trials was explained a few years ago by Matt in Hire by Auditions, Not Resumes:

                                                              Before we hire anyone, they go through a trial process first, on contract. They can do the work at night or over the weekend, so they don’t have to leave their current job in the meantime. We pay a standard rate of $25 per hour, regardless of whether you’re applying to be an engineer or the chief financial officer.

                                                              During the trials, we give the applicants actual work. If you’re applying to work in customer support, you’ll answer tickets. If you’re an engineer, you’ll work on engineering problems. If you’re a designer, you’ll design.

                                                              There’s nothing like being in the trenches with someone, working with them day by day. It tells you something you can’t learn from resumes, interviews, or reference checks. At the end of the trial, everyone involved has a great sense of whether they want to work together going forward. And, yes, that means everyone — it’s a mutual tryout. Some people decide we’re not the right fit for them.

                                                              The goal of my trial project was to improve the Elasticsearch language detection algorithm. This took about a month, and ultimately resulted in a pull request that got merged into the language detection plugin. I find this aspect of the process pretty exciting: While the plugin is used to classify millions of documents internally by Automattic, its impact extends beyond the company, as Elasticsearch is used by many other organisations and projects. This stands in contrast to many other technical job interviews, which consist of unpaid work on toy problems under stressful conditions, where the work performed is ultimately thrown away. While the monetary compensation for the trial work is lower than the market rate for data science consulting, I valued the opportunity to work on a real open source project, even if this hadn’t led to me getting hired.

There was much more to the trial project than what’s shown in the final pull request. Most of the discussions were held on an internal project thread, primarily under the guidance of Carly (the data science lead) and Greg (the data wrangler who replied to my post a year earlier). The project was kicked off with a general problem statement: There was some evidence that the Elasticsearch language detection plugin didn’t perform well on short texts, and my mission was to improve it. As the plugin didn’t include any tests for short texts, one of the main contributions of my work was the creation of datasets and tests to measure its accuracy on texts of different lengths, as sketched below. This was followed by some tweaks that improved the plugin’s performance, as summarised in the pull request. Internally, this work consisted of several iterations where I came up with ideas, asked questions, implemented the ideas, shared the results, and discussed further steps. There are still many possible improvements to the work done in the trial. However, as trials generally last around a month, we decided to end it after a few iterations.
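
To give a sense of what such tests might look like, here is a minimal Python sketch that measures a language detector’s accuracy bucketed by text length. It is purely illustrative: the bucket boundaries, the sample format, and the detect callable are placeholders, not the actual datasets or test harness from the trial or the Elasticsearch plugin.

from collections import defaultdict

def accuracy_by_length(samples, detect, buckets=(10, 20, 50, 100, 1000)):
    """Compute language-detection accuracy per text-length bucket.

    samples: iterable of (text, true_language) pairs.
    detect: any callable mapping a text to a predicted language code.
    buckets: upper bounds (in characters) of the length buckets.
    """
    correct, total = defaultdict(int), defaultdict(int)
    for text, true_lang in samples:
        # Assign each text to the smallest bucket that can hold it;
        # anything longer than the last bound falls into the last bucket.
        bucket = next((b for b in buckets if len(text) <= b), buckets[-1])
        total[bucket] += 1
        correct[bucket] += int(detect(text) == true_lang)
    return {b: correct[b] / total[b] for b in sorted(total)}

# Toy usage with a stand-in detector that always predicts English.
demo = [("hello world", "en"), ("bonjour tout le monde", "fr"), ("hi", "en")]
print(accuracy_by_length(demo, detect=lambda text: "en"))

The real evaluation involved richer datasets and metrics, but this is roughly the shape of an accuracy-by-length test.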

I enjoyed the trial process, but it is definitely not for everyone. Most notably, there is a strong emphasis on asynchronous text-based communication, which is the main mode by which projects are coordinated at Automattic. People who don’t enjoy written communication may find this aspect challenging, but I have always found that writing helps me organise my thoughts, and that I retain information better when reading than when listening to people speak. That being said, Automatticians do meet in person several times a year, and some teams hold video chats for certain discussions. While doing the trial, I had a video chat with Carly, which was the first (and last) time in the process that I got to see and hear a live human. However, this was not an essential part of the trial project, as our chat was mostly about the data scientist role and my job expectations.

                                                              ⏳ Step 6: Wait patiently

                                                              I finished working on the trial project just before Christmas. The feedback I received throughout the trial was positive, but Martin, Carly, and Greg had to go through the work and discuss it among themselves before making a final decision. This took about a month, due to the holiday period, various personal circumstances, and the data science team meetup that was scheduled for January 2017. Eventually, Martin got back to me with positive news: They were satisfied with my trial work, which meant there was only one stage left – the final interview with Matt Mullenweg, Automattic’s CEO.

                                                              👉 Step 7: Ping Matt

                                                              Like other parts of the process, the interview with Matt is text-based. The way it works is fairly simple: I was instructed to message Matt on Slack and wait for a response, which may take days or weeks. I sent Matt a message on January 25, and was surprised to hear back from him the following morning. However, that day was Australia Day, which is a public holiday here. Therefore, I only got back to him two hours after he messaged me that morning, and by that time he was probably already busy with other things. This was the start of a pretty long wait.

                                                              ⏳ Step 8: Wait patiently

                                                              I left Car Next Door at the end of January, as I figured that I would be able to line up some other work even if things didn’t work out with Automattic. My plan was to take some time off, and then move up to the Northern Rivers area of New South Wales. I had two Reef Life Survey trips planned, so I wasn’t going to start working again before mid-April. I assumed that I would hear back from Matt before then, which would have allowed me to make an informed decision whether to look for another job or not.

After two weeks of waiting, the time for my dive trips was nearing. As I was going to be without mobile reception for a while, I thought it’d be worth letting Matt know my schedule. After discussing the matter with Martin, I messaged Matt. He responded, saying that we might as well do the interview at the beginning of April, as I wouldn’t be starting work before then anyway. I would have preferred to be done with the interview earlier, but was happy to have some certainty and not worry about missing more chat messages before April.

In early April, I returned from my second dive trip (which included a close encounter with Cyclone Debbie), and was hoping to sort out my remote work situation while completing the move up north. Unfortunately, while the move was successful, I was ready to give up on Automattic because I hadn’t heard back from Matt at all that month. However, Martin remained optimistic and encouraged me to wait patiently, which I did, as I was pretty busy with the move and with some casual freelancing projects.

                                                              💬 Step 9: Chat with Matt and accept the job offer

The chat with Matt finally happened on May 2. As is often the case, it took a few hours and covered my background, the trial process, and some other general questions. I asked him about my long wait for the final chat, and he apologised, explaining that I was an outlier, as most chats happen within two weeks of a candidate being passed on to him. As the chat was about to conclude, we got to the topic of salary negotiation (which went well), and then the process was finally over! Within a few hours of the chat I was sent an offer letter and an employment contract. As Automattic has an entity in Australia (called Ausomattic), it’s a fairly standard contract. I signed the contract and started work the following week – over a year and a half after my initial application. Even before I started working, I booked tickets to meet the data division in Montréal – a fairly swift transition from the long wait for the final interview.

                                                              🎉 Step 10: Start working and choose a job title

                                                              As noted above, Automatticians get to choose their own job titles, so to become a data scientist with Automattic, I had to set my job title to Data Scientist. This is generally how many people become data scientists these days, even outside Automattic. However, job titles don’t matter as much as job satisfaction. And after 2.5 months with Automattic, I’m very satisfied with my decision to join the company. My first three weeks were spent doing customer support, like all new Automattic employees. Since then, I’ve been involved in projects to make engagement measurement more consistent (harder than it sounds, as counting things is hard), and to improve the data science codebase (e.g., moving away from Legacy Python). Besides that, I also went to Montréal for the data division meetup, and have started getting into chatbot work. I’m looking forward to doing more work and sharing my experience here and on data.blog.


                                                                Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                Very enlightening post! It was very awesome to see that the insights you saw to Elasticsearch went to a PR. I bet that was worth the whole thing!
That’s very exciting. I wanted to ask: are you a self-learner or do you have a degree? Can you please share your background? -Thank you
                                                                Thanks Mostafa. Yes, I have a BSc in computer science, and a PhD in what you would now call data science. See: https://www.linkedin.com/in/yanirseroussi/

                                                                This was an amazing post, Yanir! Loved the breakdown and the patience you had for the whole process, very well played and you really deserved it! :)

                                                                P.S: Really can connect as I’ve been working independently for a while now and would definitely be open to looking for long-term contracts or remote jobs like this.

Your post is really a therapy for most people who apply for jobs and lose hope while waiting. I believe patience is a key to everything. Thanks

                                                                This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

diff --git a/2017/09/02/state-of-bandcamp-recommender/index.html b/2017/09/02/state-of-bandcamp-recommender/index.html
index 00d1f4896..389b17ef7 100644
--- a/2017/09/02/state-of-bandcamp-recommender/index.html
+++ b/2017/09/02/state-of-bandcamp-recommender/index.html

                                                                State of Bandcamp Recommender, Late 2017

                                                                November 2017: Update and goodbye

                                                                I’ve decided to shut down Bandcamp Recommender (BCRecommender), despite hearing back from a few volunteers. The main reasons are:

                                                                1. Bandcamp now shows album recommendations at the bottom of album pages. While this isn’t quite the same as BCRecommender, I hope that it will evolve to a more comprehensive recommender system.
                                                                2. I tried to contact Bandcamp to get their support for the continued running of BCRecommender. I have not heard back from them. It would have been nice to receive some acknowledgement that they find BCRecommender useful.
                                                                3. As discussed below, I don’t have much time to spend on the project, and handing it off to other maintainers would have been time-consuming. Given reasons 1 and 2, I don’t feel like it’s worth the effort. Thanks to everyone who’s contacted me – you’re awesome!

                                                                September 2017: Original announcement

                                                                I released the first version of Bandcamp Recommender (BCRecommender) about three years ago, with the main goal of surfacing music recommendations from Bandcamp. A secondary goal was learning more about building and marketing a standalone web app. As such, I shared a few posts about BCRecommender over the years:

                                                                The last of the above posts was published in November 2015 – almost two years ago. Most of the work on BCRecommender was done up to that point, when my main focus was on part-time contracting while working on my own projects. However, since January 2016 I’ve mostly been working full-time, so I haven’t had the time to give enough attention to the project. Therefore, it looks like it’s time for me to say goodbye to BCRecommender.

                                                                Despite the lack of attention, about 5,000 people still visit BCRecommender every month (down from a peak of around 9,000). I know that people find it useful, even though it hasn’t been functionally updated in a long time (though the recommendations have been refreshed a few times). In an ideal world, BCRecommender would be replaced by algorithmic recommendations from Bandcamp. But unfortunately, Bandcamp still doesn’t offer personalised recommendations. This is a shame, because such recommendations could be of great benefit to both artists and fans. Millions of tracks and albums have been published on Bandcamp, meaning that serving personalised recommendations that cover their full catalogue can only be achieved using algorithms. However, it seems like they’re not interested in building this kind of functionality.

                                                                Rather than simply pulling the plug on BCRecommender, I thought I’d put a call out to see if anyone is interested in maintaining it. I’m happy to open source the code and hand the project over to someone else if it means it would be in good hands. With a little bit of work, BCRecommender can be turned into a full Bandcamp-based personalised radio station. If you think you’d be a good fit for maintaining the project, drop me a line and we can discuss further. If you just love BCRecommender, you can also let Bandcamp know that you want them to implement algorithmic recommendations (e.g., on Twitter or by emailing support@bandcamp.com). I’ll keep BCRecommender alive for about two more months and see if I get any responses. Either way, I’ll be saying goodbye to maintaining it before the end of the year.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

I am a Bandcamp freak… Bandcamp has recently started recommendations at the bottom ;-) seems primitive though. Example: https://ogreyouasshole.bandcamp.com/album/crossword-lost-sigh-days-james-mcnew-remixes Would love to hear about the basic logic you used behind the “recommendations”. I have no technical knowledge at all, but a few years ago I thought of a basic recommendation model… but couldn’t take it forward though… I thought ‘contextualizing’ artists would be a cool way to connect bands. Cheers!

                                                                  Thanks Shanky! I didn’t notice the new Bandcamp recommendations. It looks promising and it’s about time! Makes me feel better about saying goodbye to this project :)

                                                                  Check out this post for a fairly high-level overview of the logic behind the BCRecommender recommendations: https://yanirseroussi.com/2014/09/19/bandcamp-recommendation-and-discovery-algorithms/

Thanks for the Bandcamp Recommender, it showed me some interesting music! Have one question though, maybe I am knocking at the wrong door here… but when accessing ‘my’ Bandcamp on the iPhone, there’s no more log-in plus very nice elevator going up tune, and direct access to ‘my’ music, also the log-out option is gone… Or am I doing something wrong… or where can I ask about this? -Thank you, and all the best
                                                                  Thanks Jan. The place to ask app-related questions is support@bandcamp.com, as I’m not affiliated with Bandcamp.

                                                                  This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

diff --git a/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html b/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html
index 9af93201c..b72aa9fc5 100644
--- a/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html
+++ b/2017/10/15/advice-for-aspiring-data-scientists-and-other-faqs/index.html

                                                                  Advice for aspiring data scientists and other FAQs

                                                                  Aspiring data scientists and other visitors to this site often repeat the same questions. This post is the definitive collection of my answers to such questions (which may evolve over time).

                                                                  How do I become a data scientist?

                                                                  It depends on your situation. Before we get into it, have you thought about why you want to become a data scientist?

                                                                  Hmm… Not really. Why should I become a data scientist?

                                                                  I can't answer this for you, but it's great to see you asking why. Do you know what data science is? Do you understand what data scientists do?

                                                                  Sort of. Just so we’re on the same page, what is data science?

                                                                  No one knows for sure. Here are my thoughts from 2014 on defining data science as the intersection of software engineering and statistics, and a more recent post on defining data science in 2018.

                                                                  What are the hardest parts of data science?

                                                                  The hardest parts of data science are problem definition and solution measurement, not model fitting and data cleaning, because counting things is hard.

                                                                  Thanks, that’s helpful. But what do data scientists actually do?

It varies a lot. This variability makes the job title somewhat useless. You should try to get an idea of which areas of data science interest you. For many people, excitement over the technical aspects wanes with time. And even if you still find the technical aspects exciting, most jobs have boring parts. When considering career changes, think of the non-technical aspects that would keep you engaged.

                                                                  To answer the question, here are some posts on things I've done: Joined Automattic by improving the Elasticsearch language detection plugin, calculated customer lifetime value, analysed A/B test results, built recommender systems (including one for Bandcamp music), competed on Kaggle, and completed a PhD. I've also dabbled in deep learning, marine surveys, causality, and other things that I haven't had the chance to write about.

                                                                  Cool! Can you provide a general overview of how to become a data scientist?

                                                                  Yes! Check out Alec Smith's excellent articles.

                                                                  I’m pretty happy with my current job, but still thinking of becoming a data scientist. What should I do?

                                                                  Find ways of doing data science within your current role, working overtime if needed. Working on a real problem in a familiar domain is much more valuable than working on toy problems from online courses and platforms like Kaggle (though they're also useful). If you're a data analyst, learn how to program to automate and simplify your analyses. If you're a software engineer, become comfortable with analysing and modelling data. Machine learning doesn't have to be a part of what you choose to do.

                                                                  I’m pretty busy. What online course should I take to learn about the area?

                                                                  Calling Bullshit: Data Reasoning for the Digital Age is a good place to start. Deep learning should be pretty low on your list if you don't have much background in the area.

                                                                  Should I learn Python or R? Keras or Tensorflow? What about <insert name here>?

                                                                  It doesn't matter. Focus on principles and you'll be fine. The following quote still applies today (to people of all genders).

                                                                  As to methods, there may be a million and then some, but principles are few. The man who grasps principles can successfully select his own methods. The man who tries methods, ignoring principles, is sure to have trouble.


                                                                  I want to become a data science freelancer. Can you provide some advice?

                                                                  As with any freelancing job, expect to spend much of your time on sales and networking. I've only explored the freelancing path briefly, but Radim Řehůřek has published great slides on the topic. If you're thinking of freelancing as a way of gaining financial independence, also consider spending less, earning more, and investing wisely.

                                                                  Can you recommend an academic data science degree?

                                                                  Sorry, but I don't know much about those degrees. Boris Gorelik has some interesting thoughts on studying data science.

                                                                  Will you be my mentor?

                                                                  Probably not, unless you're hard-working, independent, and doing something I find interesting. Feel free to contact me if you believe we'd both find the relationship beneficial.

                                                                  Can you help with my project?

                                                                  Possibly. If you think I'd find your project exciting, please do contact me.


                                                                  What about ethics?

                                                                  What about them? There isn't a single definition of right and wrong, as morality is multi-dimensional. I believe it's important to question your own choices, and avoid applying data science blindly. For me, this means divesting from harmful industries like fossil fuels and striving to go beyond the creation of greedy robots (among other things).

                                                                  I’m a manager. When should I hire a data scientist and start using machine learning?

                                                                  There's a good chance you don't need a data scientist yet, but you should be aware of common pitfalls when trying to be data-driven. It's also worth reading Paras Chopra's post on what you need to know before you board the machine learning train.

                                                                  Do you want to buy my products or services?

                                                                  No. If I did, I'd contact you.

                                                                  I have a question that isn’t answered here or anywhere on the internet, and I think you can help. Can I contact you?

                                                                  Sure, use the form on this page.


                                                                    Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                    Thanks so much for sharing this Yanir!

                                                                    Indeed, such questions seem to be very recurring. Thanks for providing answers to help guide folks. I might add a few things:

                                                                    when ready for the job search… Advice to Data Scientists on Where to Work http://multithreaded.stitchfix.com/blog/2015/03/31/advice-for-data-scientists/

                                                                    if you are going to get into data science, do it for the right reasons. Let your passion drive! -https://www.quora.com/How-do-I-move-from-data-scientist-to-data-science-management

                                                                    This site is a part of the Data People Writing Stuff webring.
← previous site  |  next site →

diff --git a/2018/07/22/defining-data-science-in-2018/index.html b/2018/07/22/defining-data-science-in-2018/index.html
index 254484c55..79ed603ce 100644
--- a/2018/07/22/defining-data-science-in-2018/index.html
+++ b/2018/07/22/defining-data-science-in-2018/index.html

                                                                    Defining data science in 2018

                                                                    I got my first data science job in 2012, the year Harvard Business Review announced data scientist to be the sexiest job of the 21st century. Two years later, I published a post on my then-favourite definition of data science, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of making data scientist useless as a job title. However, I still call myself a data scientist. Even better – I still get paid for being a data scientist. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.

                                                                    It’s not all about machine learning

                                                                    As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn’t clear to me how to market myself – my LinkedIn title at the time was “software engineer with a research background”, which is a bit of a mouthful. Around that time I heard about Kaggle and decided to try competing. This went pretty well, and exposed me to the data science community globally and in Melbourne, where I was living at the time. That’s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.

                                                                    My understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.

                                                                    Things have changed considerably since 2012. It is now much easier to deploy machine learning models, even without a deep understanding of how they work. Many more people call themselves data scientists, including some who are more focused on data analysis than on building data products. Even Kaggle – which is now owned by Google – has broadened its scope beyond modelling competitions to support other types of analysis. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition may make data scientist somewhat useless as a job title. However, I believe that data science tasks remain useful, as shown by the following definitions.

                                                                    Recent definitions by Hernán, Hawkins, and Dubossarsky

                                                                    In a recent article, Hernán et al. classify data science tasks into three types: description, prediction, and causal inference. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference “right”. Further, they emphasise the importance of domain expert knowledge, which is essential in causal inference. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data analysts rather than scientists. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it’s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. For example, it wasn’t so long ago that doctors recommended smoking.

                                                                    Despite the importance of domain knowledge, one can argue that scientists that specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled why all scientists are not data scientists:

                                                                    Those scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question “given data with these properties, what conclusions can we draw?”

                                                                    Finally, according to Eugene Dubossarsky, “there’s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. That should be something no one can argue with.” This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. I’m not going to argue with him, but to put it all together, we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions.

                                                                    What about AI?

                                                                    Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite what some people may try to tell you, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren’t made by humans, as data scientists are involved in building systems that make decisions autonomously.

                                                                    The resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor’s degree in computer science. However, by the time I graduated from my PhD, saying that I’m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I’d be doing exactly the same work). Titles are somewhat silly – I’m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.

                                                                    Subscribe +

                                                                    Defining data science in 2018

                                                                    I got my first data science job in 2012, the year Harvard Business Review announced data scientist to be the sexiest job of the 21st century. Two years later, I published a post on my then-favourite definition of data science, as the intersection between software engineering and statistics. Unfortunately, that definition became somewhat irrelevant as more and more people jumped on the data science bandwagon – possibly to the point of making data scientist useless as a job title. However, I still call myself a data scientist. Even better – I still get paid for being a data scientist. But what does it mean? What do I actually do here? This article is a short summary of my understanding of the definition of data science in 2018.

                                                                    It’s not all about machine learning

                                                                    As I was wrapping up my PhD in 2012, I started thinking about my next steps. I knew I wanted to get back to working in the tech industry, ideally with a small startup. But it wasn’t clear to me how to market myself – my LinkedIn title at the time was “software engineer with a research background”, which is a bit of a mouthful. Around that time I heard about Kaggle and decided to try competing. This went pretty well, and exposed me to the data science community globally and in Melbourne, where I was living at the time. That’s how I first met Adam Neumann, the founder of Giveable, a startup that aimed to recommend gifts based on social networking data. Upon graduating, I joined Giveable as a data scientist. Changing my LinkedIn title quickly led to many other offers, but I was happy to be working on Giveable – I felt fortunate to have found a startup job that was related to my PhD research on recommender systems.

                                                                    My understanding of data science at the time was heavily influenced by Kaggle and the tech industry. Kaggle was only about predictive modelling competitions back then, and so I believed that data science is about using machine learning to build models and deploy them as part of various applications. I was very comfortable with that definition, having spent my PhD years on several predictive modelling tasks, and having worked as a software engineer prior to that.

                                                                    Things have changed considerably since 2012. It is now much easier to deploy machine learning models, even without a deep understanding of how they work. Many more people call themselves data scientists, including some who are more focused on data analysis than on building data products. Even Kaggle – which is now owned by Google – has broadened its scope beyond modelling competitions to support other types of analysis. Numerous articles have been published on the meaning of data science in the past six years. We seem to be going towards a broad definition of the field, which includes any type of general data analysis. This trend of broadening the definition may make data scientist somewhat useless as a job title. However, I believe that data science tasks remain useful, as shown by the following definitions.

                                                                    Recent definitions by Hernán, Hawkins, and Dubossarsky

                                                                    In a recent article, Hernán et al. classify data science tasks into three types: description, prediction, and causal inference. Like other authors, they argue that causal inference has been neglected by traditional statistics and some scientific disciplines. They claim that the emergence of data science is an opportunity to get causal inference “right”. Further, they emphasise the importance of domain expert knowledge, which is essential in causal inference. Defining data science in this broad manner seems to capture the essence of what the field is about these days. However, purely descriptive tasks are still often performed by data analysts rather than scientists. And the distinction between prediction and causal inference can be a bit fuzzy, especially as the tools for the latter are at a lower level of maturity. In addition, while I agree with Hernán et al. that domain expertise is important, it seems unlikely that this will forever be the case. No one is born an expert – expertise is gained by learning from and interacting with the world. Therefore, it’s plausible that gaining expertise can and will be automated. Further, there are numerous cases where experts were proven to be wrong. For example, it wasn’t so long ago that doctors recommended smoking.

Despite the importance of domain knowledge, one can argue that scientists who specialise in a single domain are not data scientists. In fact, the ability to go beyond one domain and think of data in a more abstract manner is what makes a data scientist. Applying this abstract knowledge often requires some domain expertise or input from domain experts, but most data science techniques are not domain-specific – they can be applied to many different problems. John Hawkins explains this point well in an article titled why all scientists are not data scientists:

                                                                    Those scientists and statisticians who have focused themselves on understanding the limitations and possibilities of making inferences from experimental data are the ones who are the forerunners to data scientists. They have a skill which transcends the particulars of what it takes to do lab work on cell cultures, or field studies for ecology etc. Their core skill involves thinking about the data involved at an abstracted level. To ask the question “given data with these properties, what conclusions can we draw?”

                                                                    Finally, according to Eugene Dubossarsky, “there’s only one purpose to data science, and that is to support decisions. And more specifically, to make better decisions. That should be something no one can argue with.” This goal-focused definition is unsurprising, given the fact that Eugene runs a training and consulting business and has been working in the field for over 20 years. I’m not going to argue with him, but to put it all together, we can define data science as a field that deals with description, prediction, and causal inference from data in a manner that is both domain-independent and domain-aware, with the ultimate goal of supporting decisions.

                                                                    What about AI?

                                                                    Everyone loves a good buzzword, and these days AI (Artificial Intelligence) is one of the hottest buzzwords. However, despite what some people may try to tell you, AI is unlikely to make data science obsolete any time soon. Following the above definition, as long as there is a need to make decisions based on data, there will be a need for data scientists. This includes decisions that aren’t made by humans, as data scientists are involved in building systems that make decisions autonomously.

                                                                    The resurgence of AI feels somewhat amusing given my personal experience. One of the reasons I decided to pursue a PhD in natural language processing and personalisation was my interest in what I considered to be AI back in 2008. My initial introduction to the field was through an AI course and a project I did as part of my bachelor’s degree in computer science. However, by the time I graduated from my PhD, saying that I’m an AI expert seemed less useful than calling myself a data scientist. It may be that the field is about to shift again, and that rebranding as an AI expert would be more beneficial (though I’d be doing exactly the same work). Titles are somewhat silly – I’m going to continue working with data to support decisions for as long as there is demand for this kind of work and I continue enjoying it. There is plenty to learn and develop in this area, regardless of buzzwords and sexy titles.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                      Great set of definitions and path of evolutions here!

There has to be chaos and confusion as it evolves surely, but the consensus as you very well mentioned is decisions. Anything done in the data world, if not leading to decisions, is not quite viable in the long term.

                                                                      Thanks for sharing your thoughts, love reading your blog.


                                                                      Reflections on remote data science work

                                                                      It’s been about a year and a half since I joined Automattic as a remote data scientist. This is the longest I’ve been in one position since finishing my PhD in 2012. This is also the first time I’ve worked full-time with a fully-distributed team. In this post, I briefly discuss some of the top pluses and minuses of remote work, based on my experience so far.

                                                                      + Flexible hours
                                                                      – Potentially boundless work

                                                                      By far, one of the top perks of remote work with a distributed team is truly flexible hours. I only have one or two synchronous meetings a week, and in the rest of my time I'm free to work the hours I prefer. No one expects me to be online at specific times, as long as the work gets done and I respond to pings within a reasonable time. As I'm a morning person, this means that I typically work a few hours in the early morning, take a long break (e.g., to surf or run some errands), and then work a few more hours in the afternoon or early evening.

                                                                      The potential downside of such flexibility is not being able to stop working, especially as most of my colleagues are in Europe and North America. I deal with this by avoiding all work communications during my designated non-work hours. For example, I don't have any work-related apps on my phone, I keep all my work tabs in a separate tab group, and I turn Slack off when I'm not working. I found that this approach sets enough of a boundary between my work and personal life, though I do end up thinking about work problems outside work hours occasionally.

                                                                      + More time for non-work activities
                                                                      – There’s never enough time!

                                                                      Not commuting freed up the equivalent of a workday in my schedule. In addition, having flexible hours means that I can make time in the middle of the day for leisure activities like surfing and diving. However, it's still a full-time job, so I'm not completely free to pursue non-work activities. It often feels like there isn't enough time in the day, as I can always think of more stuff I'd like to do. But my current situation is much better than having to commute on a daily basis. Even though it's been a relatively short time, I find the idea of going back to full-time office work hard to imagine.

                                                                      + No need to attend an office
                                                                      – Possible isolation from colleagues (and the real world)

                                                                      Offices – especially open-plan offices – are not great places to get work done. This is definitely the case with work that requires a high level of concentration over uninterrupted blocks of time, like coding and data analysis. Working from home is great for avoiding distractions – there's no need for silly horse blinders here (though I do enjoy looking at the bird and lizard action outside my window).

                                                                      One good thing about offices is the physical availability of colleagues. It's easy to ask others for feedback, socialise over drinks or shared meals, and keep up to date with company politics. Automattic works around the lack of daily physical interaction by running a few meetups a year. The number of people attending a meetup can vary from a handful for team meetups, to hundreds for the annual Grand Meetup. In all cases, the idea is to bring employees together for up to a week at a time to work and socialise. In my experience, the everyday distance creates a craving to attend meetups. I've never worked in a place where co-workers were so enthusiastic about spending so much time together – with non-distributed companies, team building is often seen as a chore. I suppose that the physical distance makes us appreciate the opportunity to be together and make the most of this precious time – it's a bit like being in a long-distance relationship.

That said, most of the time, isolation can be a problem. As I'm based in Australia, I probably feel it more than others – most of my teammates are offline during my work hours, which means that there's no one to chat with on Slack. This isn't a huge issue, but I do need to ensure I get enough social interaction through other avenues. As the jobs page of Bandcamp (another distributed company) used to say: "If you do not have a strong social structure outside of work then employment at Bandcamp will likely lead to heart disease and an early death. We’re hiring!"

                                                                      + Most communication is written
                                                                      – Information overload

                                                                      As Automattic is a fully-distributed company, most of the communication is done in writing. The main tools are Slack and internal forums called P2s (emails are rarely used). This makes catching up on the latest company news easy in comparison to places that rely more heavily on synchronous meetings. The downside of so much written communication is potential information overload. It is impossible to follow all the P2 posts, and even keeping up with stuff I should know can sometimes be overwhelming. I especially feel it in the mornings, as most of my colleagues work while I'm sleeping. Therefore, catching up on everything that happened overnight and responding to pings often takes over an hour – things are rarely as I left them when I last logged off. I experience this same feeling of being overwhelmed when coming back from vacation. Depending on the length of time away, it can take days to catch up. On the plus side, this process doesn't rely on someone filling me in – it's all there for me to read.

                                                                      + Free trips around the world
                                                                      – Jet lag and flying

                                                                      As noted above, Automatticians meet in person a few times a year. Since joining, I attended meetups in Montreal, Whistler, Playa del Carmen, Bali, and Orlando. In some cases, I used the opportunity for personal trips near the meetup locations. Such trips can be a lot of fun. However, the obvious downside when travelling from Australia is that getting to meetups usually involves days of jetlag and long flights (e.g., the 17-hour Dallas to Sydney trip). Nonetheless, I still enjoy the travel opportunities. For example, I doubt I would have ever visited Florida and snorkelled with manatees if it wasn't for Automattic.

                                                                      + Exposure to diverse opinions and people
                                                                      – Cultural differences can pose challenges

                                                                      Australia's population is made up of many migrants, especially in the tech industry. However, all such migrants have some familiarity with Australian culture and values. The composition of Automattic's workforce is even more diverse, and it lacks the unifying factor of everyone choosing to live in the same place. This is mostly positive, as I find the exposure to a diverse set of people interesting, and everyone tends to be friendly, welcoming, and focused on the work rather than on cultural differences. However, it's important to be aware of differences in communication styles. There's also a wider range of cultural sensitivities than when working with a more homogeneous group. Still, I haven't found it to be much of an issue, possibly because I'm already used to being a migrant. For example, moving to Australia from Israel required some adjustment of my communication style to be less direct.

                                                                      Closing words

                                                                      Overall, I like working with Automattic. For me, the positives outweigh the negatives, as evidenced by the fact that it’s the longest I’ve been in one position since 2012. Doing remote data science work doesn’t seem particularly different to doing any other sort of non-physical work remotely. I hope that more companies will join Automattic and the growing list of remote companies, and offer their employees the option to work from wherever they’re most productive.

                                                                      Update (March 2019): I also covered similar topics in a Data Science Sydney talk about a day in the life of a remote data scientist.


                                                                        Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                        I have been working remotely for WRI for nearly 2 years, and I can resonate with almost everything you have said. Great blog!
                                                                        Interested. Though not trained as Data scientist yet but as BI consultant with experience over a decade. Let me know if you have any opportunity.
I am working for Accenture as Analyst. The article is very similar to my real life. I pursued data science from top university and worked on few capstone projects.



                                                                        The most practical causal inference book I’ve read (is still a draft)

I’ve been interested in the area of causal inference for the past few years. In my opinion it’s more exciting and relevant to everyday life than more hyped data science areas like deep learning. However, I’ve found it hard to apply what I’ve learned about causal inference to my work. Now, I believe I’ve finally found a book with practical techniques that I can use on real problems: Causal Inference by Miguel Hernán and Jamie Robins. It is available for free from their site, but is still in draft mode. This post is a short summary of the reasons why I think Causal Inference is a great practical resource.

                                                                        One of the things that sets Causal Inference apart from other books on the topic is the background of its authors. Hernán and Robins are both epidemiologists, which means they often have to deal with data with strong limitations on sample size and feasibility of experiments. Decisions driven by causal inference in epidemiology can often make the difference between life and death of individuals. Hence, the book is full of practical examples.

                                                                        The book focuses on randomised controlled trials and well-defined interventions as the basis of causal inference from both experimental and observational data. As the authors show, even with randomised experiments, the analysis often requires using observational causal inference tools due to factors like selection and measurement biases. Their insistence on well-defined interventions is particularly refreshing, as one of the things that bothers me about the writings of Judea Pearl (a prominent researcher of causal inference) is the vagueness of statements like “smoking causes cancer” and “mud doesn’t cause rain”. The need for well-defined interventions was summarised by Hernán in the article Does water kill? A call for less casual causal inferences.

                                                                        Unlike some other resources, Causal Inference doesn’t appear to be too dogmatic about the framework used for modelling causality. I’m not an expert on where each idea originated, but it seems like the authors mix elements from the potential outcomes framework and from Pearl’s graphical models. They also don’t neglect time as an important consideration in cause-and-effect relationships. In fact, the third part of the book is dedicated to the topic of time-varying treatments and effects.

                                                                        The practicality of the book is also demonstrated by the fact that it comes with code examples in multiple languages. In addition, the authors don’t dwell too much on the philosophy of causality. While it is a fascinating topic, the opening paragraphs of the book make its goals clear:

                                                                        By reading this book you are expressing an interest in learning about causal inference. But, as a human being, you have already mastered the fundamental concepts of causal inference. You certainly know what a causal effect is; you clearly understand the difference between association and causation; and you have used this knowledge constantly throughout your life. In fact, had you not understood these causal concepts, you would have not survived long enough to read this chapter–or even to learn to read. As a toddler you would have jumped right into the swimming pool after observing that those who did so were later able to reach the jam jar. As a teenager, you would have skied down the most dangerous slopes after observing that those who did so were more likely to win the next ski race. As a parent, you would have refused to give antibiotics to your sick child after observing that those children who took their medicines were less likely to be playing in the park the next day.

                                                                        Since you already understand the definition of causal effect and the difference between association and causation, do not expect to gain deep conceptual insights from this chapter. Rather, the purpose of this chapter is to introduce mathematical notation that formalizes the causal intuition that you already possess. Make sure that you can match your causal intuition with the mathematical notation introduced here. This notation is necessary to precisely define causal concepts, and we will use it throughout the book.

                                                                        I won’t try to summarise the technical aspects of the book – partly because I don’t fully understand it all, and partly because the book itself is already a summary of a very rich research area. However, I’m likely to go back and reread the book in the future, with the goal of applying the techniques from the book to my work. I’d also like to take Hernán’s causal inference course as a way of practising what I’ve learned from the book. For people who want a non-technical summary of the topics covered by the book, I recommend the article The c-word: Scientific euphemisms do not improve causal inference from observational data. If you’re curious about other (less practical) causality books I’ve read, check out my causal inference resource list and my two previous posts on the topic: Why you should stop worrying about deep learning and deepen your understanding of causality instead and Diving deeper into causality: Pearl, Kleinberg, Hill, and untested assumptions.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                          Hackers beware: Bootstrap sampling may be harmful

Bootstrap sampling techniques are very appealing, as they don’t require knowing much about statistics or opaque formulas. Instead, all one needs to do is resample the given data many times, and calculate the desired statistics. Therefore, bootstrapping has been promoted as an easy way of modelling uncertainty to hackers who don’t have much statistical knowledge. For example, the main thesis of the excellent Statistics for Hackers talk by Jake VanderPlas is: “If you can write a for-loop, you can do statistics”. Similar ground was covered by Erik Bernhardsson in The Hacker’s Guide to Uncertainty Estimates, which provides more use cases for bootstrapping (with code examples). However, I’ve learned in the past few weeks that there are quite a few pitfalls in bootstrapping. Much of what I’ve learned is summarised in a paper titled What Teachers Should Know about the Bootstrap: Resampling in the Undergraduate Statistics Curriculum by Tim Hesterberg. I doubt that many hackers would be motivated to read a paper with such a title, so my goal with this post is to make some of my discoveries more accessible to a wider audience. To learn more about the issues raised in this post, it’s worth reading Hesterberg’s paper and other linked resources.

                                                                          For quick reference, here’s a summary of the advice in this post:

                                                                          • Use an accurate method for estimating confidence intervals
                                                                          • Use enough resamples – at least 10-15K
                                                                          • Don’t compare confidence intervals visually
                                                                          • Ensure that the basic assumptions apply to your situation

                                                                          Pitfall #1: Inaccurate confidence intervals

                                                                          Confidence intervals are a common way of quantifying the uncertainty in an estimate of a population parameter. The percentile method is one of the simplest bootstrapping approaches for generating confidence intervals. For example, let’s say we have a data sample of size n and we want to estimate a 95% confidence interval for the population mean. We take r bootstrap resamples from the original data sample, where each resample is a sample with replacement of size n. We calculate the mean of each resample and store the means in a sorted array. We then return the 95% confidence interval as the values that fall at the 0.025r and 0.975r indices of the sorted array (i.e., the 2.5% and 97.5% percentiles). The following table shows what the first two resamples may look like for a data sample of size n=5.

         Original sample    Resample #1    Resample #2
Values   10                 30             20
         12                 20             20
         20                 12             30
         30                 12             30
         45                 45             30
Mean     23.4               23.8           26
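To make this concrete, here is a minimal sketch of the percentile method in Python, assuming only NumPy; the function name and defaults are mine, not taken from any of the packages mentioned below.

import numpy as np

def percentile_ci(sample, stat=np.mean, r=10_000, alpha=0.05, seed=0):
    # Illustrative percentile bootstrap: resample with replacement r times,
    # compute the statistic on each resample, and return the alpha/2 and
    # 1 - alpha/2 percentiles of the resulting bootstrap distribution.
    rng = np.random.default_rng(seed)
    sample = np.asarray(sample)
    stats = np.array([
        stat(rng.choice(sample, size=sample.size, replace=True))
        for _ in range(r)
    ])
    return np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)])

# Prints the 2.5% and 97.5% percentiles of the bootstrap means for the
# toy sample shown in the table above.
print(percentile_ci([10, 12, 20, 30, 45]))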

                                                                          The percentile method is nice and simple. Any programmer should be able to easily implement it in their favourite programming language, assuming they can actually program. Unfortunately, this method is just not accurate enough for small sample sizes. Quoting Hesterberg (emphasis mine):

                                                                          The sample sizes needed for different intervals to satisfy the “reasonably accurate” (off by no more than 10% on each side) criterion are: n ≥ 101 for the bootstrap t, 220 for the skewness-adjusted t statistic, 2,235 for expanded percentile, 2,383 for percentile, 4,815 for ordinary t (which I have rounded up to 5,000 above), 5,063 for t with bootstrap standard errors and something over 8,000 for the reverse percentile method.

                                                                          In a shorter version of the paper cited above, Hesterberg concludes that:

                                                                          In practice, implementing some of the more accurate bootstrap methods is difficult (especially those not described here), and people should use a package rather than attempt this themselves.

                                                                          In short, make sure you’re using an accurate method for estimating confidence intervals when dealing with sample sizes of less than a few thousand values. Using a package is a great idea, but unfortunately I don’t know of any Python bootstrapping package that is feature-complete: ARCH and scikits-bootstrap support advanced confidence interval methods but don’t support analysis of two samples of uneven sizes, while bootstrapped works with samples of uneven sizes but only supports the percentile and the reverse percentile method (which Hesterberg found to be even less accurate). If you know of any better Python packages, please let me know! (I don’t use R, but I suspect the situation is better there). Update: ARCH now supports analysis of samples of uneven sizes following an issue I reported. It seems to be the best Python bootstrapping package, so I recommend using it.
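For completeness, here is a hypothetical usage sketch of ARCH's IIDBootstrap. The argument names and the 'bca' method string are written from memory rather than checked against the current documentation, so treat them as assumptions and consult the package docs before relying on them.

import numpy as np
from arch.bootstrap import IIDBootstrap

data = np.array([10, 12, 20, 30, 45])
bs = IIDBootstrap(data)
# 'bca' is meant to request the bias-corrected and accelerated interval,
# one of the more accurate methods Hesterberg discusses; reps controls
# the number of resamples (see Pitfall #2 below).
ci = bs.conf_int(np.mean, reps=10_000, method='bca')
print(ci)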

                                                                          Pitfall #2: Not enough resamples

                                                                          Accurate bootstrap estimates require a large number of resamples. Many code snippets use 1,000 resamples, probably because it looks like a large number. However, seeming large isn’t enough. Quoting Hesterberg again:

                                                                          For both the bootstrap and permutation tests, the number of resamples needs to be 15,000 or more, for 95% probability that simulation-based one-sided levels fall within 10% of the true values, for 95% intervals and 5% tests. I recommend r = 10,000 for routine use, and more when accuracy matters.

                                                                          […]

                                                                          We want decisions to depend on the data, not random variation in the Monte Carlo implementation. We used r = 500,000 in the Verizon project.

That’s right, half a million resamples! Accuracy mattered in the Verizon case, as the results of the analysis determined whether large penalties were paid or not. In short, use at least 10,000-15,000 resamples to be safe. Don’t use 1,000.
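One way to convince yourself is to fix the data, rerun the same bootstrap a few times, and watch how much the interval endpoints move due to Monte Carlo noise alone. Here is a rough sketch along those lines; the synthetic data and the choice of five repeats are purely illustrative.

import numpy as np

rng = np.random.default_rng(0)
data = rng.exponential(scale=10, size=100)  # an arbitrary skewed sample

def percentile_ci(sample, r, rng):
    # Same simple percentile bootstrap as in Pitfall #1.
    means = np.array([
        rng.choice(sample, size=sample.size, replace=True).mean()
        for _ in range(r)
    ])
    return np.percentile(means, [2.5, 97.5])

for r in (1_000, 15_000):
    # With the data held fixed, the only variation across repeats is the
    # Monte Carlo noise from resampling, which shrinks as r grows.
    repeats = np.array([percentile_ci(data, r, rng) for _ in range(5)])
    print(r, repeats.round(2))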

                                                                          Pitfall #3: Comparison of single-sample confidence intervals

                                                                          Confidence intervals are commonly used to decide if the difference between two samples is statistically significant. Bootstrapping provides a straightforward way of estimating confidence intervals without making assumptions about the way the data was generated. For example, given two samples, we can obtain confidence intervals for the mean of each sample and end up with a plot like this:


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                            “the basic bootstrap makes no assumption about the underlying distribution of the data”: I suppose bootstrapping per se doesn’t, but some things people like to use it for do. For example, suppose the original sample is from a Cauchy distribution, and bootstrapping is used to compute a confidence interval around the sample mean; no matter how many bootstrap replicates are used, the computed interval is worthless, because the original distribution doesn’t have a mean. Of course, that’s an extreme case unlikely to arise in practice, but it immediately raises doubt that guidelines like “n ≥ 101 for the bootstrap t” should be applied uncritically. As you obviously agree, it’s best to know and think about where the data came from when deciding which statistical methods to apply.
                                                                            Learned a lot from the post. One question on the CIs for means and the difference between the means. If the two CIs for the means do not overlap, does it always imply that the difference is significant? Or can the error go in both ways, meaning that it is possible to have non-overlapping CIs and the CI of the difference includes 0?

                                                                            Reblogged this on Boris Gorelik and commented:

Anything is better when bootstrapped. Read my co-worker’s post on bootstrapping. Also make sure to follow the links Yanir gives to support his claims.


                                                                            Bootstrapping the right way?

                                                                            Bootstrapping the right way is a talk I gave earlier this year at the YOW! Data conference in Sydney. You can now watch the video of the talk and have a look through the slides. The content of the talk is similar to a post I published on bootstrapping pitfalls, with some additional simulations.

                                                                            The main takeaways shared in the talk are:

                                                                            • Don’t compare single-sample confidence intervals by eye
                                                                            • Use enough resamples (15K?)
                                                                            • Use a solid bootstrapping package (e.g., Python ARCH)
                                                                            • Use the right bootstrap for the job
                                                                            • Consider going parametric Bayesian
                                                                            • Test all the things

                                                                            Testing all the things typically requires writing code, which I did for the talk. You can browse through it in this notebook. The most interesting findings from my tests are summarised by the following figure.
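The figure isn’t reproduced in this extract. As a pointer for the “solid bootstrapping package” takeaway above, here is a minimal sketch of computing a bootstrapped confidence interval with the Python ARCH package (the data is illustrative, and argument names may vary slightly between ARCH versions):

import numpy as np
from arch.bootstrap import IIDBootstrap

# Illustrative sample: a skewed metric such as revenue per user.
rng = np.random.default_rng(42)
sample = rng.lognormal(mean=1.0, sigma=1.5, size=1_000)

# BCa confidence interval for the mean, using plenty of resamples.
bootstrap = IIDBootstrap(sample)
ci = bootstrap.conf_int(np.mean, reps=15_000, method="bca", size=0.95)
print(ci)  # 2x1 array: first row is the lower bound, second row the upper bound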



Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                              Reblogged this on Boris Gorelik and commented:

Many years ago, I terribly overfit a model, which caused losses of a lot of shekels (a LOT). It’s not that I wasn’t aware of the potential overfitting. I was. Among other things, I used several bootstrapping simulations. It turns out that I applied bootstrapping in the wrong way. My particular problem was that I “forgot” about confounding parameters and that I “forgot” that peeping into the future is a bad thing.

                                                                              Anyhow, Yanir Seroussi, my coworker data scientist, gave a very good talk on bootstrapping.


                                                                              A day in the life of a remote data scientist

                                                                              Earlier this year, I gave a talk titled A Day in the Life of a Remote Data Scientist at the Data Science Sydney meetup. The talk covered similar ground to a post I published on remote data science work, with additional details on my daily schedule and projects, some gifs and Sydney jokes, heckling by the audience, and a Q&A session. I managed to watch it a few months ago without cringing too much, so it’s about time to post it here. The slides are on my GitHub, as is my list of established remote companies, which you may find useful if you want to join the remote work fun.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                Software commodities are eating interesting data science work

                                                                                The passage of time makes wizards of us all. Today, any dullard can make bells ring across the ocean by tapping out phone numbers, cause inanimate toys to march by barking an order, or activate remote devices by touching a wireless screen. Thomas Edison couldn’t have managed any of this at his peak—and shortly before his time, such powers would have been considered the unique realm of God.

Rob Reid


                                                                                Being a data scientist can sometimes feel like a race against software innovations. Every interesting and useful problem is bound to become a software commodity. My story seems to reflect that: From my first steps in sentiment analysis and topic modelling, through building recommender systems while dabbling in Kaggle competitions and deep learning a few years ago, and to my present-day interest in causal inference. What can one do to remain relevant in such an environment? Read this post to find out.

                                                                                Highlights from my past

                                                                                When I started my PhD in 2009, the plan was to work on sentiment analysis of opinion polls. This got me into applied machine learning using Java and Weka, with which I made some modest contributions to the field. Today, researching sentiment analysis would feel somewhat pointless, given the plethora of sentiment analysis services. Sentiment analysis is a commodity – using it in practice is a software engineering problem.
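To illustrate how commoditised this has become, here is a hedged sketch of off-the-shelf sentiment analysis with an open-source library (a default pretrained model is downloaded on first use, and the exact output depends on the library version; this is not how my PhD work was implemented):

from transformers import pipeline

# Load a default pretrained sentiment model.
sentiment = pipeline("sentiment-analysis")
print(sentiment("I learned a lot from this post!"))
# e.g., [{'label': 'POSITIVE', 'score': 0.99...}]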

                                                                                Moving forward in my PhD, I got into topic modelling. I learned about Bayesian statistics and conjugate priors. I went through the arduous process of solving integrals by hand and coding a custom Gibbs sampler for the models I specified. Today, I probably wouldn’t bother with the maths. Instead, I’d specify the model and let a probabilistic programming tool like pymc3 or Stan handle the rest. Bayesian inference is now a commodity that’s accessible to any hacker.
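For instance, here is a minimal sketch of handing inference over to a probabilistic programming tool (PyMC3, with illustrative data and a deliberately simple model rather than a topic model; parameter names may differ slightly between versions):

import numpy as np
import pymc3 as pm

# Illustrative data: noisy observations of an unknown mean.
observed = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=100)

with pm.Model():
    mu = pm.Normal("mu", mu=0.0, sigma=10.0)    # prior on the mean
    sigma = pm.HalfNormal("sigma", sigma=5.0)   # prior on the noise scale
    pm.Normal("obs", mu=mu, sigma=sigma, observed=observed)
    # No hand-derived integrals or custom Gibbs sampler needed.
    trace = pm.sample(1_000, tune=1_000)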

                                                                                Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                                This is the same conclusion I reached when deciding between deepening data science skills vs engineering; now I’m deeper into cloud services and off-the-shelf ML tools.
                                                                                Good points, thanks Boris!

                                                                                Hi Yanir!

The post really resonated with me. I find more and more that I do more engineering during my day than science.

I believe the data engineering part, including cloud and full-stack development skills, will prove to be what keeps you relevant in industry. If you combine these with knowledge of which data science and machine learning techniques to use, then you can be unstoppable.

                                                                                Otherwise, as you said, it’s better to stay in academia.

Best, Antonios

                                                                                Hi Yanir,

I am glad I found your post. I am switching careers and want to work with data for social good. I learned data analysis, and am thinking of exploring machine learning to see if it’s something for me. Being an engineer (not tech related), I would definitely be content with your option 1, where I understand what’s going on behind the scenes but don’t get into research and maths.

                                                                                Interesting to know what the trend is in the field.





                                                                                Many is not enough: Counting simulations to bootstrap the right way

Previously, I encouraged readers to test different approaches to bootstrapped confidence interval (CI) estimation. Such testing can be done by relying on the definition of CIs: Given an infinite number of independent samples from the same population, we expect a ci_level CI to contain the population parameter in exactly ci_level percent of the samples. Therefore, we run “many” simulations (num_simulations), where each simulation generates a random sample from the same population and runs the CI algorithm on the sample. We then look at the observed CI level (i.e., the percentage of CIs that contain the true population parameter), and say that the CI algorithm works as expected if the observed CI level is “not too far” from the requested ci_level.

                                                                                Keen observers may notice that the language I used to describe the process isn’t accurate enough. How many is “many” simulations? How far is “not too far”?

                                                                                I made a mistake by not asking and answering these questions before. I decided that num_simulations=1,000 is a reasonable number of simulations, and didn’t consider how this affects the observed CI level. The decision to use num_simulations=1,000 was informed by practical concerns (i.e., wanting the simulations to finish within a reasonable timeframe), while ranges for the observed CI level were determined empirically – by observing the results of the simulations rather than by considering the properties of the problem.

                                                                                The idea of using simulations to test bootstrapped CIs came from Tim Hesterberg’s What Teachers Should Know about the Bootstrap. The experiments presented in that paper used num_simulations=10,000, but it wasn’t made clear why this number was chosen. This may have been due to space limitations or because this point is obvious to experienced statisticians. Embarrassingly, my approach of using fewer simulations without considering how they affect the observed CIs can be seen as a form of Belief in The Law of Small Numbers.

                                                                                Fortunately, it’s not hard to move away from belief in the law of small numbers in this case: We can see a set of simulations as sampling from Binomial(n=num_simulations, p=ci_level), where the number of “successes” is the number of simulations where the true population parameter falls in the CI returned by the CI algorithm. We can define our desired level of confidence in the simulation results as the simulation confidence, and use the simulation confidence interval of the binomial distribution to decide on a likely range for the observed CI level.

                                                                                To make this more concrete, here’s a Python function that gives the observed CI level bounds for different values of num_simulations, given the ci_level and simulation confidence. The output from running this function with the default arguments is plotted below.

import numpy as np
import pandas as pd
import scipy.stats
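The body of the function isn’t included in this extract. As a minimal sketch of the idea described above (the function and argument names here are illustrative, not the original code), the binomial interval from scipy can be used directly:

import pandas as pd
import scipy.stats

def observed_ci_level_bounds(
    num_simulations_options=(100, 1_000, 10_000, 100_000),
    ci_level=0.95,
    simulation_confidence=0.95,
):
    """Likely range of the observed CI level for each simulation count.

    Treats the number of simulations whose CI contains the true parameter
    as a draw from Binomial(num_simulations, ci_level), and returns the
    central simulation_confidence interval for successes / num_simulations.
    """
    rows = []
    for num_simulations in num_simulations_options:
        lower, upper = scipy.stats.binom.interval(
            simulation_confidence, num_simulations, ci_level
        )
        rows.append({
            "num_simulations": num_simulations,
            "lower_bound": lower / num_simulations,
            "upper_bound": upper / num_simulations,
        })
    return pd.DataFrame(rows)

print(observed_ci_level_bounds())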
                                                                                 

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                  Some highlights from 2020

                                                                                  My track record of posting here has been pretty poor in 2020, partly because of a bunch of content I’ve contributed elsewhere. In general, my guiding principle for posting is to only add stuff I’d want to read or cite, e.g., because I haven’t seen it discussed elsewhere. Well, no one has compiled a meta-post of my public work from 2020 (that I know of), so it’s finally time to publish it myself.

                                                                                  Remote work. I’ve been working remotely with Automattic since 2017, so I was pretty covid-ready as far as work was concerned. The main thing that’s changed for me is being unable to meet my colleagues in person. Looking back at the interview I did with BuiltIn from March 2020, it’s somewhat amusing that I was hopeful that we’d get to travel in May 2020, as business trips are still on hold a year later. Outside Automattic, it was interesting to see how quickly remote work has become commonplace, to the point where my curated list of established remote companies now seems irrelevant. Also, my June webinar with Felipe Flores on running remote teams is probably dated now that many more people have hands-on experience with remote work. The world has adapted quickly, though it seems like Automattic’s globally-distributed model is still quite unusual. Instead, many companies have switched to a locally-remote model, hiring remotely within the same country or timezone region. Considering the coordination costs of globally-distributed teams and the impact of frequent long-haul flights on employee wellbeing and on our environment, it may turn out that the locally-remote model is more sustainable in the long term. Only time will tell.

                                                                                  Sustainability. The Australian bushfires of 2019-20 provided me with extra motivation to help nudge Automattic to do more in the fight against climate change. The initial covid-19 lockdown provided me with extra free time to make the measurement and offsetting of Automattic’s emissions from data centre power use happen. I summarised this work in a post on the company’s blog, and discussed it in an interview with PublishPress. If there’s one key reason why I haven’t posted more here, it’s that the sustainability work always seems more worthwhile. I hope to continue working in the area in 2021, so the frequency of posts here is likely to remain about the same.



Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                    My work with Automattic

                                                                                    Bing thinks I looked like this while working at Automattic.


                                                                                    Automattic is the company behind WordPress.com, Tumblr, Jetpack, WooCommerce, and several other products. I worked with Automattic as a Type B Data Scientist (i.e., I mostly built and deployed code to production) from May 2017 to October 2021. This post is back-dated to my last day with the company to make it fit nicely into my post timeline, but I’m actually writing this in July 2023. The magic of time travel! 🪄

                                                                                    A nice perk of working with Automattic was getting to write about my work on company blogs. When my website was on WordPress.com, I used the reblogging feature to share those posts here, but they never looked great. One of the first projects I completed after leaving Automattic was migrating my site from WordPress.com to Hugo, which made the reblog posts look even worse. Now all those reblogs redirect here, thanks to Hugo’s aliases feature.

                                                                                    Anyway, here are some highlights from my Automattic work along with links to the relevant posts:

On this website, you can also read about how I ended up joining Automattic and about some of the reasons behind my decision to leave the company.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                      Migrating from WordPress.com to Hugo on GitHub + Cloudflare

                                                                                      Last month, I left Automattic (the company behind WordPress.com) after about 4.5 years of working there as a data scientist. As I am moving back into independent consulting, I decided it was time to give my website a facelift and start posting more often. The biggest part of the facelift was migrating off WordPress.com – I now use Hugo for site generation and GitHub + Cloudflare for hosting. This post summarises my reasons for switching and some technical choices I made, which may be useful for people who are considering a similar migration.

                                                                                      Why switch from WordPress.com to Hugo?

                                                                                      The easiest short-term choice would have been to stick with WordPress.com and spend more time on publishing new posts and working on other projects. However, if I were to start a new personal site today, it’s unlikely I would choose WordPress.com, i.e., not migrating would have been due to inertia. Given that I had the free time to invest in the migration, it seemed worth doing for the following long-term benefits:


                                                                                      Use your human brain to avoid artificial intelligence disasters

                                                                                      If you don't think about your modelling context, you're gonna have a bad time.

                                                                                      Earlier this year, I helped mentor a local edition of fast.ai’s Practical Deep Learning for Coders. Each mentor gave a brief talk on a given week’s subject, adding to the material covered in the recorded lectures. My talk (embedded below) supplemented the data ethics lesson. While the mere mention of the word ethics can elicit instant yawns from some people, the main message for me is that it’s critical for humans to think about the context and consequences of deploying machine learning models.

                                                                                      Unfortunately, this message sometimes gets muddied amidst the outrage about specific applications that conflict with the values of the outraged parties. But I believe it’s possible to transcend narrow moralities and agree that better outcomes arise when humans think deeply about their deep learning systems. Or to put it more bluntly, any fool can build machine learning models, but it takes thoughtful humans to build good artificial intelligence applications.



Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.



                                                                                        Analysis strategies in online A/B experiments: Intention-to-treat, per-protocol, and other lessons from clinical trials

                                                                                        In theory, there is no difference between theory and practice. In practice, there is.

                                                                                        Benjamin Brewster

                                                                                        Many discussions of online A/B experiments deal with the sunny day scenario: You randomly assign users to groups A and B, expose group A to the control variant and group B to the treatment variant, run statistical tests on your chosen metrics, and assume that metric differences between the groups that aren’t explained by randomness are due to exposure to the treatment.

                                                                                        However, it’s not always a sunny day for the online experimenter. Challenges include dealing with bot traffic and malicious users, and implementation realities that may make users experience both variants or neither of them. While many of these problems have parallels in clinical trials, I haven’t found many resources that explore these parallels. In this post, I share some lessons I learned from the rich clinical trial literature while building Automattic’s experimentation platform, focusing on analysis strategies that deal with deviations from the ideal experiment scenario.

                                                                                        Reminder: Why we run A/B experiments

                                                                                        Uncontrolled versus controlled experiment


                                                                                        While the practice of running online A/B experiments is now commonplace, it’s worth reflecting on why such experiments work. Why can’t we just roll out any treatments we think of, measure the metric changes, and assume that differences beyond what we expect from random variation are due to the genius (or folly) of our implemented treatments?

                                                                                        Well, it’s not that simple because the world isn’t static. Even if we don’t make any changes, we’re likely to see different outcomes from month to month and day to day, as the world and our user population change. This is represented by the top part of the diagram above: While we’re interested in the causal impact of the Treatment on the Outcome, many Unknowns may affect both. That is, without an A/B experiment, the Unknowns act as confounders that make it impossible to estimate the causal effect without further assumptions.

                                                                                        With an ideal A/B experiment, we make exposure to the Treatment depend only on our randomisation mechanism – the Assigner on the bottom part of the diagram. Assuming everything goes to plan, we end up with two distinct groups for which exposure to the Treatment is only due to our randomisation mechanism. This allows us to conclude that any differences in the Outcome across the groups beyond what’s expected from randomness are due to the Treatment.

                                                                                        However, reality is often different from this ideal scenario.

                                                                                        Running example

                                                                                        To make things more concrete, let’s take a simple example: You run a crypto exchange, and you want to maximise signups from one of your landing pages. The current call-to-action text is “sign up”. You’re wondering whether changing it to “sign up today!” would instill a sense of urgency and increase the signup conversion rate (signups divided by unique visitors).

                                                                                        sign up OR sign up today!

                                                                                        A simplified mockup of the variants. Which one would you choose?

                                                                                        Placing this scenario into the above diagram, if we were to simply change the text, i.e., apply the Treatment to everyone, we wouldn’t be able to confidently tell whether the text change was the cause of any observed difference in the conversion rate. For example, if our release coincided with a surge of interest in cryptocurrency, this surge may be one of the Unknowns that would cause more motivated users to come to our exchange and sign up. That is, the surge would affect both exposure to the Treatment and the Outcome.

                                                                                        When we run an ideal A/B experiment, we don’t have this problem. Factors like a surge of interest in crypto don’t affect the assignment of users to the control group A (“sign up”) and the treatment group B (“sign up today!”). We can compare the conversion rates across the groups, estimate random variability with our favourite A/B testing calculator, and rejoice. Right?

                                                                                        Well, not so fast…

                                                                                        Problems, problems…

                                                                                        In the ideal scenario, all the users that were assigned to one of the experiment groups experience their assigned variant and produce a measurable outcome. In our running example, the groups are A: control and B: treatment with a simple exposure of seeing “sign up” for the former and “sign up today!” for the latter. The outcome is a successful signup or an absence of a signup. To make the outcome well-defined, it’s often a good idea to limit outcome measurement to events that happen (or don’t happen) within a reasonable attribution window from exposure or assignment. In our example, a reasonable attribution window is probably on the order of hours, as we don’t expect the call-to-action text to have long-lasting effects.

                                                                                        Potential deviations from the ideal scenario include:

                                                                                        • Assignment of ineligible users. In our running example, these may be bots or users that already have an account. If we include many ineligible users in our analysis, we may underestimate the effect size even if their distribution across groups is uniform.
                                                                                        • Crossovers. These are users that manage to experience both variants. For example, they may come across our site on mobile with the “sign up today!” text, and then switch to desktop and see the “sign up” message. Depending on the instrumentation we have in place, we may not be able to detect such users, or we may only detect them if they sign up on one device and then log in on the other device.
                                                                                        • Assignment without exposure. Due to implementation constraints, we may not be guaranteed that assigned users are actually exposed to the treatment and control. In our running example, it may be that the assignment is done on the backend while exposure happens conditionally and asynchronously on the frontend – some users may bounce in the gap between assignment and exposure, and never see the call-to-action text.
                                                                                        • Multiple exposures. Once a user has been assigned, they may get exposed to the treatment and control multiple times (without crossing over). In our example, they may visit the landing page repeatedly and see the “sign up” or “sign up today!” text multiple times before deciding to sign up.

                                                                                        Epidemiologist jargon and analysis strategies

                                                                                        While clinical trials are more tightly controlled than online A/B experiments, they are also susceptible to problems like assignment of ineligible patients and non-adherence to treatment (e.g., crossover, non-exposure, and multiple exposures). Hence, much has been written on addressing these problems at the analysis stage. However, when researching the topic, overcoming the domain-specific language barrier was a bit of a challenge, as the terminology used by online experimenters is different from the terminology used by epidemiologists. Fortunately, I came across the term intention-to-treat at some point, which opened the door to decades of research on the topic.

                                                                                        Two papers I found useful are Intention-to-treat concept: A review (Gupta, 2011) and Guidelines for estimating causal effects in pragmatic randomized trials (Murray, Swanson, and Hernán, 2019). Seeing Miguel Hernán on the author list was an especially positive signal for me, as he is responsible for some of my favourite resources on causal inference, including the most practical book I’ve read on the topic.

                                                                                        The definitions and guidelines from these two papers provide a solid foundation for thinking about problems of ineligibility and non-adherence. Specifically, Gupta defines intention-to-treat as an analysis strategy “that includes all randomized patients in the groups to which they were randomly assigned, regardless of their adherence with the entry criteria, regardless of the treatment they actually received, and regardless of subsequent withdrawal from treatment or deviation from the protocol.”

                                                                                        There are often good reasons to exclude some randomised participants from analysis. Depending on the exclusions, this may or may not bias the results. The use of conservative exclusions can be described as modified intention-to-treat, which according to Gupta “allows the exclusion of some randomized subjects in a justified way (such as patients who were deemed ineligible after randomization or certain patients who never started treatment). However, the definition given to the modified ITT (mITT) in randomized controlled trials has been found to be irregular and arbitrary because there is a lack of consistent guidelines for its application. The mITT analysis allows a subjective approach in entry criteria, which may lead to confusion, inaccurate results and bias.”

                                                                                        Exclusions and further adjustments are usually an attempt to estimate the per-protocol effect, which is defined by Murray, Swanson, and Hernán as “the effect of receiving the assigned treatment strategies throughout the follow-up as specified in the study protocol.” Unfortunately, obtaining a valid estimate of the per-protocol effect isn’t trivial: “To validly estimate the per-protocol effect, baseline variables which predict adherence and are prognostic for the outcome need to be accounted for, either through direct adjustment or via an instrumental variable analysis. Yet two commonly used analytic approaches do not incorporate any such adjustment: (1) Naïve per-protocol analysis, that is, restricting the analytic subset to adherent individuals; and (2) As-treated analysis, that is, comparing individuals based on the treatment they choose.” In other words, if we’re not careful, the per-protocol analysis may become analogous to an uncontrolled experiment, as depicted at the top of the diagram above.

                                                                                        What should be done in practice?

                                                                                        From my reading of the clinical trial literature, the tendency is to use multiple analysis strategies. For example, the first guideline noted by Murray, Swanson, and Hernán is: “To adequately guide decision making by all stakeholders, report estimates of both the intention-to-treat effect and the per-protocol effect, as well as methods and key conditions underlying the estimation procedures.” This echoes the 1988 US FDA guidelines that require applicants to provide an intention-to-treat analysis in addition to the applicant’s preferred per-protocol analyses. Similarly, the 1998 European Medicines Agency guidelines provide more details on the intention-to-treat, modified intention-to-treat, and per-protocol strategies, stating that: “In general, it is advantageous to demonstrate a lack of sensitivity of the principal trial results to alternative choices of the set of subjects analysed. […] When the full analysis set and the per protocol set lead to essentially the same conclusions, confidence in the trial results is increased, bearing in mind, however, that the need to exclude a substantial proportion of subjects from the per protocol analysis throws some doubt on the overall validity of the trial.”

                                                                                        While the stakes in online experiments are typically much lower than in human drug approval, I believe that applying multiple analysis strategies is still a great idea. We did that for Automattic’s experimentation platform, where we flagged discrepancies between the strategies if they led to conflicting conclusions. One downside of this approach is that it complicates the presentation of results in comparison to using a single strategy. If you face the same challenge, you may draw inspiration from seeing how it’s addressed by the open source frontend of Automattic’s experimentation platform.

Going back to our running example, we can perform the following analyses to deal with the deviations noted above (a rough code sketch of the corresponding population filters follows the list):

                                                                                        • Intention-to-treat. Includes all users based on their initial group assignment, regardless of what variant they were exposed to.
                                                                                        • Modified intention-to-treat: No ineligible users. This applies to cases where we detect the ineligibility after assignment, but the eligibility criteria are based on factors that could have been known before the experiment. Hence, it should be safe to exclude the ineligible users after the fact. In our example, excluding bots and existing users should increase the observed effect size, but not change the preferred variant.
                                                                                        • Modified intention-to-treat: No crossovers. If we have a mechanism to detect some crossovers, excluding them and comparing the results to the intention-to-treat analysis may uncover implementation bugs. It’s worth noting that crossovers shouldn’t occur in cases where we can uniquely identify users at all stages of the experiment – it is a problem that is more likely to occur when dealing with anonymous users, as in our landing page example. As such, and given the inability to detect all crossovers, A/B experiments should be avoided when users are highly motivated to cross over. For example, displaying different price levels based on anonymous and transient identifiers like cookies is often a bad idea.
                                                                                        • Naive per-protocol: Exposed users. For this analysis, we’d only include users that were exposed to the control and treatment texts. As noted by Murray, Swanson, and Hernán, this is naive because we should adjust our estimates based on variables that predict exposure. However, if missing exposures are only due to the inherent limitations of online experiments, this falls more under the modified intention-to-treat criterion noted by Gupta, of excluding “patients who never started treatment”. Things get more complicated if we wish to use each exposure as a distinct starting point for measuring multiple assignment windows (the multiple exposures scenario above), which is akin to patients choosing their own dosage – far from a controlled experiment. For automated analysis, it’s better to use the first exposure as the attribution window start, as it should be unaffected by the experiment variants.
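
As promised above, here is a minimal sketch of how these analysis populations could be derived from an assignment log. It is illustrative only: the pandas DataFrame, the column names (is_bot, is_existing_user, crossed_over, was_exposed), and the signed_up metric are all hypothetical, not part of any particular experimentation platform.

```python
import pandas as pd

# Hypothetical assignment log: one row per randomised user.
assignments = pd.DataFrame({
    "user_id": [1, 2, 3, 4, 5, 6],
    "variant": ["control", "treatment", "control", "treatment", "control", "treatment"],
    "is_bot": [False, False, True, False, False, False],
    "is_existing_user": [False, True, False, False, False, False],
    "crossed_over": [False, False, False, True, False, False],
    "was_exposed": [True, True, False, True, True, False],
    "signed_up": [1, 0, 0, 1, 0, 1],
})

populations = {
    # Intention-to-treat: everyone who was randomised, analysed as assigned.
    "intention-to-treat": assignments,
    # Modified ITT: drop users who were never eligible (bots, existing users).
    "mITT (eligible only)": assignments[
        ~(assignments["is_bot"] | assignments["is_existing_user"])
    ],
    # Modified ITT: drop detected crossovers.
    "mITT (no crossovers)": assignments[~assignments["crossed_over"]],
    # Naive per-protocol: only users who were actually exposed to their variant.
    "naive per-protocol": assignments[assignments["was_exposed"]],
}

# Compare the signup rate per variant under each analysis strategy.
for name, population in populations.items():
    rates = population.groupby("variant")["signed_up"].mean()
    print(f"{name}: {rates.to_dict()}")
```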

                                                                                        For all analysis approaches, it’s critical to verify that there is no sample ratio mismatch in the analysed population, i.e., that the distribution of users across variants matches what we expect from a random assignment. If this isn’t the case, manual analysis by a qualified data scientist is needed. The result of this manual analysis may be that the results should be discarded, as sample ratio mismatches are a common indicator of implementation bugs. This is discussed in detail in the book Trustworthy Online Controlled Experiments, which also includes a chapter on exposure-based analysis (called triggering in the book). Among other recommendations, the authors suggest analysing the unexposed users. If everything goes as expected, metrics for the assigned-but-unexposed populations would behave like A/A experiment metrics, i.e., any differences between the groups should be due to random variability.
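
As a simple illustration of such a check (not necessarily how any specific platform implements it), a chi-squared goodness-of-fit test can compare the observed assignment counts to the planned split; the counts below are made up.

```python
from scipy.stats import chisquare

# Hypothetical assignment counts for a planned 50/50 split.
observed = [10_300, 9_700]
total = sum(observed)
expected = [total * 0.5, total * 0.5]

stat, p_value = chisquare(f_obs=observed, f_exp=expected)
print(f"chi2 = {stat:.2f}, p = {p_value:.5f}")

# A tiny p-value (e.g., below 0.001) points to a sample ratio mismatch,
# which usually means investigating for bugs rather than trusting the results.
if p_value < 0.001:
    print("Possible sample ratio mismatch - investigate before drawing conclusions.")
```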

                                                                                        Having rigorous consistency checks in place and falling back to manual analysis when any discrepancies are detected should help avoid the pitfalls of unsafe user exclusions that’d bias the results. Given the need for careful adjustments to get a valid per-protocol estimate in case anything goes wrong, it is often best to fix any underlying issues and rerun the experiment. Usually, this is much cheaper to do in an online setting than in clinical trials.

                                                                                        Closing thoughts and further reading

                                                                                        Once you move from the theory of experimentation to the practice of running experiments in the real world, you discover the many complexities involved in doing it well. This applies whether you’re an epidemiologist or an online experimenter. As noted in the preface to the trustworthy experiments book: “Getting numbers is easy; getting numbers you can trust is hard!”

This post only scratched the surface of one area of experimentation: Deciding what population to analyse once the experiment has been run. There is, of course, a lot more to online experimentation and causal inference than what I could cover here. But I hope that this message is clear: Approach experimentation with humility, and aim to learn from a broad set of teachers rather than limit yourself to the relatively recent developments in online experiments.

                                                                                        As mentioned above, some resources that are worth reading to learn more include my favourite causal inference book, the trustworthy experiments book, and the guidelines for pragmatic trials. There are also a bunch of resources on my causal inference list, and my post on Bayesian A/B testing should be of interest if you made it to this point. Finally, I’m always happy to discuss these topics, so feel free to contact me or leave a comment with your thoughts.


                                                                                        Cover image by Tumisu from Pixabay


\ No newline at end of file
diff --git a/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html b/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
index 94e0ed36b..23f5c38fd 100644
--- a/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
+++ b/2022/03/20/building-useful-machine-learning-tools-keeps-getting-easier-a-fish-id-case-study/index.html
@@ -1,5 +1,5 @@
 Building useful machine learning tools keeps getting easier: A fish ID case study | Yanir Seroussi | Data & AI for Startup Impact

                                                                                          Building useful machine learning tools keeps getting easier: A fish ID case study

                                                                                          Being a data scientist is a constant struggle with FOMO (fear of missing out): While you spend your time and attention on one tool, technique, or domain, dozens of other areas keep advancing at breakneck speed. It is impossible to keep up with everything. Fortunately, some advancements make it easy for a single person to accomplish tasks that previously required a team of experts. I covered some aspects of this phenomenon in a previous post: Software commodities are eating interesting data science work. Today’s post covers a specific case study, of how I recently overcame some of my deep learning FOMO by building a fish ID web app.

                                                                                          Background

                                                                                          Until October last year, I was working as a data scientist with Automattic. I was with the company for about 4.5 years in total. In my final two years, I was the tech lead for the company’s unified experimentation platform. In the two years prior to that, I co-led the implementation of the company’s machine learning pipeline. My interest in causal inference was one of the reasons I got involved with the unified experimentation platform, but this involvement meant I neglected my machine learning skills. Similarly, the machine learning pipeline I worked on was focused on marketing applications with tabular data. This meant that there was no need for me to do anything in computer vision or deep learning for many years. In fact, the last time I touched computer vision was due to deep learning FOMO, back in 2015.

                                                                                          Around the middle of last year, I helped mentor a local edition of the fast.ai Practical Deep Learning for Coders course. I figured it’d help me catch up on some recent developments, while helping others in the community. Given my hobby of volunteering as a scuba diver with the Reef Life Survey (RLS) project, it seemed like a good opportunity to do a side project around automated fish ID. However, the reality of full-time remote work meant that I had little motivation to spend extra time in front of the computer, so that side project never got off the ground.

                                                                                          Fortunately, I decided to leave Automattic and pursue work that better aligns with my values and interests. Rather than jumping into another full-time role, I decided to spend some time exploring and learning – a great antidote to the data science FOMO. First on the agenda after migrating my site off WordPress.com was making progress on the automated fish ID project. While it is still experimental, it’s now live on the RLS website, with the code available in my deep-fish repo.

                                                                                          The fish ID tool

As far as machine learning applications go, the tool I built isn’t groundbreaking – and that’s exactly the point. Many machine learning apps are boring and “uncool” (fulfilling fast.ai’s goal of making neural nets uncool again). But such apps are often useful. In my case, the tool scratches an itch felt by many RLS volunteers and other divers: Given a photo taken at a certain location, what fish is in the photo?

                                                                                          The tool relies on a classification model trained on images from the RLS website. In addition to the model, it lets users filter results based on previously-observed species at RLS sites. The following video demonstrates how it works:

                                                                                          I built the computer vision model with fast.ai, and the web app with Streamlit. It only took a couple of weeks to put everything together, and it could have easily been faster if I hadn’t taken the time to understand the underlying modelling code and tinker with various things. I’m sure that the model can be improved – my initial modelling attempts yielded a top-10 accuracy of about 60%, which I subsequently improved to about 72%. The main challenge is that there are 6,628 images and 2,167 species in the dataset I used, so it’s likely that some species can’t be identified reliably from the available training images.
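
For readers curious what this looks like in code, here is a minimal fast.ai training sketch. It is not the actual code from the deep-fish repo – the folder layout, architecture, and epoch count are assumptions for illustration – but it captures the shape of the fine-tuning workflow.

```python
from functools import partial

from fastai.vision.all import *  # fast.ai's idiomatic star import

# Assume photos are organised as images/<species_name>/<photo>.jpg (hypothetical layout).
dls = ImageDataLoaders.from_folder(
    "images",
    valid_pct=0.2,          # hold out 20% of the photos for validation
    item_tfms=Resize(224),  # resize every image to 224x224
    seed=42,
)

# Fine-tune a pre-trained ResNet18, tracking top-1 and top-10 accuracy.
learn = vision_learner(
    dls,
    resnet18,
    metrics=[accuracy, partial(top_k_accuracy, k=10)],
)
learn.fine_tune(5)

# Identify the species in a new photo (path is hypothetical).
species, _, probs = learn.predict("new_fish_photo.jpg")
print(species, f"{probs.max().item():.1%}")
```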

                                                                                          You can read through my modelling experiments in the project’s notebooks. Copyright for the images belongs to the photographers, so I can’t share the full dataset.

                                                                                          Lessons learned

                                                                                          Rather than writing too much about the model and the code, which aren’t too unusual, I’d like to share a few lessons I learned while working on this project.

                                                                                          1. Getting reasonable performance out of a deep learning model can be cheap and easy. This lesson is highlighted in the introduction to the fast.ai course: With a few lines of code (and the right data), it’s easy to train reasonable models. It can also be cheap: I only used my laptop’s GPU for most of the experiments, and relied on Kaggle’s free notebook environment for experiments that I couldn’t run locally. On my dataset, I found that training a bigger (ResNet50) model with Kaggle didn’t improve accuracy in comparison to the smaller (ResNet18) model I could fit into my laptop’s GPU memory. This would definitely vary by dataset, but the point is that reasonable performance doesn’t necessarily require much human or computer work. In fact, much of the time I spent on modelling was for my own benefit, to better understand the material taught by fast.ai. Conceptually, I was pleased to discover that many things remained the same since my last foray into computer vision: Reasonable performance can be obtained by using established techniques and pre-trained architectures, while focusing on the data, the modelling pipeline, and augmentations. In my experience, this principle applies to many machine learning problems. This is summarised well by the directive from Google’s Rules of Machine Learning to “do machine learning like the great engineer you are, not like the great machine learning expert you aren’t.”

2. Building a Streamlit UI feels like magic. I’d heard about Streamlit years ago, but this was my first time using it. I was impressed with how quickly I could put together a useful app using only Python. I went from a vague idea to a pretty complete implementation in a day (with some additional tinkering in subsequent days). It really is a game changer for data scientists.
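
To show why it feels like magic, here is a minimal sketch of an app in the spirit of the fish ID tool. The predict_species function is a stand-in for the real classifier, and none of this is code from the actual app.

```python
import streamlit as st
from PIL import Image


def predict_species(image: Image.Image) -> list[tuple[str, float]]:
    """Stand-in for the real model: return (species, probability) pairs."""
    return [("Hypothetical wrasse", 0.42), ("Hypothetical damselfish", 0.17)]


st.title("Fish ID (demo sketch)")

uploaded = st.file_uploader("Upload a fish photo", type=["jpg", "jpeg", "png"])
if uploaded is not None:
    image = Image.open(uploaded)
    st.image(image, caption="Your photo")

    st.subheader("Top matches")
    for species, prob in predict_species(image):
        st.write(f"{species}: {prob:.0%}")
```

Running `streamlit run app.py` serves this as a web page with no HTML, CSS, or JavaScript required – which is most of the magic.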

3. Deploying a Streamlit app is a bit less magical. Streamlit Cloud seemed like a straightforward way to deploy Streamlit apps, but I ran into issues because I used a Conda environment. I managed to work around those issues, but it seems like the environment installed on Cloud isn’t truly isolated: Judging by the logs, Streamlit Cloud reads the Conda file and installs the required packages into an existing environment. This results in weird error messages that are hard to debug. I also ran into memory issues, which seem to be un-debuggable with the information provided by Streamlit Cloud. Still, I decided to initially deploy the app to Streamlit Cloud’s free tier and wrap it in an iframe for the RLS website. Given the steep increase in price from the free tier to the lowest paid tier (US$250 / month), it’s likely I’d switch to self-hosting if I run into more issues. This is a disappointing contrast to the magical experience of building the UI, but I hope that Streamlit Cloud will become easier to use over time.

                                                                                          4. The fast.ai library is a great starting point, despite its quirks. Using fast.ai felt a bit like cheating, in the sense captured by xkcd’s Real Programmers comic. Given the hype, it feels like it should be harder to build useful models – real data scientists use PyTorch directly! But no, in reality it makes sense to use the best tool for the job. And there’s nothing wrong with something being easy or fast, as it lets you spend more time elsewhere. In the words of the principles behind the agile manifesto: “Simplicity – the art of maximizing the amount of work not done – is essential.”


                                                                                            Loved this short blog. Planning to transition to Climate tech as a DS guy and am slowly cultivating a pent-up passion for conserving the marine life so too many things that I can relate to here haha
                                                                                            Thank you! Good luck with the transition. 🙂
\ No newline at end of file
diff --git a/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html b/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
index d8fbbe36a..1e1d44edc 100644
--- a/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
+++ b/2022/06/06/the-mission-matters-moving-to-climate-tech-as-a-data-scientist/index.html
@@ -1,5 +1,5 @@
 The mission matters: Moving to climate tech as a data scientist | Yanir Seroussi | Data & AI for Startup Impact

                                                                                            The mission matters: Moving to climate tech as a data scientist

                                                                                            So we are facing the most consequential fork in the road. If we continue as now, we are going to be irreparably going down a course of constant destruction, with much human pain and biodiversity loss. Or we can choose to go in the other direction, a path of reconstruction and regeneration, and at least diminish the negative impacts of climate change to something that is manageable.

                                                                                            But we can only choose it this decade. Our parents did not have this choice, because they didn’t have the capital, technologies and understanding. And for our children, it will be too late. So this is the decade and we are the generation.

Christiana Figueres

                                                                                            Multiple factors contributed to my decision to leave Automattic last year. One factor was that the company’s mission to “democratize publishing and eCommerce” no longer resonated with me: First, publishing and eCommerce are already widely accessible. Second, despite decades of increased access to a wide variety of publication tools, global democracy is declining. Third, a corollary of the mission is hosting publications by the likes of News Corp Australia, an organisation that is harming Australian democracy according to former prime ministers from both sides of politics. Fourth, I believed that there are more pressing problems I could spend my time on.1

Update (2024-08-20): Following the October 7th (2023) attack on Israel and its global aftermath, I have a more nuanced view of the above. In Australia, News Corp ended up playing a balancing role in the face of misinformation, under-reporting, and the anti-Zionist & anti-Jewish sentiments displayed by left-leaning publications like The Guardian and The Australian Broadcasting Corporation. I still think News Corp over-sensationalises many items and is completely off on climate and other topics, but I now see that no single news source is reasonably trustworthy. As such, I see broad access to publishing as better than the alternative of a handful of gatekeepers deciding what deserves coverage.

                                                                                            One such problem is the climate crisis. I was fortunate to have spent a small amount of time on it at Automattic, where I co-founded a sustainability employee group and led the company’s first purchases of carbon offsets and removals. However, this was a side gig.

                                                                                            When I left Automattic, I was hoping to get involved more directly in climate and environmental action. Having recently joined Orkestra – a company whose mission is “to power the world’s energy decision-making” – I figured it’s time to share some thoughts on the climate tech space, along with resources that others may find useful.

                                                                                            Doing more with my climate obsession

                                                                                            I’ve always cared about the environment, but my levels of activity in the area have fluctuated over the years. While it’s no excuse, I suppose that environmental issues often appear too intractable, especially with the growth of human population and of the percentage of humans who live in societies that require continuous economic growth to prosper. Collectively, we still haven’t figured out how to obtain prosperity without growth. Given the magnitude of the problems, even the most influential individuals can only make a relatively small impact on driving solutions.

                                                                                            Still, being unable to do everything doesn’t mean one should do nothing, as the too-small-to-matter excuse can even be applied at the country level. For example, some people claim that given Australia’s small share of global emissions, it can’t play a significant role in addressing climate change. This conveniently neglects the fact that Australia has one of the highest per-capita carbon footprints in the world, and that it is a major exporter of fossil fuels. Clearly, Australia can do more to help achieve the collective goal of keeping global heating below truly dangerous levels. And doing it intelligently would help prosperity, as Australia is uniquely positioned to become a green energy superpower. Given the results of the last federal election, most Australians fall on the “do more” side of the debate.

                                                                                            Anyway, I am not a country, but similar logic applies: I can do more as an individual, even though my personal emissions are negligible when compared to the daunting amount emitted by humanity as a whole.

                                                                                            When it comes to climate action, a couple of key milestones for me were in 2015, when I became more aware of how I could divest from fossil fuels, and in early 2020, when the massive fires in Australia made me want to do something. Among other things, this led me to push for climate action within Automattic, as noted above. It also led me to – somewhat obsessively – consume quite a few resources on the topic. Honourable mentions go to Outrage + Optimism, Volts, TIL Climate, and My Climate Journey – many others are sprinkled throughout this article.

                                                                                            One outcome of the obsession is that I’m more aware of the impacts of climate change, environmental degradation, and government inaction. Massive fires? Climate change increases their frequency and severity. Global pandemics? Habitat loss and greater human-animal interaction increase their probability, while air pollution increases risk from respiratory infections. Widespread floods? Climate change increases flood impact and over-development on floodplains leads to avoidable suffering. More frequent coral bleaching? Increased emissions lead to ocean heatwaves and acidification, while reduced water quality and overfishing certainly don’t help ocean ecosystems.

So yeah, big problems. And one can always do more or better to help. But it’s usually possible to also do less or worse. Therefore, I believe in doing more while cutting people slack, as suggested by Sami Grover:

                                                                                            So by all means, skip that next beef burger, or take a pass on that cheap flight to Cancún. But then ask yourself how you can magnify the impact of what you do. Are there campaigns or advocacy groups you can join? Can you talk to friends or family about the shifts you are making? Can you influence policy or practices at your place of work or study? Can you identify barriers to action that are preventing others from joining in?

                                                                                            In so doing, remember to cut yourself, and those around you, some slack. We are not each on an individual journey to slash our footprint to zero. We are on a collective mission to shift the only true footprint that matters: that of society as a whole.

                                                                                            Climate tech and its intersections with data science

                                                                                            When it comes to doing more, one path that a growing number of people seem to take is getting into climate tech. What is climate tech? Good question. To me, defining it is somewhat reminiscent of attempts to define data science, which I’ve tackled in posts from 2014 to 2018.2 In the same way that data science encompassed things that some people have been doing for decades, climate tech is giving a new name to existing activities. Broadly, I’d say that it’s work on technology to reverse, mitigate, and adapt to anthropogenic climate change.

                                                                                            Another parallel I see between data science and climate tech is that many things with tenuous connections to the field get lumped into it, in an attempt to capitalise on its trendiness. I think we’re past the peak of the data science hype, but there was a time when people who had only taken cursory looks at data rebranded as data scientists. Similarly, there are “climate tech” companies out there that may have a negative or neutral impact on fighting climate change. Personally, I’m also skeptical of grouping adaptation efforts under climate tech. For example, dealing with extreme weather events is needed even in a world with a stable climate, so I don’t think such work captures the intention behind climate tech (though it can be valuable).

Most importantly, no matter how you define climate tech and data science, there is a need for data skills to develop technologies that address climate change. And this is where data scientists who are concerned about the climate (like me) can help make a difference. In the words of Saul Griffith:

                                                                                            If you are a tech worker, stop making social media and delivery apps and make software that helps people use less energy, balances the grid, automates the design of solar and wind plants, makes public transit work better, and does other useful things to accelerate our transition to renewables.

                                                                                            A structured approach to making career decisions

                                                                                            I’m fortunate to have skills that are in demand in the current market. I’m also fortunate to be in a financial position that allows me to take unpaid time off. Put together, this means that I have a high degree of freedom to choose how I spend my time.

                                                                                            In the past, I’ve advocated for asking why about every career step. And indeed, I can explain the reasoning behind every point in my resume. Sometimes, a step is due to dumb luck, e.g., I discovered that I was a data scientist in 2012, the year Harvard Business Review deemed it the sexiest job of the 21st century3 – I didn’t plan to become a data scientist when I started my PhD in 2009. And sometimes, a step is more planned – I specifically targeted Automattic as one of the few established fully-remote companies that was hiring data scientists in 2017, as my goals included living outside major cities and having a job that I can hold for more than a year without wanting to run away.

                                                                                            Given that my current position presents more options than I’ve had in the past, I decided to have a look through 80,000 Hours. I’ve been aware of their work for years, but my vague impression was that they’re overly utilitarian. However, digging through their resources, I found that they emphasise the importance of personal fit and well-being, both when it comes to career paths and to problem areas. For example, they aren’t too pushy about choosing the problems that they find most pressing if it doesn’t align with one’s beliefs and values.

                                                                                            The 80,000 Hours website contains a wealth of well-reasoned articles. I found the self-guided course on career planning useful to go through, as it helped me apply their main ideas to my situation. While I don’t feel like it led to a major shift in my views and plans, having more structure and a richer terminology to think through my career decisions is helpful.

                                                                                            That said, one area where I diverge from the 80,000 Hours philosophy is in concern about far-future human extinction. They conclude that climate change is less recommended than other problems as the odds of it leading to human extinction are low. However, working in the climate space should alleviate human suffering in this century and reduce the extinction risks of nonhuman animals. Both of these are important to me, especially given the rich cultural lives of animals like whales and dolphins.


                                                                                              This is a very well written post and it’s great to hear your reasoning on this. Congrats on your new position!
\ No newline at end of file
diff --git a/2022/09/12/causal-machine-learning-book-draft-review/index.html b/2022/09/12/causal-machine-learning-book-draft-review/index.html
index 0cf6c0093..e28d0b15c 100644
--- a/2022/09/12/causal-machine-learning-book-draft-review/index.html
+++ b/2022/09/12/causal-machine-learning-book-draft-review/index.html
@@ -1,5 +1,5 @@
 Causal Machine Learning is off to a good start, despite some issues | Yanir Seroussi | Data & AI for Startup Impact

                                                                                              Causal Machine Learning is off to a good start, despite some issues

                                                                                              DALL·E’s steampunk painting of a data scientist reading a book about causal machine learning.

                                                                                              I was recently given a free eBook copy of the MEAP of Causal Machine Learning. MEAP stands for Manning Early Access Program, where books are published one chapter at a time. While the current version could use better copyediting and proofreading, I’m keen on reading more of the book as it becomes available.

Causal Machine Learning addresses a gap in the causal inference literature: While much has been published on the topic, putting the theory into practice in the real world can be challenging. For example, even though I considered Causal Inference: What If to be the most practical book I’ve read on the topic, I haven’t used much of its content directly. This is partly due to my focus on other areas, e.g., online experimentation and the energy space. But it is also due to the limited availability of sample code and mature packages that can be quickly adapted to my needs. The book aims to address the latter through a code-first approach that utilises Python packages such as Pyro, pgmpy, and DoWhy.
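
To give a flavour of what a code-first approach looks like (this is my own illustration on made-up data, not an example from the book), DoWhy lets you declare assumptions and estimate an effect in a handful of lines:

```python
import numpy as np
import pandas as pd
from dowhy import CausalModel

# Made-up data: a confounder w affects both the treatment t and the outcome y.
rng = np.random.default_rng(0)
n = 5_000
w = rng.normal(size=n)
t = (w + rng.normal(size=n) > 0).astype(int)
y = 2 * t + 3 * w + rng.normal(size=n)
df = pd.DataFrame({"w": w, "t": t, "y": y})

model = CausalModel(data=df, treatment="t", outcome="y", common_causes=["w"])
estimand = model.identify_effect()
estimate = model.estimate_effect(estimand, method_name="backdoor.linear_regression")
print(estimate.value)  # should be close to the true effect of 2
```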

                                                                                              Despite the code-first promise, the book feels a bit slow at getting into the more exciting content. I couldn’t help but compare it to the fast.ai book, which first shows how to build and deploy a custom image classifier, and only then goes into unpacking how it all works. However, despite the verbosity of the first two chapters, by the third chapter things start to get more interesting. At the time of this writing, only chapters 1-3 are available, but upcoming chapters look promising based on the table of contents.

                                                                                              While lacking a production-ready example early in the book is a minor concern, I found the many grammatical errors more distracting. Even though a MEAP is essentially a draft, I think its proofreading level should be higher than that of a blog post.1 This is especially the case for paid content published by an organisation that cares enough to have contacted me to promote the book. As Steven Pinker says in the intro to The Sense of Style:

                                                                                              Style earns trust. If readers can see that a writer cares about consistency and accuracy in her prose, they will be reassured that the writer cares about those virtues in conduct they cannot see as easily. Here is how one technology executive explains why he rejects job applications filled with errors of grammar and punctuation: “If it takes someone more than 20 years to notice how to properly use it’s, then that’s not a learning curve I’m comfortable with.” And if that isn’t enough to get you to brush up your prose, consider the discovery of the dating site OkCupid that sloppy grammar and spelling in a profile are “huge turn-offs.” As one client said, “If you’re trying to date a woman, I don’t expect flowery Jane Austen prose. But aren’t you trying to put your best foot forward?”

                                                                                              Another source of distraction is the choice of variables for some of the toy examples. For instance, one model of blood type inheritance confuses the phenotype and genotype, claiming that “knowing your grandfather’s [blood] type has no benefit in predicting your type once we know your father’s”. However, knowing the grandparents’ blood types can help predict the grandchild’s blood type even when the parent’s blood type is known. The toy example would work if it focused on genotypes, not on the common meaning of blood type as the phenotype (i.e., observable traits). See pages 58-60 in Probabilistic Graphical Models: Principles and Techniques for a less casual presentation of a similar example.


\ No newline at end of file
diff --git a/2022/12/11/chatgpt-is-transformative-ai/index.html b/2022/12/11/chatgpt-is-transformative-ai/index.html
index 91b9d7960..cb12ec0af 100644
--- a/2022/12/11/chatgpt-is-transformative-ai/index.html
+++ b/2022/12/11/chatgpt-is-transformative-ai/index.html
@@ -1,5 +1,5 @@
 ChatGPT is transformative AI | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                ChatGPT is transformative AI

                                                                                                Mage’s interpretation of human brain expanding

                                                                                                I remember the days before Google: Finding answers on the internet was tedious and clunky, I had to switch between search engines or run meta-searches to get workable results, and it still felt like I wasn’t finding all the information that was out there. Then Google came along and everything changed – I felt like I had gained new super-powers.

                                                                                                Using ChatGPT feels at least as transformative as switching from AltaVista to Google. After only a few days of working with ChatGPT, I feel like it has made me much more effective. It’d be hard to go back to pre-ChatGPT life.

                                                                                                It’s worth noting that I tend to be a mid-to-late adopter of shiny new consumer tech. I’m also a bit of an AI hype skeptic. Twitter? Seemed like a useless tool back in the day, and a mostly harmful one these days. Facebook? I resisted for a few years, and reluctantly ended up joining to avoid missing out on real-life social activity. Smartphones? Very useful, but also very distracting – I often have mine on airplane mode to avoid getting sucked in. Crypto? Still too volatile and speculative for me. Dall-E and Stable Diffusion? Fun toys, but not too useful in my everyday life.

                                                                                                ChatGPT is different because it distills information that is out there and makes it relevant to me. I feel like I’m still retaining agency, unlike with social media and other tools that are designed to suck me in. ChatGPT is more like a classic search engine that’s there to help when needed. I’m hooked, but not addicted.

                                                                                                In the past week, my work-related ChatGPT usage included questions about Nginx, Prefect, Python, AWS, React, MySQL, Google Sheets, and probably a few other tools. This makes it vastly more useful than GitHub Copilot, which I stopped using when it became paid. The problem with GitHub Copilot isn’t that it doesn’t provide useful output – some of its code completions feel like pure magic. The issue is more with the interface – it often distracts me from what I’m trying to do. In that sense, it’s less like a copilot and more like a backseat driver.

                                                                                                ChatGPT feels like a helpful copilot, personal assistant, coach, and much more – definitely worth paying for. In addition to technical advice, I asked it questions about the meaning of time, the Joel test for learning designers, rephrasing text, investment, and career-related issues. It wasn’t always correct, but it was often informative and thought-provoking. This is more than can be said for interactions with some humans.

                                                                                                The OpenAI team pretty much nailed the user experience and interface. With an ongoing chat, I can get more useful results by refining my queries. Unlike with a search engine, I don’t need to wade through sometimes-dodgy websites and discrepant interfaces to get what I’m looking for. ChatGPT makes information accessible and useful – like Google’s mission, but often better than Google (though it may catch up).

                                                                                                The exciting and terrifying thing is that the tech is still in its infancy. It’s going to get radically better and different, and disrupt many industries and people. The rise of machine intelligence continues – ChatGPT is a significant transformative AI step.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                                                  I’m amused that the first sample on the home page features the user asking, “this code is not working like i expect - how do i fix it?” In my mind, I hear the voice of HAL (Douglas Rain) answer, “This sort of thing has cropped up before, and it has always been due to human error.”

                                                                                                  I’ve been assuming ChatGPT is just the latest specimen of what typically passes for AI these days, a system with an elaborate model of utterances disconnected from any deeper and richer model of the world to which utterances refer, hence brittle and shallow. (Such systems more or less realize John Searle’s “Chinese room” scenario, although unlike Searle, I don’t think they represent any fundamental limit on AI, merely the current, crude state of the art.) However, you’ve convinced me to try it out.

                                                                                                  Thanks Ralph! It’s definitely still early days, but it feels like a step change in chatbot tech, much like the significant improvements in image recognition from a decade ago. I can see it only getting more capable with all the interaction data that they’re collecting.
                                                                                                  \ No newline at end of file diff --git a/2023/04/21/remaining-relevant-as-a-small-language-model/index.html b/2023/04/21/remaining-relevant-as-a-small-language-model/index.html index 55f174d7c..8a53b15fb 100644 --- a/2023/04/21/remaining-relevant-as-a-small-language-model/index.html +++ b/2023/04/21/remaining-relevant-as-a-small-language-model/index.html @@ -1,5 +1,5 @@ Remaining relevant as a small language model | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                  Remaining relevant as a small language model

Mage (https://www.mage.space/)’s interpretation of horse versus car minimalistic

                                                                                                  Computer science as a field is in for a pretty major upheaval few of us are really prepared for. Programming will be obsolete.

Matt Welsh


                                                                                                  Many of us feel both despair and awe when contemplating recent AI developments: Despair due to the rapid pace of automation that threatens personal and social stability. Awe due to the seemingly-magical ability of computers to outperform most humans on an ever-expanding range of tasks. But there is nothing magical about human intelligence, just as there is no magic formula that makes horses gallop and birds fly. That is, it can all be replicated with the right machinery.

                                                                                                  In its wild early days, Bing Chat referred to a user as “a late version of a small language model”. While there’s more to humans than language, there’s no denying that our language processing abilities are limited by our biology. Meanwhile, computers don’t face the same constraints. This raises the question: As small language models, what can we do that is still of value?


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                                                    Reading of the despair of the many silent/invisible contributors of intellectual property copied, without financial recompense, which went into these LLMs. It’s a training data gold rush/free for all right now, just as unedifying and thoughtless as actual gold rushes in history were. I fear this will also quickly lead to Apple-style closed gardens of proprietary creative talent (think of top artists, writers and thinkers signed up to train AI rather than creating content for direct human consumption). Counter-ML tech like Glaze will only delay the inevitable. https://glaze.cs.uchicago.edu/.

                                                                                                    I can think of some ways where governments might respond, e.g. special taxes and incentives on AI businesses to fund creative academies and collectives, much as public universities are today.

                                                                                                    Thanks John! Yeah, I doubt that tech like Glaze can be made future proof, as they admit on that page. Besides, I lean more towards the view that all creative work is derivative and copying isn’t theft. Copyright mostly protects platforms and businesses rather than individuals. While I empathise with individuals who feel like their work is being exploited without their permission, I don’t see the training of machine learning models as being that different from artists learning from other artists.

                                                                                                    Thoughtful government intervention would be great, but it’s unlikely to be applied in a timely manner or evenly across jurisdictions.

                                                                                                    \ No newline at end of file diff --git a/2023/05/26/how-hackable-are-automated-coding-assessments/index.html b/2023/05/26/how-hackable-are-automated-coding-assessments/index.html index dfdb6b014..16a956f7e 100644 --- a/2023/05/26/how-hackable-are-automated-coding-assessments/index.html +++ b/2023/05/26/how-hackable-are-automated-coding-assessments/index.html @@ -1,5 +1,5 @@ How hackable are automated coding assessments? | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                    How hackable are automated coding assessments?

                                                                                                    Bing’s interpretation of an otter coding furiously in an attempt to pass a coding test

                                                                                                    Update (2024-06-19):
• If you're here for tips on CodeSignal's Industry Coding Framework, one of the best things you can read is


                                                                                                      In the essay The Lesson to Unlearn, Paul Graham makes the claim that students are trained to win by hacking bad tests. That is, to get good grades, one has to avoid spending too much time on material that won’t be turned into test questions. Instead, one’s focus has to be on test-specific study. Students are taught that actual learning is less important than maximising grades. That is the lesson to unlearn.1

                                                                                                      Even though the essay is a few years old, it’s been on my mind recently for two reasons. The first reason is that large language models are excelling in standardised tests: I’m impressed by this progress, but it’s also a reminder of the hackability of such tests and the need to employ critical thinking to stay ahead of the AI automation wave. The second reason is that I did a CodeSignal test myself, which led me to think more deeply on the hackability of automated and timed coding assessments. This post discusses my thoughts on the topic, using CodeSignal’s Industry Coding Framework as a case study. However, most of my observations should apply to similar tests.

                                                                                                      What are hackable tests?

                                                                                                      Hacking a test is different from cheating. Hacking entails following the test’s rules, but optimising your work to exploit its weaknesses and increase your score. It doesn’t necessarily entail changing the underlying properties that the test purports to measure. By contrast, cheating entails behaviours that are prohibited by the test’s rules, such as letting someone else do the test for you, or consulting resources that are defined as off limits.

                                                                                                      A test’s hackability isn’t a binary property. Hackability lies on a scale from unhackable to fully hackable, as demonstrated by the following examples and plot.

                                                                                                      Say we take an adult and measure their height every day around the same time, over a period of a month. We can expect the measurements to have low variance. There’s little the test taker can do to significantly increase their height without cheating. The test is a good representation of the property it aims to measure – an unhackable test.

                                                                                                      On the other end of the scale, say we take the same person and ask them the same set of questions over the course of a month. Our aim is to assess their skills in a subject area such as programming. Given that we’re repeating the same questions, they can find the answers and try to memorise them after each attempt. Assuming they’re sufficiently motivated, we can expect their scores to increase even if they know nothing about programming. This test is highly hackable. It’s hard to say that it accurately reflects the property it purports to measure, i.e., programming skills. This is because scores are strongly influenced by motivation to succeed in the test, as well as short-term memorisation and retrieval abilities.

                                                                                                      An improvement over the unchanged test is generating variations from a set of possible questions.2 While our test taker would benefit from deeper skills in the subject area, they can also improve their scores by learning to recognise patterns in test questions, managing their time well, and memorising recurring elements. Again, we can expect their scores to improve over time and fail to accurately reflect the skills we care about. This gets us into the familiar territory of standardised testing, a category that I believe CodeSignal’s Industry Coding Assessments fall under. That is, tests that are not fully hackable, but still fall short of reflecting the properties they claim to measure.


                                                                                                      \ No newline at end of file diff --git a/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html b/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html index c276dbe76..062dc5514 100644 --- a/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html +++ b/2023/06/30/was-data-science-a-failure-mode-of-software-engineering/index.html @@ -1,5 +1,5 @@ Was data science a failure mode of software engineering? | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                      Was data science a failure mode of software engineering?

                                                                                                      Not sure what’s going on here, but it came from an odd conversation on the topic with Bing. Seems apt.

                                                                                                      The world was a different place in 2012. I had just finished my PhD, and I wasn’t sure what my title should be. My formal academic transcript said that I specialised in artificial intelligence, but back then it seemed silly to call myself an AI Expert. As I was interested in startups, I became the first employee of Giveable, and my title was Data Scientist. This was the year Harvard Business Review declared data scientist to be the sexiest job of the 21st century, so it suited me just fine. I got to do work I found interesting while reaping the benefits of being in an over-hyped profession.

                                                                                                      As data science was a new term, I attempted to decipher its evolving meaning. In 2014, I liked the definition by Josh Wills, who saw it as the intersection of software engineering and statistics. By 2018, I came to see it as the union of many fields, with practitioners who support and drive decisions by employing descriptive analytics, predictive models, and causal inference. In 2020, I reflected on the trend of software commodities displacing interesting data science work. Now, I look back and wonder: Was data science a failure mode of software engineering? That is, did many data science projects repeat classic software engineering mistakes (especially in the early days)?

                                                                                                      Breaking Betteridge’s law of headlines, my answer to these questions is yes. I believe that many instances of data science projects exhibited classic software engineering mistakes, especially in the 2010s. Things appear to be getting better, though. The emergence of professions like data engineering, machine learning engineering, and analytics engineering represents a move away from getting data scientists to fail at software engineering – simply because they need to do less of it. But this isn’t the case everywhere, as data maturity varies across organisations.

                                                                                                      Failure mode examples

                                                                                                      For a data science project to exhibit a failure mode of software engineering, it needs to: (1) have working software as one of its outcomes; and (2) fail in a way that software engineering projects are known to fail.

                                                                                                      Not all data science projects meet my first criterion. Some projects end with a one-off report as their outcome, which is fine if that’s the project’s goal. However, many data science projects aim to deliver software systems that need to operate continuously and reliably. Quoting one of the principles behind the agile manifesto, for projects of the latter type, working software is the primary measure of progress. My sense is that such projects were driving the data science hype, e.g., a personalisation system that automatically increases revenue is both more exciting and more valuable than a one-off report.

                                                                                                      For my second criterion, I’ll discuss some classic software engineering mistakes and how they manifest in data science projects. These come from a list compiled by Steve McConnell in 1996 and updated in 2008. While some mistakes have become less common, many are still repeated to this day. As Jeff Atwood noted in 2007, “classic mistakes are classic because they’re so seductive.” The updated list contains 42 mistakes, so I’ll highlight five I find especially pertinent: unrealistic expectations, heroics, research-oriented development, silver-bullet syndrome, and lack of automated source-code control.

                                                                                                      (M1) Unrealistic expectations. This mistake had the highest exposure index in McConnell’s 2008 report, meaning it’s both frequent and severe. I don’t have solid data on the occurrence of this mistake in data science projects, but unrealistic expectations go hand in hand with an over-hyped field. This is exemplified by the Gartner hype cycle, where technologies hit a peak of inflated expectations followed by a trough of disillusionment. While the general validity of the hype cycle model is questionable, I’ve experienced enough instances of unrealistic expectations and heard enough stories to believe that many data science projects have not escaped this classic mistake.


                                                                                                        \ No newline at end of file diff --git a/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html b/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html index d6ccda1ca..468b3ef51 100644 --- a/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html +++ b/2023/08/28/my-rediscovery-of-quiet-writing-on-the-open-web/index.html @@ -1,5 +1,5 @@ My rediscovery of quiet writing on the open web | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                        My rediscovery of quiet writing on the open web

                                                                                                        Rikoriko cave

                                                                                                        I published my first post on this website almost ten years ago. My motivation was modest: Publicly link to useful stuff I wrote. Recently, I tidied up my homepage and added short descriptions to old posts, which made me reflect on how this website has changed over the years: From a quiet place for sharing some tips and progress reports, through B-list data science “influencing”, and back to a quiet place. This post summarises some of my reflections, in no particular order.

                                                                                                        Views are addictive and never enough. The first time a post I published had over a thousand views, I was excited. Then some of my posts had over ten thousand views, so a mere thousand became a disappointment. I didn’t intentionally optimise for views – it happened because I wrote on popular topics, and distributed my posts through channels that worked at the time. If I had chosen to optimise for popularity, I probably would have grown dissatisfied with view counts in the tens of thousands, and then with hundreds of thousands, and then with millions. Instead, I went backwards in terms of popularity and audience size: I deleted my Twitter account a few years ago because I found the platform unpleasant, and I lost half my followers when I migrated my site from WordPress.com to Hugo two years ago (I couldn’t port non-email followers who were using the proprietary WordPress.com Reader). More importantly, I often write about topics that may be of low appeal to current followers, and don’t invest much effort in getting the word out. I don’t even bother with accurate long-term tracking of views and interactions – I only use Cloudflare Analytics to validate that the website is still working.

                                                                                                        Meaningful engagement is more satisfying than views and likes. I removed the Like functionality when I got my website off WordPress.com. I don’t miss it, as likes have the same addictive “never enough” qualities as views. That said, views and likes are correlated with the amount of more satisfying engagement, which comes in the form of thoughtful comments and private messages. By reducing my distribution efforts, I have also reduced the amount of meaningful engagement, but such is life. I still have other motivators.

                                                                                                        Writing publicly helps me think. Even with a low number of views, the fact that practically anyone in the world can read something I wrote makes me take it more seriously. I put more effort into making myself clear and checking references than if I were to write for myself. In addition, the process of writing often becomes a process of discovery – as I write things down and add structure to a post, my subject becomes clearer to me.

                                                                                                        Writing publicly creates valuable documentation. Even if no one is reading right now, posts on this website remain accessible for years. I often link to my own writing – not (always) out of vanity, but because it’s relevant in a specific context. Recently, I started experimenting with easier-to-produce posts that I share under a today I learned (TIL) section – a format I learned about from following Simon Willison. So far, my TIL section is pretty much documentation for myself, as I put no effort into telling people about specific TIL posts. We’ll see how it goes in the long run.

                                                                                                        Platform independence is awesome (if you have the right skills). Getting my website off WordPress.com a couple of years ago was a bit of a pain, but I love the extra control it gives me. On a platform like WordPress.com, I would have had to pay extra to do something like give all my posts short meta-descriptions and organise them on a single page, as I did recently. The same goes for setting up the TIL section, which was a breeze with Hugo. Being able to have fine-grained control over the rendered content and its structure works well for me, but it’s not for everyone (there’s a reason why a large portion of the web uses WordPress). Still, it has never been easier and cheaper to self-host a static site like mine.

                                                                                                        Durable tech works well for quiet writing. Legacy technologies tend to get a bad rap. Many people prefer building with shiny new tech on shiny new platforms. Publishing on the web is no exception, as trendy ways of sharing content come and go. Twenty years ago, most of today’s social media didn’t exist. How much of it will exist in twenty years? Making any prediction is hard, but I’m willing to bet that twenty years from now, there will still be tools that can serve and render my website (the HTML / CSS / JS output of Hugo), as it exists right now. I wouldn’t make the same bet on nascent social media platforms or on writing-centric platforms such as Substack. This is in line with the Lindy effect, which states that “the future life expectancy of some non-perishable things, like a technology or an idea, is proportional to their current age”. It often makes perfect sense to go for new tech, as it comes with new capabilities. I prefer to be cautious, as I want to focus on what I get out of writing rather than on bouncing between platforms and tools.


                                                                                                          \ No newline at end of file diff --git a/2023/10/25/lessons-from-reluctant-data-engineering/index.html b/2023/10/25/lessons-from-reluctant-data-engineering/index.html index f41048c93..9163b6429 100644 --- a/2023/10/25/lessons-from-reluctant-data-engineering/index.html +++ b/2023/10/25/lessons-from-reluctant-data-engineering/index.html @@ -1,9 +1,11 @@ Lessons from reluctant data engineering | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                          Lessons from reluctant data engineering

                                                                                                          Yanir Seroussi presenting at DataEngBytes Brisbane 2023

                                                                                                          In May 2023, I submitted the following talk abstract to the Brisbane DataEngBytes conference.

                                                                                                          As we all know, solid data engineering is essential to the success of data science and AI applications. And yet, people often get excited about fancy machine learning models and neglect the data engineering layer. This is totally understandable: playing with data in a throwaway notebook is more relaxing than dealing with a data pipeline that keeps finding ways to break in production.

                                                                                                          In this talk, I’ll share lessons on data engineering from a data science perspective. Everywhere I’ve worked, from small start-ups to established companies, I’ve found that I had to do some data engineering if I wanted my work to ever get to production. While I’ve always been reluctant to do too much of it, my engineering background has placed me in a better position to do it than colleagues who started off as analysts and academics.

                                                                                                          You could call my work full-stack data science, reluctant data engineering, or some other data & AI thing. Whatever it is, I hope that my talk will help us all play better with each other, across all layers of the data stack.

                                                                                                          As I don’t identify as a data engineer and have never attended a DataEngBytes conference, I didn’t know whether my talk would fit the agenda. However, it seemed harmless to submit an abstract and see how it goes.

When I got the acceptance notification and realised I had to turn my abstract into a coherent talk, I was a bit wary of lacking a good grasp of who would be in my audience. However, when the full agenda was published, I realised that the focus of the conference wouldn’t be on arcane data engineering knowledge, given that one of the keynotes was titled “How The Full-Stack Data Scientist Is STILL The Sexiest Job”. It turned out that despite the name and tagline (“by data engineers, for data engineers”), DataEngBytes was a great event for all data professionals.

                                                                                                          Here’s the video of the talk (slides):


                                                                                                          Quick summary. I start off with a disclaimer, stating that I am not a data engineer. Then I show evidence that the market values data engineering more than data science, given the ratio of Data Engineer to Data Scientist job ads (x3 in the AU$100-150k compensation range; x4 in the AU$200k+ range).1 I follow that observation with another disclaimer, stating that some of my lessons may be obvious or better learnt the hard way (as I often have to learn and relearn lessons). Then I detail five chronologically ordered snippets and their corresponding lessons:

                                                                                                          1. 2012: My first data science job, where we made mistakes around technology choice and premature optimisation. The lesson is that shiny tech ain’t always shiny. Like all lessons, this one ends with a quote that shows that what I learned wasn’t entirely new. The first quote is by Donald Knuth from 1974: “We should forget about small efficiencies, say about 97% of the time: premature optimization is the root of all evil. Yet we should not pass up our opportunities in that critical 3%.”
                                                                                                          2. 2013: My first head of data science job, where we solved real scaling issues by following principles and adapting solutions to our situation. The lesson is that shiny tech can be transformative; but principles beat tools, which goes with a 1911 quote by Harrington Emerson: “As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.”2
                                                                                                          3. 2015: My first enterprise consulting stint, where I experienced being a not-so-useful data scientist and working with some not-so-useful data engineers. This led me to dabble in “shadow IT” (a term I learned at the conference), and build a separate Python machine learning pipeline to work around various limitations. The lesson is that you should solve problems; don’t be the problem, or in the words of circa 2004 Google: “Focus on the user and all else will follow.”
                                                                                                          4. 2017: My first remote data science job, where I played around with many job functions across the data stack and went down various data rabbit holes. The lesson is to go deep; trust but verify, which goes with a 1999 quote by Eric S. Raymond: “Given enough eyeballs, all bugs are shallow.”
                                                                                                          5. 2022: My first committed climate and biodiversity moves (still a work in progress). The lesson is that tech & titles are tools; focus on what matters, but recall Rabbi Tarfon’s quote from almost two thousand years ago: “You are not obliged to complete the work, but neither are you free to desist from it.”

                                                                                                          The main takeaway from the talk is that data problems have human roots – and human solutions. This is because:

                                                                                                          • Humans get excited by shiny tech… and produce transformative tech.
                                                                                                          • Humans optimise prematurely… and when it makes sense.
                                                                                                          • Humans can act as unreasonable blockers… and as the users we serve.
                                                                                                          • Humans generate messy data… and clean it up.
                                                                                                          • Humans get distracted by tools… and use them for beneficial ends.

1. This is based on Seek searches for jobs advertised in July 2023. Given the limitations of Seek search, it’s not an accurate representation of the demand for each role, as the results included all ads that mentioned the terms. One could also argue that data engineers tend to change jobs more than data scientists, fuelling demand. Despite this, I think the results support the general message around the value of data engineering, especially as others have noted the need for 4-5 data engineers per data scientist in organisations with complex data engineering requirements. ↩︎

                                                                                                          2. Emerson referred to man rather than person in the original quote, but I took the liberty to make it gender-neutral and retain the original message. ↩︎


                                                                                                            Supporting volunteer monitoring of marine biodiversity with modern web and data tools

                                                                                                            Screenshot of Reef Species of the World

                                                                                                            I’ve been volunteering with the Reef Life Survey (RLS) citizen science project since 2015. RLS volunteers follow the same underwater visual census methodology that has been in use for decades, thereby producing data series that help inform the management of marine ecosystems. In simpler terms, we count fish (and some invertebrates), and this helps various organisations know what’s happening underwater. Among other places, RLS data has been used in scientific publications in Nature and elsewhere, and to inform the management of Australian marine parks.

Over the years, I created a few online tools to help volunteers with survey work. These included web apps to visualise survey results and study species, as well as to infer species from underwater photos. More recently, I agreed to help with the general maintenance of the non-WordPress parts of the RLS website and backend (somewhat reluctantly, but I suppose that’s what happens when you do things out of love).

Taking greater responsibility for the tech side of RLS, along with an alignment of the research grant stars, led to an opportunity to revamp the Reef Species of the World (RSoW) section – a collection of over 5,000 species with in-situ photos, descriptions, and empirical distributions derived from RLS surveys. My focus in this project was on product management, data pipelines, and backend work. I was joined by my brother, Uri Seroussi, who was in charge of front-end development (which became much more substantial than in the original RSoW).

                                                                                                            The original RSoW was a traditional PHP application that relied on a MySQL database to serve requests, with most of the HTML constructed on the server. By contrast, we re-architected the new RSoW as a progressive web app using Next.js, which has the following advantages and new features:

                                                                                                            • Fully static site: served faster with reduced server load.
                                                                                                            • Faster search and navigation: happens on the front-end without round-trips to the server.
                                                                                                            • Installable app with offline availability: RSoW can now be installed as a mobile or desktop app, and run without an internet connection.
                                                                                                            • Client-side image classification: offline availability includes image classification in the browser, which is useful when surveying in remote areas.
                                                                                                            • Replacement of previous tools and pipelines: providing a more consistent user experience and improved data reliability.

                                                                                                            The rest of this post provides details on the architecture and implementation of the new RSoW and its underlying data and machine learning pipelines. But the best way of getting a feel for the data and the tools is to have a play yourself.

                                                                                                            The new RSoW architecture diagram reflects the compromises between rebuilding and retaining legacy systems


                                                                                                            The RSoW web app

                                                                                                            We didn’t start with a blank slate: RSoW was already a public website, with many individual species pages ranking well on web searches (the main source of traffic). As such, a guiding principle was to retain as much of the original functionality as possible, and then build new features on top of it.

When approaching a legacy codebase, there’s always the question of whether rebuilding parts or all of it is a worthwhile endeavour. As Jason Cohen notes, a more apt name for “legacy code” is “revenue code”, i.e., the code that embodies all the original and changed requirements, and has withstood the test of time. Even though RLS’s code isn’t meant to generate revenue, the same caution applies: it’s always easy to mess things up when re-implementing existing functionality.

                                                                                                            The main reasons we decided on a rewrite of the front-end were:

                                                                                                            • User experience: Speed things up, as some species searches were pretty slow due to server round-trips and inefficient database queries.
                                                                                                            • Extensibility: Make it easier to add new features.
                                                                                                            • Offline availability: This is impossible with a traditional PHP back-end, but feasible if all the data and code gets shipped to the client.

                                                                                                            We chose Next.js as the front-end framework since it’s well-established and supports static exports. Parts of the RLS website run on WordPress, so it’s easy to add statically-generated pages and serve them efficiently via Cloudflare (I wasn’t keen on complicating the stack by adding a Node backend). With static exports, we regenerate all the species pages whenever the data changes, which means that end-user page requests don’t need to touch the database. In addition, the main search page downloads three JSONs with all the data it needs to perform any species search (see sites.json, species.json, and surveys.json in the rls-data repo). Minified and compressed, these JSONs add up to less than 2MB of data, which isn’t tiny, but it is a small price to pay to avoid hitting the database. The JSONs also cache well on Cloudflare, like the rest of the web app’s files.
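
As a rough illustration of how that size budget can be kept in check, here’s a minimal Python sketch of a guard that a pipeline or CI step could run over the generated files. The file names match the ones above, but the budget constant and the check itself are my own illustration rather than something from the rls-data repo:

import gzip
from pathlib import Path

# The three payloads the search page downloads (generated elsewhere by the pipeline).
DATA_FILES = ["sites.json", "species.json", "surveys.json"]
# Illustrative budget: keep the combined compressed payload under ~2MB.
BUDGET_BYTES = 2 * 1024 * 1024

def compressed_size(path: Path) -> int:
    # Approximate the transfer size by gzipping the minified JSON.
    return len(gzip.compress(path.read_bytes()))

total = sum(compressed_size(Path(name)) for name in DATA_FILES)
print(f"Combined compressed size: {total / 1024:.0f} KiB")
if total > BUDGET_BYTES:
    raise SystemExit("Static data payload exceeds the download budget")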

                                                                                                            From a user perspective, replicating the original functionality was the less exciting part of the project. Faster and less buggy code is obviously better, but once feature parity was achieved, we turned our attention to some new features:

• Supporting offline availability and installation by turning RSoW into a progressive web app: On its face, this was supposed to be simple given the next-pwa package, but it turned out to be a bit tricky because the original package was abandoned and because of the multiple layers of caching involved. It’s well-known that cache invalidation is one of the two hard problems in computer science (along with naming things and off-by-one errors), and progressive web apps offer a lovely variety of caches to deal with – everything needs to be cached on the client for offline availability. We got there after some tinkering and dealing with head-scratching bugs, some of which were caused by other caching layers in addition to the client-side caches (including Cloudflare and some misconfiguration of an early version of the app).
                                                                                                            • Knowledge test: A separate grant came along and Uri had the opportunity to extend RSoW by adding a section that helps test new volunteers ahead of them joining RLS.
                                                                                                            • Species frequency exploration: Bringing in the full functionality from the first tool I built for RLS back in 2017.
                                                                                                            • Client-side image classification: Deprecating the Streamlit app I built a couple of years ago.

                                                                                                            Data and machine learning pipelines

On the back-end, there was an opportunity to simplify things by retiring the original PHP code that processed survey data in favour of the pipelines I implemented in the rls-data repo. Ultimately, survey data comes from the Australian Ocean Data Network (AODN), which holds many more datasets in addition to RLS. Originally, the PHP code that processed survey data into the MySQL database evolved separately from rls-data, which I implemented to generate JSONs for the tools I built. As rls-data is an open source project and the raw survey data is relatively small (<1GB), it made sense to process it with a daily GitHub Actions (GHA) workflow that runs for free. The resultant JSONs are committed to the repo, which means that any unexpected changes are easily tracked (I keep an eye on the commits). It was simple to expand the existing rls-data pipelines to generate all the JSONs needed to serve RSoW, and then say goodbye to the PHP code that implemented similar functionality.
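
To make the shape of such a pipeline step concrete, here’s a minimal Python sketch. The endpoint URL, field names, and output path are hypothetical, and the real rls-data code is more involved:

import json
from collections import Counter

import requests

# Hypothetical endpoint: the real pipeline pulls RLS survey data from AODN services.
RAW_SURVEYS_URL = "https://example.org/aodn/rls-surveys.json"

def build_species_counts(raw_surveys: list[dict]) -> dict[str, int]:
    # Count survey observations per species name.
    counts: Counter[str] = Counter()
    for survey in raw_surveys:
        for observation in survey.get("observations", []):
            counts[observation["species_name"]] += observation.get("count", 1)
    return dict(counts)

def main() -> None:
    response = requests.get(RAW_SURVEYS_URL, timeout=60)
    response.raise_for_status()
    counts = build_species_counts(response.json())
    # The GHA workflow commits this file back to the repo, so any unexpected
    # change in the data shows up as a reviewable diff.
    with open("species_counts.json", "w") as f:
        json.dump(counts, f, separators=(",", ":"), sort_keys=True)

if __name__ == "__main__":
    main()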

                                                                                                            I’m aware that running data pipelines with GitHub Actions isn’t going to win any awards for sophistication, but it’s a great fit for this project. The key principle is to use the right tool for the job, not the shiniest tool.

                                                                                                            One part of the original RSoW that we barely touched was the management interface, which allows RLS admins to update species data and upload pictures. The gains from replacing the admin part of RSoW would have been negligible, so it still runs the old PHP code on top of MySQL. Unfortunately, this meant I couldn’t retire all the PHP data pipelines, as species data also comes from the Australian Ocean Data Network and is joined with the edits made by RLS admins. This exemplifies the pragmatism that one often needs to apply when faced with legacy revenue systems: If a system works and there’s no real benefit to replacing it, sticking with the old system is the right thing to do (even if it makes your architecture diagram more complicated).

                                                                                                            I have big plans to improve the machine learning model for inferring RLS species from user images, but it’s somehow never a priority. For RSoW, I did make it a priority to support serving the model with a simple API, but then I decided it’d be worth the effort to export it to ONNX for client-side image classification. This was partly driven by curiosity about ONNX, but it also had two key benefits: (1) support for offline classification; and (2) simplified & cheaper serving architecture, as ONNX models can be served from S3 and don’t require RLS to pay for server-side compute.
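
For readers who haven’t exported a model to ONNX before, the mechanics look roughly like the following Python sketch. It assumes a PyTorch-based image classifier with 224x224 inputs and uses a stock torchvision model as a stand-in, since the actual ichthywhat model and export code differ:

import torch
import torchvision

# Stand-in for the trained species classifier (the real model is different).
model = torchvision.models.resnet18(num_classes=10)
model.eval()

# ONNX export traces the model with a dummy input of the expected shape.
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    model,
    dummy_input,
    "species_classifier.onnx",
    input_names=["image"],
    output_names=["logits"],
    dynamic_axes={"image": {0: "batch"}},  # allow variable batch sizes
)
# The resulting .onnx file can be served from S3 and loaded in the browser
# with onnxruntime-web, so no server-side compute is needed for inference.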

                                                                                                            As to the machine learning pipelines, they all need to be manually triggered, which is fine since the image data changes slowly. These pipelines are implemented in notebooks and the command-line interface of the ichthywhat repo. I have a bit of a dream of this being an early precursor to complete automation of RLS data collection, with the historical RLS data series continued by divers who would mostly serve as video takers and fish scarers (using cameras without human divers would lead to different biases in the data). However, this is a big project that is probably best left to my next PhD, i.e., it may never happen.

                                                                                                            In the meantime, I hope to continue diving with RLS, and aim to make pragmatic decisions to keep RSoW running and supporting the community.


                                                                                                              New decade, new tagline: Data & AI for Impact

                                                                                                              Logo of Yanir Seroussi's consulting services, depicting a wave and an up-and-to-the-right graph.

                                                                                                              Exactly a decade ago, on 19th January 2014, I published my first post on this website (Kaggle beginner tips). In most of the following years, my tagline was Data Science and Beyond. While the beyond bit gave me an excuse to write about various topics, most posts were indeed around data science – an area that also became broader (arguably to the point of uselessness).

                                                                                                              While I’ve never abandoned my software engineering roots, the broadening of data science means that many data scientists can no longer be assumed to possess solid engineering skills. Therefore, I changed the tagline last year to Engineering Data Science & More. However, this didn’t feel quite right – some people now have an adverse reaction to any mention of data science, after negative experiences of failed projects.

                                                                                                              Recently, I switched the tagline to be both broader and narrower: Data & AI for Nature. However, upon reflection and given some feedback, I realised that the Nature bit may be off-putting to some people who do impactful work in the space but have different motivations. Therefore, I decided to go with Data & AI for Impact (for now…).

                                                                                                              More importantly, I’m planning to revitalise my approach to publishing and audience engagement:

                                                                                                              • Post more frequently – aiming for weekly from February onwards.
                                                                                                              • Use the mailing list to email full posts, and as a two-way avenue for comments and conversations (as opposed to public comments, which are now closed).
                                                                                                              • Still publish both technical and high-level posts on Data & AI.
                                                                                                              • Produce content that’s specifically useful for startups and scaleups that are early on their Data & AI journey.
                                                                                                              • Showcase positive-impact applications of Data & AI tech – especially by startups in the climate and nature-positive space.

                                                                                                              With more frequent posts, what I publish should be quicker to produce and consume. This means I may lean more heavily on showcasing other people’s work – possibly through interviews. Other than that, here are some rough post ideas for the immediate future:

                                                                                                              • Series on a minimum viable data stack
                                                                                                              • Best practices and opinions on a startup’s first data hire
                                                                                                              • Answering questions people ask on the future of data science
                                                                                                              • My experience as a Data Tech Lead with Work on Climate
                                                                                                              • Use cases for ChatGPT and other LLMs
                                                                                                              • Catching up on different aspects of LLMs / AI tech
                                                                                                              • Opportunities for Data & AI professionals in the energy transition

                                                                                                              Historically, for each post I’ve published, about 5-10 ideas went unpublished. I hope that by aiming for shorter and lower-friction publishing, more posts will see the light of day.

                                                                                                              My long-term aims are to learn by publishing, apply my Data & AI skills towards more positive impact, and help others in the space. Rather than sinking into doom and gloom, I’d like to focus on positive applications of Data & AI tech that make our world better (in the spirit of publications like Volts).

                                                                                                              Call to action:

                                                                                                              • If this all sounds uninteresting to you, you’re welcome to unsubscribe – no hard feelings.
                                                                                                              • If you know people I should talk to and feature in future posts, I’d appreciate an intro.
                                                                                                              • If you have any suggestions, please send them by replying to any of my emails, or contact me through other means – I’d love to hear from you.

                                                                                                                Substance over titles: Your first data hire may be a data scientist

                                                                                                                ChatGPT's version of a versatile data person

                                                                                                                If you search the web for ‘first startup data hire’, you may come across some strongly-worded advice claiming that this person must not be a data scientist, or that they must be a data engineer / analyst. In my view, being so prescriptive about titles risks missing out on great candidates. The reality is that titles in the data world are messy and fluid – it’s best to start by getting clear on what the data person is going to do, and proceed from there.

                                                                                                                Beyond titles, this post summarises my perspective on questions that arise around the first data hire, and presents some pointers to help you hire successfully.

                                                                                                                Assumptions and Timing

                                                                                                                Key assumption: If your startup needs a data hire, you’re probably at a stage where you’re starting to be limited by visibility into your data. You are generating revenue, but your data is all over the place (spreadsheets, dashboards of various tools, unstructured logs, etc.). There are important questions about your business that you can’t answer because you’re either not collecting the data, or because it’s too hard to gather it into a coherent story. No one on your technical team has informed opinions on tools to pick out of the dozens of options for warehousing / ingestion / transformation / analytics / orchestration.

If this is the case, then data and machine learning aren’t core to your product. You need someone to set up your data pipelines and analytics. These will primarily serve internal-facing use cases, like driving marketing decisions. However, the first question to ask is: Do you really need to hire someone for a permanent full-time position?

                                                                                                                Personally, I’m biased in favour of not hiring (yet): You can get started on your data journey with a contractor or a part-time (aka fractional) person. This should give you a better understanding of your data needs, and get you to a better place in terms of data infrastructure and dashboards. This person may also want to become a full-timer down the track, or help you with hiring other data people.

                                                                                                                Remember that – by definition – premature hiring unnecessarily shortens your runway. Hiring and onboarding a full-timer would usually take longer than bringing on an experienced contractor. And if you need to let them go, it may adversely affect team morale. This doesn’t apply to contractors, who are expected to leave when their contract is over.

That said, there is value in retaining a long-term owner of your data and analytics. Every business, dataset, and data stack has its quirks, so the familiarity that comes with long-term ownership is a definite point in favour of hiring for a permanent role. Still, you should be open to a part-time arrangement if you don’t have full-time needs yet.

                                                                                                                Titles and Skills

                                                                                                                If you do decide to hire for a permanent role, there are three other articles worth reading:

                                                                                                                • Andrew Bartholomew covers assumptions (similar to the above), responsibilities, skills, management, and the thorny question of titles. He says that the person’s title is “the least important question […] you’re hiring a Senior Analytics Engineer or a Senior Data Analyst, but in practice this person might prefer a Senior Data Scientist title, or Analytics Lead, or something else.” I agree with this and pretty much everything else in Andrew’s article, though it is important to align on the expectations implied by titles (more on this below).
                                                                                                                • Colleen Tartow advocates for hiring a senior data engineer. While Colleen’s advice is sensible, I’d be careful with following it blindly due to the messiness of titles and experiences. For example, you probably don’t want a data engineer who’s only worked with big companies, as there’s a risk that they’d over-engineer your data stack (initially, you’re aiming for a minimum viable data stack). Also, if they’ve only ever worn the data engineer hat, they may find it hard to uncover and communicate the insights you’re after.
                                                                                                                • Sebastian Hewing goes deep into the question of timing the hire as a function of product-market fit. I agree with most points, but disagree with this phrasing: “The last person you want is a Data Scientist. […] What you need, in my opinion, is a Head of Data & Analytics.” I believe that someone who has full-stack data science experience may make a great Head of Data & Analytics – it all comes down to skills and experiences rather than past titles, which can only ever tell a part of the story. That said, Sebastian does list a bunch of other data titles that the startup shouldn’t hire, so we probably agree on the essence of the role and the person. I especially like Sebastian’s emphasis on seeking a hands-on data person who can turn data into insights AND insights into action.

                                                                                                                As you can see, the three articles disagree on the question of titles, with Andrew’s being the most pragmatic. If you want to get even more confused, ask ChatGPT to summarise the collective wisdom of the internet: When I asked it “what should a startup’s first data hire be?”, ChatGPT suggested seven(!) roles with an “it depends” reason for each one. Personally, I’d go for a senior data generalist with an engineering background, who is also attentive to the business side. It’s highly doubtful you’d find someone who goes by this title, so you’ll need to figure out how to find and attract them. This is hard if you’re not familiar with the data space. It’s worth seeking help from data folks in your network, or starting with a contractor to bootstrap the process.

                                                                                                                Summary

                                                                                                                Putting it all together, once you’ve read the above articles, my opinion is that you should:

                                                                                                                1. Get clear on the business needs that’d be addressed by a data person.
                                                                                                                2. Err on the side of not hiring prematurely – consider a contractor or rely on your current employees.
                                                                                                                3. When you’re ready to hire, sketch out a high-level plan for the person’s first 90-180-360 days.
                                                                                                                4. Run the plan and job description by some data people you trust.
                                                                                                                  • Possible title for the job ad: Data & Analytics Lead or Head of Data & Analytics (but you want a hands-on person, so make it clear that this is an individual contributor role initially).
                                                                                                                  • Make the plan a part of the job ad – it helps with aligning expectations.
                                                                                                                5. Ideally, get data people you trust to help you with the hiring process.
                                                                                                                6. Screen out specialists early, regardless of past titles and pedigree.
                                                                                                                7. Make expectations as clear as possible during the hiring process – especially if the person hasn’t worked with a startup before.
8. Hire someone who’s a great fit and will help take your business to the next level.

                                                                                                                Any thoughts or suggestions? Please contact me – I will make edits to this post based on feedback.


diff --git a/2024/02/12/nudging-chatgpt-to-invent-books-you-have-no-time-to-read/index.html b/2024/02/12/nudging-chatgpt-to-invent-books-you-have-no-time-to-read/index.html

                                                                                                                  Nudging ChatGPT to invent books you have no time to read

                                                                                                                  Cars on the road with a mountain in the background

                                                                                                                  Road trips are more fun with ChatGPT Plus

                                                                                                                  Large language models like ChatGPT have a wide range of capabilities. One way to discover use cases is to spend time chatting with the bots. Sometimes, by asking them to do things that may seem unnatural, you discover new useful capabilities. Here is a quick story demonstrating this sort of exploration.

                                                                                                                  Inventing a book summary

                                                                                                                  On a longish drive from Visalia to Los Angeles, I read Chris Riccomini’s thought-provoking post on database disassembly and its implications. At the bottom of the post, there’s a short promotional blurb that says:

                                                                                                                  Support this newsletter by purchasing The Missing README: A Guide for the New Software Engineer for yourself or gifting it to someone.

                                                                                                                  While my new software engineer days are in the distant past, fresh takes on the basics can be worth reviewing. In addition, my brother Uri Seroussi – who is a new software engineer – was sitting right next to me. So I followed the link, read the overview, skimmed the reviews, and recommended the book to Uri.

                                                                                                                  But… it was a long drive. I figured it’d be fun to get ChatGPT Plus to summarise the book for us – without it having access to the actual book. In response to the initial prompt, it ran a web search, which resulted in a high-level summary that wasn’t very useful (see the full transcript for details). I then asked it to go deeper on the first lesson, to which it replied:

                                                                                                                  It seems that the specific details of the first practical lesson from “The Missing README: A Guide for the New Software Engineer” are not readily available in the search results. The book covers a wide range of essential skills, techniques, and heuristics necessary for effective software engineering, focusing on the realities of the workplace that are not typically taught in school or bootcamps. For a detailed dive into the first practical lesson, it would be best to consult the book directly, as it appears to be structured in a way that sequentially builds on the knowledge and skills crucial for a new software engineer’s success in the industry.

                                                                                                                  Useless again…

                                                                                                                  I asked it to tell me what it knows without searching the web, to which it replied with a summary that was only loosely connected to the stated contents of the book.

                                                                                                                  Not good enough!

                                                                                                                  I threw the full summary from Amazon at it, which includes these points:

                                                                                                                  You’ll learn:

                                                                                                                  • How to use the legacy code change algorithm, and leave code cleaner than you found it
                                                                                                                  • How to write operable code with logging, metrics, configuration, and defensive programming
                                                                                                                  • How to write deterministic tests, submit code reviews, and give feedback on other people’s code
                                                                                                                  • The technical design process, including experiments, problem definition, documentation, and collaboration
                                                                                                                  • What to do when you are on-call, and how to navigate production incidents
                                                                                                                  • Architectural techniques that make code change easier
                                                                                                                  • Agile development practices like sprint planning, stand-ups, and retrospectives

                                                                                                                  Then I asked it to go deep on the first point, and it delivered a proper summary that seemed legit!

As Uri was driving, I got the ChatGPT app to read out each summary. This continued with subsequent points: I’d prompt ChatGPT with “OK, next point”, it would invent material that sounded about right, Uri and I would discuss the output, and so on.

Should Uri still read the book? Probably. Reading well-written books helps lessons sink in better than listening to made-up summaries. That said, the summaries did provide a good overview of the book’s topics, and they were educational.

                                                                                                                  While I probably wouldn’t have had this conversation if it wasn’t for the long drive, I still find this use case interesting. It’s not the first time I got ChatGPT to elaborate on specific allusions – and obtained informative results. The general pattern is giving it some text and asking “what might they mean by X?”
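
For readers who prefer an API over the chat interface, the same pattern is easy to reproduce with a few lines of Python. This is a rough sketch using the OpenAI Python SDK – the excerpt, the question, and the model name are placeholders, and it assumes an OPENAI_API_KEY environment variable is set.

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

excerpt = "You'll learn how to use the legacy code change algorithm, and leave code cleaner than you found it."
question = "What might they mean by 'the legacy code change algorithm'?"

response = client.chat.completions.create(
    model="gpt-4o",  # placeholder: use whichever model you have access to
    messages=[{"role": "user", "content": f"Here is an excerpt:\n\n{excerpt}\n\n{question}"}],
)
print(response.choices[0].message.content)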

                                                                                                                  The cheatsheet game

                                                                                                                  There was still time on the drive, so I thought it would be fun to get ChatGPT to turn the summaries into a cheatsheet for quick reference. While I could ask for a cheatsheet-like summary and then format it myself, asking for things that aren’t strictly necessary is a good way to learn about ChatGPT’s capabilities.

                                                                                                                  I made this request:

                                                                                                                  Can you create a pretty cheatsheet summarising the main lessons? Run code if needed to do stuff like putting text in multiple columns so it fits on a single page.

                                                                                                                  Unfortunately, the result of the first attempt was horrible – too much text, too little content, and illegible:


diff --git a/2024/02/19/building-your-startups-minimum-viable-data-stack/index.html b/2024/02/19/building-your-startups-minimum-viable-data-stack/index.html

                                                                                                                    Building your startup's minimum viable data stack

                                                                                                                    Henrik Kniberg's drawing of a minimum viable product, showing the wrong way of doing it (non-functional iterations) and the right way of doing it (functional iterations).

                                                                                                                    Minimum viability is about rapidly delivering incremental value. Source: Henrik Kniberg.

                                                                                                                    In my post on your startup’s first data hire, I noted in passing that the hire’s initial role would be setting up the company’s minimum viable data stack. But what exactly is that?

                                                                                                                    Conceptually, a minimum viable data stack follows the same principles as a minimum viable product. Breaking it up, it is:

                                                                                                                    • Minimal: You don’t want to over-build beyond your resources. Instead, set up the simplest stack that satisfies the startup’s near-term needs. Then iterate based on feedback and new requirements.
                                                                                                                    • Viable: While this is sometimes forgotten in favour of an over-emphasis on minimality, your data stack has to be viable. That is, it has to satisfy stakeholder needs through every iteration, as shown by the classic drawing above.
                                                                                                                    • Data Stack: This is the product that’s getting shipped and built iteratively, consisting of the components listed below. As in my previous post, I’m assuming a startup where the data stack initially serves internal stakeholders. The main difference between a consumer-facing product and an internal-facing data stack is that the latter has users you can easily talk to and fewer unknowns. This makes the task of satisfying user needs easier.

                                                                                                                    This post is the first in a series that will serve as a quick reference for those embarking on the journey of setting up a minimum viable data stack. Future posts will go deeper into each of the key components. However, I can only cover limited ground – readers are encouraged to consult books such as Fundamentals of Data Engineering for a more thorough treatment of the topics covered in the series.

                                                                                                                    Components of a minimum viable data stack

As we’re talking about a minimum viable data stack, this list of components isn’t exhaustive. The components I consider to be the bare minimum are listed below, followed by a toy end-to-end sketch:

                                                                                                                    1. Storage: Where the data lives.
                                                                                                                    2. Ingestion: How the data makes it into storage.
                                                                                                                    3. Transformation: The layer that joins and changes raw data into more useful form.
                                                                                                                    4. Analytics: Presentation layer, which can be consumed by non-technical stakeholders.
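
To make the four layers above concrete, here is a minimal, illustrative sketch in Python. It is not a recommendation of specific tools: the standard-library sqlite3 module stands in for a proper warehouse, and orders.csv is a hypothetical export from a source system.

import csv
import sqlite3

# Storage: a local SQLite file stands in for the warehouse.
con = sqlite3.connect("warehouse.db")

# Ingestion: load a raw CSV export (hypothetical orders.csv) into a raw table.
con.execute(
    "CREATE TABLE IF NOT EXISTS raw_orders (order_id TEXT, customer_id TEXT, amount REAL, created_at TEXT)"
)
with open("orders.csv") as f:
    rows = [(r["order_id"], r["customer_id"], float(r["amount"]), r["created_at"]) for r in csv.DictReader(f)]
con.executemany("INSERT INTO raw_orders VALUES (?, ?, ?, ?)", rows)

# Transformation: derive a tidier model from the raw data (this is where a transformation framework would fit).
con.execute(
    """
    CREATE TABLE IF NOT EXISTS daily_revenue AS
    SELECT date(created_at) AS day, SUM(amount) AS revenue
    FROM raw_orders
    GROUP BY date(created_at)
    """
)

# Analytics: the presentation layer reads from the transformed table (here, just a printout).
for day, revenue in con.execute("SELECT day, revenue FROM daily_revenue ORDER BY day"):
    print(day, revenue)

con.commit()
con.close()

In a real stack, each layer would typically be a managed tool (a cloud warehouse, an ingestion service, a transformation framework, a BI tool), but the shape of the flow stays the same.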

                                                                                                                    Components that are perhaps conspicuous in their absence are: machine learning / AI (higher on data’s hierarchy of needs – I assume this will come later), data serving / querying (implicitly included in other layers), and orchestration (also implicit – I assume dependencies are initially simple enough so that any orchestration approach would work).

                                                                                                                    Considerations beyond stack components

                                                                                                                    There are at least two critical items to think of early on. I consider a data stack to be nonviable if no thought is given to:

                                                                                                                    • Security. Ignore data security at your own peril. Examples abound of serious data breaches, which are often the result of trivial mistakes or poor design decisions. Starting off by implementing security checklists and following best practices like the principle of least privilege is way easier than trying to enforce them later on. A good technical read on the topic is Building Secure and Reliable Systems, though you’d need to cherry-pick principles to match your needs (it’s a book by Google, and your startup is not Google).
                                                                                                                    • Privacy. You should be aware of compliance requirements for the data you store, especially when it comes to private and sensitive data. As with security, it’s much easier to start by complying with privacy requirements than retrofitting a stack once stakeholders have come to depend on data that shouldn’t be collected or retained. In the spirit of minimality, it’s best to err on the side of not storing sensitive data when it isn’t required. This also helps minimise the potential effect of breaches.

                                                                                                                    Other key considerations include:

                                                                                                                    • Data generation. I assume that the business is generating data from multiple sources, which need to be ingested into a single storage system to ultimately drive decisions. If there’s only one source system, it may be too early for a data hire, or for a more sophisticated data stack. When considering data sources, it’s important to keep in mind the three Vs of data: Volume, Velocity, and Variety.
• Quality assurance. It’s important to have automated checks that run before changes hit production systems (e.g., changes to transformation or ingestion code), so that you avoid breaking pipelines and maintain high data quality – see the sketch after this list. Again, it’s easier to start with high standards for quality than to enforce them retrospectively. Low quality is likely to result in low trust in any data or insights, making the stack nonviable.
                                                                                                                    • Observability/monitoring/incident response. Inevitably, things will break in production. With good observability and incident response practices, the data team will proactively address such issues – ideally before any stakeholders notice. This goes hand in hand with setting high quality standards – the sort of culture that is easier to set early on than change down the track.
                                                                                                                    • Data management. There are many items that fall under data management (e.g., see the list on Wikipedia). Many of them are addressed implicitly or not addressed at the early stages of a data stack. For example, data discovery isn’t a major issue when the data team consists of a single person. Still, it’s worth being aware of management considerations that arise as the data stack matures.
                                                                                                                    • Timely automation. As a broad generalisation, engineers like automation. Erring on the side of automation is often a good idea, as it lets computers do what they do best and frees up human time to deal with things that have to be done manually. Done right, automation increases overall quality. However, creating automations takes time, e.g., if a monthly report takes five minutes to generate, and it’d take a day of coding to automate, it’s probably enough to write up the procedure to generate it. You have bigger fish to fry.
                                                                                                                    • The need to be boring. Another trap that you can easily fall into is trying shiny new tools. Despite what vendors might say, it’s rare for new tools to be truly transformative. You should strive to be boring in your choice of components. Use proven tools and services for the minimum viable data stack, keeping the shiny experimental stuff to your hobby side projects (I learnt this the hard way).
                                                                                                                    • Speed of iteration. Some people believe that high quality always comes at the cost of iteration speed. I disagree, for the same reasons Martin Fowler pointed out in an essay on how increasing the internal quality of software increases iteration speed within weeks of the start of a project. In short, if you don’t invest in quality, you’re committing yourself to spending much of your time firefighting as the complexity of the data stack increases. However, overthinking reversible decisions or spending too much time on non-critical issues is also a real possibility. Above all, you must remember that the goal of the data stack is to support business decisions, which may require some compromises to deliver value as rapidly as possible.
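
To illustrate the quality assurance point above, here is a minimal sketch of assertion-style checks that could run automatically before transformation changes reach production. It assumes the hypothetical warehouse.db and daily_revenue table from the earlier sketch; dedicated tools (such as dbt tests or Great Expectations) formalise the same idea with less boilerplate.

import sqlite3

def check_daily_revenue(con: sqlite3.Connection) -> None:
    """Fail loudly if basic expectations about the transformed table are violated."""
    # No negative revenue values.
    (bad_rows,) = con.execute("SELECT COUNT(*) FROM daily_revenue WHERE revenue < 0").fetchone()
    assert bad_rows == 0, f"{bad_rows} rows with negative revenue"

    # One row per day (no duplicate days).
    (total,) = con.execute("SELECT COUNT(*) FROM daily_revenue").fetchone()
    (distinct,) = con.execute("SELECT COUNT(DISTINCT day) FROM daily_revenue").fetchone()
    assert total == distinct, "duplicate days in daily_revenue"

if __name__ == "__main__":
    check_daily_revenue(sqlite3.connect("warehouse.db"))
    print("All checks passed")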

                                                                                                                    Next: Choosing components

As noted, I’m aiming for this to be the first in a series of posts on setting up a minimum viable data stack. Each future post will be dedicated to one of the key components, going deeper into currently-available tools for storage, ingestion, transformation, and analytics. The focus will be on tools that are sensible for startups to use.

                                                                                                                    Stay tuned for future posts! In the meantime, feedback is always welcome.

                                                                                                                    Update 2024-08-19: I eventually gave up on the series – here’s why.


diff --git a/2024/02/26/avoiding-ai-complexity-first-write-no-code/index.html b/2024/02/26/avoiding-ai-complexity-first-write-no-code/index.html

                                                                                                                      Avoiding AI complexity: First, write no code

Illustration showing ancient stone tablets with the inscription 'primum non codere' (inspired by primum non nocere: first, do no harm)

                                                                                                                      Custom software is notoriously hard to build and maintain. Machine learning (ML) adds a layer of complexity on top of traditional software, with many novel ways to accrue technical debt. Therefore, my general advice to young startups considering custom ML development borrows from the first and second rules of optimisation:

                                                                                                                      1. Don’t.
                                                                                                                      2. Don’t Yet (for experts only).

                                                                                                                      For startups where Data & AI/ML isn’t a core part of the team’s capabilities, there are usually higher priorities than building custom ML models. However, deriving commercial value from advancements in AI is still possible – even without writing code at all.

                                                                                                                      I recently witnessed two stories that exemplify this point.

                                                                                                                      Exhibit A. Consider this story: A technical lead reached out to me for advice on a computer vision project that wasn’t progressing as expected. For the sake of illustration, let’s say it was a custom model to classify food pictures as hotdog / not hotdog.

                                                                                                                      The company had contracted an ML engineer to drive the project. Despite having no background in ML, the lead felt like the contractor was going down the wrong track, and asked me for my thoughts.

                                                                                                                      It turned out that the contractor believed that the best path forward was trying different model architectures. My advice was to ensure the contractor did the data work first. Often, there are bigger gains to be had from data augmentations than from model tweaks (e.g., applying distortions to the hotdog photos). As the data wasn’t sensitive, I also suggested trying third-party computer vision APIs or GPT-4 Vision to get an idea of what’s possible with the dataset.
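
To give a feel for what the data work can look like in this setting, here is a small sketch of label-preserving image augmentations with torchvision. The file names are placeholders, and a real project would apply the transforms inside the training pipeline rather than saving files.

from PIL import Image
from torchvision import transforms

# Each training epoch sees slightly different versions of the same hotdog photos.
augment = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(degrees=15),
])

image = Image.open("hotdog_001.jpg")  # placeholder path
augmented = augment(image)  # returns a transformed PIL image
augmented.save("hotdog_001_augmented.jpg")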

                                                                                                                      Exhibit B. Recently, I caught up with an entrepreneur who comes from a marketing background. Just prior to our meeting, they had successfully pitched an app they had built to a large client.

                                                                                                                      Remarkably, the app included a hotdog detector similar to the one the ML engineer was struggling to ship. The entrepreneur used the FlutterFlow no-code platform along with Google’s computer vision APIs to rapidly create an app with commercial value – without deep knowledge of ML.

                                                                                                                      Expanding the rules

No two companies are exactly alike. Sometimes, custom code or ML models are necessary. However, given the pace of innovation in no-code and low-code software & AI, starting with the least code possible is often a wise choice. Those who build software as their craft often have a blind spot when it comes to no-code possibilities – coders gonna code. Still, it’s important to rein in the coding instinct: the difference in total cost between a custom build and using third-party APIs or no-code solutions can easily run to six or seven figures.

                                                                                                                      When contemplating custom ML and AI development, consider the following options:

                                                                                                                      1. Don’t build it.
                                                                                                                      2. Wait until it becomes easier.
                                                                                                                      3. Use a no-code solution.
                                                                                                                      4. Get a software engineer to implement it with third-party APIs.
                                                                                                                      5. Get a software engineer to implement it with third-party models that you self-host (with minimal customisation).
                                                                                                                      6. Get the experts to build it: ML engineers, data scientists, and data engineers.

You should be pretty certain that the cost of Option 6 is justified before committing to it. One way to get there is by starting with one of Options 3-5, thereby proving (or disproving) that there’s enough commercial value to justify the most expensive option. And when the time comes for Option 6, always do the data work!


\ No newline at end of file
diff --git a/2024/03/04/two-types-of-startup-data-problems/index.html b/2024/03/04/two-types-of-startup-data-problems/index.html
index 5fa7d05f0..34632b93a 100644
--- a/2024/03/04/two-types-of-startup-data-problems/index.html
+++ b/2024/03/04/two-types-of-startup-data-problems/index.html
@@ -1,5 +1,5 @@
 Two types of startup data problems | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                        Two types of startup data problems

                                                                                                                        Decorative cover, based on ChatGPT's interpretation of the post.

Recently, I’ve been thinking about startup data problems to clarify where I can help potential clients. With over a decade in the data / AI / ML world, I’ve seen new hype waves and job titles appear almost every year. While this may seem natural to insiders, outsiders aren’t fully aware of the differences between the types of data professionals and the problems they solve.

                                                                                                                        One way I classify startup data problems is with the question: Do you need MLOps?

                                                                                                                        • If the answer is Yes, then it’s an ML-centric startup. Machine learning is core to the product, so effectively training, deploying, and maintaining ML models (i.e., doing MLOps) is crucial. Such startups should have strong ML and data capabilities in the founding team. Their success depends on it.
                                                                                                                        • If the answer is No, then it’s a non-ML startup. Such startups may occasionally build a one-off model, but they won’t be dealing with MLOps early on. Unless they’re building a data-intensive product,1 non-ML startups should hold off on hiring data people until they start hitting product-market fit and scaling their marketing. They can afford to build their data capabilities incrementally with a minimum viable data stack, and follow well-trodden paths of supporting decisions with data.

Somewhat confusingly, there’s an overlap between the skills required for ML-centric startups and those required of data people in non-ML startups. This is because much of ML is data work. This is reflected by the following ML system diagram, where the Data Collection and Data Verification boxes are much larger than the ML Code box. Additionally, the Analysis Tools and Monitoring boxes also require data skills, as defining metrics is one of the hardest problems of data science.


\ No newline at end of file
diff --git a/2024/03/11/questions-to-consider-when-using-ai-for-pdf-data-extraction/index.html b/2024/03/11/questions-to-consider-when-using-ai-for-pdf-data-extraction/index.html
index 26bfacbfc..23f4eb906 100644
--- a/2024/03/11/questions-to-consider-when-using-ai-for-pdf-data-extraction/index.html
+++ b/2024/03/11/questions-to-consider-when-using-ai-for-pdf-data-extraction/index.html
@@ -1,9 +1,11 @@
 Questions to consider when using AI for PDF data extraction | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                          Questions to consider when using AI for PDF data extraction

                                                                                                                          Decorative image showing a book and flying documents with an AI-themed overlay

                                                                                                                          The jagged frontier of AI means that you can’t always know which tasks are within the capabilities of current models. One such task is the extraction of structured data from PDFs. While this is fully within the capabilities of humans, there are unique challenges in getting off-the-shelf AIs like ChatGPT to do it well. Variants of this task have come up repeatedly in my recent discussions and work, so I put together this summary of my understanding of when it makes sense to try to automate PDF data extraction with AI.

                                                                                                                          This post is structured as a series of questions I’d ask about a proposed project. Actual answers will depend on the specific project.

                                                                                                                          What is your budget?

                                                                                                                          I assume there is some business value in the extracted data. Therefore, it’s possible to estimate the dollar value of automating the manual processes that are used to extract the data. There’s a big difference between an extraction process that takes a junior employee a couple of weeks per year and one that keeps data entry specialists busy year-round. The budget determines the tools that can be used: If it’s low (e.g., in the thousands), it’s probably only worth spending a few days assessing feasibility with off-the-shelf tools like the OpenAI APIs. If it’s higher (e.g., hundreds of thousands), paying AI engineers to build a bespoke system becomes an option.

                                                                                                                          How sensitive is the data?

                                                                                                                          If the data you’re working with isn’t sensitive (e.g., financial statements by public companies), you’re in luck: You can use the best AIs available. A month ago, it was GPT-4. Today, you have a bunch of other proprietary options. Tomorrow, who knows?

                                                                                                                          If the data is sensitive and can’t leave your organisation’s systems, your options are more limited. Depending on other aspects of the problem, it may mean you’re better off waiting for better open models. However, given the rate of progress in the open source AI ecosystem, assessing feasibility is about as simple as with proprietary solutions. It’s just that right now, you won’t be using the most capable models.

                                                                                                                          How complex are the PDFs?

                                                                                                                          There’s a wide variety of documents out there. If the PDFs you’re working with can be converted to text accurately, the AI models have a good chance of being able to extract the data you’re after. If the conversion to text is insufficiently accurate, the AIs stand little chance of outperforming data entry specialists – they just don’t see the PDFs as well as we do.

It’s worth spending some time on this question. For example, if you’re using OpenAI’s APIs and much of the data you’re looking to extract is contained in tables within PDFs, you can first test whether you can reliably retrieve and display specific tables. Sticking with the example of financial statements, there’s a big difference between this 124-page sample from Grant Thornton and this seven-page sample from the Australian Fair Work Commission. Prompting GPT-4 (via ChatGPT Plus) to extract a table from the former PDF produced output that was only loosely connected to the actual content. By contrast, GPT-4 perfectly reproduced tables from the latter PDF.

                                                                                                                          In general, perfect conversion of PDF tables to text appears to be an unsolved problem. For example, in a benchmark from last year, the best tool tested only had about 50% accuracy when applied to tables in scientific papers. However, the field keeps moving. You may get an accuracy boost by treating PDFs as images and using vision models, as recommended by the Unstructured library. Anecdotally, I found Unstructured’s table parsing to be too inaccurate, but when I used GPT-4 Vision on screenshots of the same tables it yielded much better results. It also outperformed OpenAI’s default PDF parser. Your mileage will definitely vary.
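As a rough sketch of the kind of check described above, the snippet below uses the Unstructured library to pull tables out of a PDF so you can eyeball how faithfully they were parsed. The file name is a placeholder, and argument names may vary between library versions, so treat this as a starting point rather than a recipe.

```python
# Sketch: check how well a PDF's tables survive conversion to text/HTML.
# Assumes `pip install "unstructured[pdf]"`; the file name is a placeholder and
# exact arguments may differ between library versions.
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="financial_statements.pdf",  # hypothetical input
    strategy="hi_res",                    # image-based parsing
    infer_table_structure=True,
)

tables = [el for el in elements if el.category == "Table"]
print(f"Found {len(tables)} table(s)")
for table in tables:
    # Compare this against the original PDF before trusting downstream extraction.
    print(table.metadata.text_as_html)
```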

                                                                                                                          Things get even more complicated if some of the data you’re hoping to extract is in graphs and other figures contained in the PDFs. Verifying that the PDFs are not too complex for the AI models is definitely worth doing before jumping into more elaborate data extraction tasks.

                                                                                                                          Can the tokenised PDFs fully fit in the model’s context window?

                                                                                                                          The complexity of the PDFs determines how well the AI models can see them. Their length determines how much of them the models can see at once. There’s nuance depending on whether the PDFs are fully converted to text tokens or to text and images, but the number of tokens that can be fed in (i.e., the context window) is limited with the current generation of AI models. Context windows are rapidly expanding, with Google recently releasing a million-token model (approximately 700,000 words), but there’s still a cost per token that you need to consider when building solutions.

                                                                                                                          If you’re working with PDFs that are larger than your context windows, you’re probably going to use retrieval-augmented generation, i.e., break the documents into chunks and feed specific chunks into the model based on the query. With some tools and APIs, this may be handled for you (e.g., with OpenAI’s Assistants). However, it introduces a source of inaccuracies that may not be acceptable for your use case.
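To make the chunking idea concrete, here’s a minimal retrieval sketch that assumes OpenAI embeddings are an acceptable choice; the chunk size, model name, and naive character-based splitting are simplifications of what a production retrieval-augmented generation pipeline would do.

```python
# Minimal retrieval sketch: split a long document into chunks, embed them, and
# return the chunks most similar to a query. Chunk size and model name are
# illustrative assumptions; production pipelines use smarter splitting and storage.
import numpy as np
from openai import OpenAI

client = OpenAI()

def embed(texts: list[str]) -> np.ndarray:
    response = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return np.array([item.embedding for item in response.data])

def top_chunks(document_text: str, query: str, chunk_chars: int = 2000, k: int = 3) -> list[str]:
    chunks = [document_text[i:i + chunk_chars]
              for i in range(0, len(document_text), chunk_chars)]
    chunk_vectors = embed(chunks)
    query_vector = embed([query])[0]
    # Cosine similarity between the query and each chunk.
    scores = chunk_vectors @ query_vector / (
        np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(query_vector)
    )
    return [chunks[i] for i in np.argsort(scores)[::-1][:k]]
```

The retrieved chunks would then be pasted into the extraction prompt in place of the full document – which is also where retrieval-related inaccuracies creep in.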

                                                                                                                          What is your teaching approach?

                                                                                                                          Assuming you’re satisfied that the AIs can see enough of the PDFs well enough to provide useful answers, it’s time to test different ways of teaching them about the data extraction tasks. The question of which approaches you can test is closely tied to your budget. But even with large budgets, it’s best to start simple and only attempt more complicated approaches if the simpler ones fail. In order of implementation complexity, key teaching approaches are:

• Zero-shot: Describe the task and the expected output, provide a new PDF, and see if you get the expected output (see the sketch after this list).
                                                                                                                          • Few-shot: In addition to describing the task and expected output, also provide examples of past PDFs and their extracted outputs. Given context window limitations, this is only feasible with relatively short PDFs and simple outputs.
                                                                                                                          • Fine-tuning: This goes beyond the sort of prompting that has become accessible to the general population via ChatGPT. The general idea is that you can get better results by teaching the underlying model about your expected inputs and outputs. Even if you don’t have machine learning experts on your team, you may get good results by following resources such as the fine-tuning guide by OpenAI. However, success isn’t guaranteed, so it’s important to manage expectations and budgets accordingly. It may well be the case that you’re better off waiting a few months or years for new AI models, rather than investing in fine-tuning experimentation. New models are likely to make lower-effort zero/few-shot results better.
                                                                                                                          • Custom models: Taking a step beyond fine-tuning, building custom machine learning models may be a good match for your budget and available expertise. However, you definitely won’t be doing it to automate a low-cost-low-frequency data entry process.
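
Here’s a minimal sketch of the zero-shot option referenced in the list above, using the OpenAI chat API on a PDF that has already been converted to text. The model name, target fields, and PDF-to-text helper are assumptions for illustration only.

```python
# Zero-shot sketch: describe the task, pass in the PDF text, and parse JSON output.
# Assumes the PDF converts to text acceptably (here via pypdf); the model name and
# target fields are illustrative placeholders.
import json
from openai import OpenAI
from pypdf import PdfReader

client = OpenAI()

def pdf_to_text(path: str) -> str:
    """Naive text extraction; see the earlier discussion of PDF complexity."""
    return "\n".join(page.extract_text() or "" for page in PdfReader(path).pages)

def extract_fields(pdf_path: str) -> dict:
    prompt = (
        "Extract the following fields from the financial statement below and return "
        "them as JSON with keys 'company_name', 'reporting_period', and 'net_profit'. "
        "Use null for anything you cannot find.\n\n" + pdf_to_text(pdf_path)
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo",  # assumed model choice
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},  # JSON mode; the prompt must mention JSON
    )
    return json.loads(response.choices[0].message.content)
```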

Implicit in the above is the availability of some training & testing data (i.e., input PDFs and expected outputs). That is, no matter what approach you follow, you’d want some confidence that it works beyond a few test samples – so use a large, representative dataset to evaluate your solution.

                                                                                                                          Can the AI model understand the input structures?

This is closely related to the question of PDF complexity, but worth considering separately. I’m anthropomorphising a bit by talking about AI understanding, but just as the models don’t see PDFs the way we do, their level of understanding may also be unintuitive. For example, a recent paper that proposed a fine-tuning approach to improve GPT-3.5’s table understanding made the case that general language models can’t read tables reliably because:

                                                                                                                          Natural language texts are (1) one-directional, (2) read left-to-right, where (3) swapping two tokens will generally change the meaning of a sentence. In contrast, relational tables are (1) two-dimensional in nature with both rows and columns, (2) where reading top-to-bottom in the vertical direction for values in the same column, is crucial in many table-tasks. Furthermore, unlike text, (3) tables are largely “invariant” to row and column permutations, where swapping two rows or columns do not generally change the semantic meaning of the table.

                                                                                                                          This argument is compelling, but given the emergent abilities of large language models and the fact that we no longer know what goes into proprietary models beyond GPT-3.5, I wouldn’t bet on these limitations being an issue for all tabular data. Again, experimenting with your specific use case is key. If you encounter issues, it’s worth probing the models to check if they exhibit any semantic understanding beyond just reproducing the inputs.

                                                                                                                          Can the AI model understand and produce the output structures?

If you’re building custom models, it’s straightforward to get exactly the output structure you want (e.g., a complex JSON). Otherwise, if you’re prompting a language model, you need to ask nicely and hope for the best. That said, there are strategies to get models to produce the output structures you want, such as using OpenAI’s function calling or the Outlines library. However, as with the example of tabular inputs, there is a difference between producing an output that conforms to a specific schema and populating the schema with values that make semantic sense. Breaking down complex outputs into simpler structures and using prompt chaining may be helpful in some cases.
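For example, here’s a sketch of nudging the model towards a fixed schema with OpenAI’s function calling; the schema and model name are invented for illustration, and conforming to the schema still doesn’t guarantee that the values make semantic sense.

```python
# Sketch: constrain the output structure with OpenAI function calling.
# The schema, model name, and field names are illustrative assumptions.
import json
from openai import OpenAI

client = OpenAI()

extraction_tool = {
    "type": "function",
    "function": {
        "name": "record_extraction",
        "description": "Record fields extracted from a financial statement.",
        "parameters": {
            "type": "object",
            "properties": {
                "company_name": {"type": "string"},
                "reporting_period": {"type": "string"},
                "net_profit": {"type": "number"},
            },
            "required": ["company_name", "reporting_period", "net_profit"],
        },
    },
}

def extract_structured(document_text: str) -> dict:
    response = client.chat.completions.create(
        model="gpt-4-turbo",  # assumed model choice
        messages=[{"role": "user",
                   "content": "Extract the requested fields from this document:\n\n" + document_text}],
        tools=[extraction_tool],
        tool_choice={"type": "function", "function": {"name": "record_extraction"}},
    )
    arguments = response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(arguments)  # schema-shaped, but values still need sanity checks
```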

                                                                                                                          What is your long-term validation approach?

                                                                                                                          Assuming you successfully build an AI solution that can replace manual data entry, should you completely stop manual data extraction? As with other questions, it depends on the use case, but it’s worth considering a gradual switch to full automation. For example, you can keep the manual process for 10% of new data to verify that the whole system works as expected. This is especially worth doing when working with publicly-available datasets, as there’s a non-zero chance that the models you’re using have seen the input training data before (though they probably haven’t seen your outputs).
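As a toy illustration of that gradual switch, the sketch below routes a random tenth of new documents to the existing manual process so that human and AI outputs can be compared over time; the rate and the routing scheme are assumptions, not details from any real system.

```python
# Sketch: keep a manual-extraction lane for a random slice of new documents,
# so automated outputs can be audited against human outputs over time.
# The 10% rate is an illustrative assumption.
import random

MANUAL_REVIEW_RATE = 0.1  # fraction of new documents kept on the manual path

def choose_lane() -> str:
    """Route roughly 10% of documents to manual extraction for ongoing validation."""
    return "manual" if random.random() < MANUAL_REVIEW_RATE else "automated"

if __name__ == "__main__":
    lanes = [choose_lane() for _ in range(10_000)]
    print(f"manual share: {lanes.count('manual') / len(lanes):.1%}")
```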

                                                                                                                          Anything else?

                                                                                                                          If I missed important questions, please let me know and I will update this post.


\ No newline at end of file
diff --git a/2024/04/01/artificial-intelligence-automation-and-the-art-of-counting-fish/index.html b/2024/04/01/artificial-intelligence-automation-and-the-art-of-counting-fish/index.html
index fa959bed7..db28181c6 100644
--- a/2024/04/01/artificial-intelligence-automation-and-the-art-of-counting-fish/index.html
+++ b/2024/04/01/artificial-intelligence-automation-and-the-art-of-counting-fish/index.html
@@ -1,5 +1,5 @@
 Artificial intelligence, automation, and the art of counting fish | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                            Artificial intelligence, automation, and the art of counting fish


                                                                                                                            View from Lord Howe Island’s Intermediate Hill (showcasing many RLS sites)

                                                                                                                            I recently returned from a Reef Life Survey (RLS) trip to Lord Howe Island. As a volunteer RLS diver, I record fish and invertebrates using methods that have remained unchanged for decades. Sites around Lord Howe Island have been consistently surveyed at least every two years since February 2006. This regularity and consistency in survey methods enables comparisons over time, which help inform marine park management decisions.

                                                                                                                            As a Data & AI specialist, I’ve wondered about the potential for automating RLS surveys since I first became a volunteer nearly ten years ago. Surely the data can be collected without using an underwater clipboard? Given advances in AI over the past decade, I believe that the answer is an emphatic yes – in principle.

                                                                                                                            However, just because something can be automated doesn’t mean it happens instantaneously. As the saying goes, the future is here, but it is unevenly distributed. In fact, Lord Howe Island provides another example of this uneven distribution: As of 2024, the island still lacks mobile phone reception, despite the feasibility of providing coverage.

                                                                                                                            Specifically for RLS automation, even though computers can in principle identify, count, and size fish better than humans, there are several key challenges on the path to automation:

                                                                                                                            • Consistency: Much of the value of RLS data comes from employing the same methods over decades. Using new survey methods would mean introducing different biases, which would need to be modelled when comparing new data to pre-automation data.
                                                                                                                            • Underwater logistics and costs: Replacing volunteer divers with robots isn’t simple, as the RLS methodology goes beyond swimming along a transect line and counting fish. Divers also need to look under ledges and in crevices, move kelp, and inspect the undersides of shells. This is probably within the abilities of underwater robots, but it’d be hard for such robots to compete with volunteer divers on cost.
                                                                                                                            • Boating logistics and costs: Anyone who’s been to sea knows that boating isn’t straightforward. Conditions vary, and things break all the time. Getting humans out of the loop can make things more expensive, especially when there are many volunteers like me, who happily pay to go to sea and dive with or without RLS. Unlike humans, robots don’t dive for fun.

Given the challenges, partial automation may be the way to go initially: Mount cameras on volunteers, collect many survey videos, and process them as a quality-control check. With enough work, the output of the processing should be similar to the survey output of the volunteers – or at a minimum, the biases will be better understood. This approach would enable modelling of biases in subsequent analyses, addressing the issue of consistency. At that point, it’d be possible to do away with the need for volunteers to record the data manually – they could just record videos of the dives. This would be an example of combining AI with citizen science to supercharge ecological monitoring.

                                                                                                                            If we accept partial automation as a desirable option, the key question becomes one of budget allocation. Collecting enough survey videos and doing the data and machine learning work isn’t a trivial exercise, i.e., it won’t be cheap. That said, I believe that with current technology the cost isn’t prohibitive either – it can probably be done as a PhD project with the right student and guidance. Still, whether the budget is best spent on partial automation or on other initiatives is an open question (that is not for me to decide).

                                                                                                                            In general, similar questions around budget allocation arise everywhere. While many of us in tech feel somewhat overwhelmed by the rate of progress in Data & AI, it is important to remember the uneven distribution of automation (treating AI as a synonym of automation makes it sound less magical). This uneven distribution means that opportunities abound for deploying proven technologies in various domains. These opportunities are unlikely to disappear overnight – they will be here for years to come. Focusing on solutions to real problems while maintaining an awareness of emerging tech (without chasing shiny objects) will remain the key to success. In particular, Data & AI engineering are likely to remain lucrative in coming years, though they will undoubtedly change with new tools and novel automation methods.

                                                                                                                            Artificial intelligence, automation, and the art of counting fish

                                                                                                                            View from Lord Howe Island's Intermediate Hill (showcasing many RLS sites)

                                                                                                                            View from Lord Howe Island’s Intermediate Hill (showcasing many RLS sites)

                                                                                                                            I recently returned from a Reef Life Survey (RLS) trip to Lord Howe Island. As a volunteer RLS diver, I record fish and invertebrates using methods that have remained unchanged for decades. Sites around Lord Howe Island have been consistently surveyed at least every two years since February 2006. This regularity and consistency in survey methods enables comparisons over time, which help inform marine park management decisions.

                                                                                                                            As a Data & AI specialist, I’ve wondered about the potential for automating RLS surveys since I first became a volunteer nearly ten years ago. Surely the data can be collected without using an underwater clipboard? Given advances in AI over the past decade, I believe that the answer is an emphatic yes – in principle.

                                                                                                                            However, just because something can be automated doesn’t mean it happens instantaneously. As the saying goes, the future is here, but it is unevenly distributed. In fact, Lord Howe Island provides another example of this uneven distribution: As of 2024, the island still lacks mobile phone reception, despite the feasibility of providing coverage.

                                                                                                                            Specifically for RLS automation, even though computers can in principle identify, count, and size fish better than humans, there are several key challenges on the path to automation:

                                                                                                                            • Consistency: Much of the value of RLS data comes from employing the same methods over decades. Using new survey methods would mean introducing different biases, which would need to be modelled when comparing new data to pre-automation data.
                                                                                                                            • Underwater logistics and costs: Replacing volunteer divers with robots isn’t simple, as the RLS methodology goes beyond swimming along a transect line and counting fish. Divers also need to look under ledges and in crevices, move kelp, and inspect the undersides of shells. This is probably within the abilities of underwater robots, but it’d be hard for such robots to compete with volunteer divers on cost.
                                                                                                                            • Boating logistics and costs: Anyone who’s been to sea knows that boating isn’t straightforward. Conditions vary, and things break all the time. Getting humans out of the loop can make things more expensive, especially when there are many volunteers like me, who happily pay to go to sea and dive with or without RLS. Unlike humans, robots don’t dive for fun.

Given the challenges, partial automation may be the way to go initially: Mount cameras on volunteers, collect many survey videos, and process them automatically as a quality-control step. With enough work, the output of the processing should be similar to the survey output of the volunteers – or at a minimum, the biases will be better understood. This approach would enable modelling of biases in subsequent analyses, addressing the issue of consistency. Once the automated output is well understood, it’d be possible to do away with the need for volunteers to record the data manually – they could just record videos of the dives. This would be an example of combining AI with citizen science to supercharge ecological monitoring.
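
To make the bias-modelling idea a bit more concrete, here is a minimal Python sketch of comparing model-derived counts with diver counts for the same transects. All table, column, and species names are hypothetical, and a real analysis would use a proper statistical model of method effects rather than simple means.

```python
import pandas as pd

# Hypothetical paired observations: for each transect and species, the count
# recorded by the volunteer diver and the count extracted from video by a model.
surveys = pd.DataFrame({
    "transect_id": [1, 1, 2, 2, 3, 3],
    "species": ["A. strigatus", "C. auratus", "A. strigatus",
                "C. auratus", "A. strigatus", "C. auratus"],
    "diver_count": [12, 3, 8, 5, 15, 2],
    "model_count": [10, 4, 7, 6, 13, 2],
})

# Crude per-species bias estimates: mean difference and ratio between methods.
bias = (
    surveys
    .assign(diff=lambda df: df["model_count"] - df["diver_count"],
            ratio=lambda df: df["model_count"] / df["diver_count"])
    .groupby("species")[["diff", "ratio"]]
    .mean()
)
print(bias)
```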

                                                                                                                            If we accept partial automation as a desirable option, the key question becomes one of budget allocation. Collecting enough survey videos and doing the data and machine learning work isn’t a trivial exercise, i.e., it won’t be cheap. That said, I believe that with current technology the cost isn’t prohibitive either – it can probably be done as a PhD project with the right student and guidance. Still, whether the budget is best spent on partial automation or on other initiatives is an open question (that is not for me to decide).

                                                                                                                            In general, similar questions around budget allocation arise everywhere. While many of us in tech feel somewhat overwhelmed by the rate of progress in Data & AI, it is important to remember the uneven distribution of automation (treating AI as a synonym of automation makes it sound less magical). This uneven distribution means that opportunities abound for deploying proven technologies in various domains. These opportunities are unlikely to disappear overnight – they will be here for years to come. Focusing on solutions to real problems while maintaining an awareness of emerging tech (without chasing shiny objects) will remain the key to success. In particular, Data & AI engineering are likely to remain lucrative in coming years, though they will undoubtedly change with new tools and novel automation methods.


\ No newline at end of file
diff --git a/2024/04/08/my-experience-as-a-data-tech-lead-with-work-on-climate/index.html b/2024/04/08/my-experience-as-a-data-tech-lead-with-work-on-climate/index.html
index d6b1951e5..92278e037 100644
--- a/2024/04/08/my-experience-as-a-data-tech-lead-with-work-on-climate/index.html
+++ b/2024/04/08/my-experience-as-a-data-tech-lead-with-work-on-climate/index.html
@@ -1,5 +1,5 @@
My experience as a Data Tech Lead with Work on Climate | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                              My experience as a Data Tech Lead with Work on Climate

After leaving my last startup gig last year, I gave myself time to explore. My plan was to build my own product business while doing a bit of consulting on the side. As I was researching the climate tech space, I attended Work on Climate’s Expert Office Hours to get feedback on one of my ideas. The session was informative – it helped me avoid the common techie trap of building something I would find hard to sell.

                                                                                                                              To my surprise, I received an email from a user researcher with Work on Climate shortly after the session. They wanted to interview me about my experience booking and attending the Expert Office Hours. This was a level of professionalism I wasn’t expecting from an organisation that’s mostly run by volunteers!

Following that experience, I poked around the Work on Climate website and saw they were looking for a volunteer Data Engineer on the Metrics & Data Team. While I was reluctant to do data engineering work full time, the specified time commitment of five hours per week for six months seemed manageable. I like maintaining awareness of what’s happening in the data engineering world, and this seemed like a productive way of doing it: supporting climate tech work and giving back to an organisation that has helped me. Further, I thought it’d be fun to join a global team of highly-skilled volunteers while figuring out my solo ventures.

                                                                                                                              Applying and joining the team

                                                                                                                              My impression of Work on Climate as a professional organisation was reinforced throughout the application and onboarding process:

                                                                                                                              • Despite being primarily run by volunteers, positions are advertised on a Work with Us page with descriptions that are similar to those of full-time jobs.
                                                                                                                              • In my application, I had to address key criteria: share why I wanted to volunteer, explain how I’d address common challenges with the role (listed in the position description), and describe a side project that demonstrates my volunteer experience.
                                                                                                                              • Following the submission of the application form, I had screening calls with a recruiter (Sarah Fowler), the team lead (Xanthe Travlos), and a data engineer on the team (Misha Panshenskov).

                                                                                                                              In general, this was much like a “normal” job application process, but condensed. If you look through the LinkedIn profiles above, you’ll see why: Work on Climate volunteers have extensive professional experience in the same fields they contribute to as part of the organisation.

The theme of volunteering being like a normal job but condensed extended throughout my onboarding and initial work. I went through the usual experience of getting access to systems, becoming familiar with the team and problems, and picking up my first introductory issues – just as I would if this were a full-time job. Beyond the condensed nature of the role, another big difference from a full-time job was the passage of time: As a volunteer workweek is only five hours, any project that spans a few workdays stretches into weeks of calendar time. This requires being more mindful of delivering incremental value than you would need to be in a full-time environment: Challenging, but doable.

                                                                                                                              Becoming a tech lead

                                                                                                                              One of the challenges listed in the job ad was:

                                                                                                                              We are a start-up in our data maturity—we are still figuring out how to use data and how best to model and transform it to meet wider team needs. Some things we do might not make sense to you—we’ll welcome your suggestions on how to improve things!

                                                                                                                              Other challenges were along the same lines, so my response to the application question about addressing them was:

                                                                                                                              The challenges listed sound like many organisations I’ve worked with, especially having a distributed time-constrained team with a data stack that grows organically and isn’t very well-documented. I don’t think there’s a silver bullet to address those challenges other than patiently getting up to speed and working with others to figure out the top priorities. It also makes sense to add documentation as part of my on-boarding. I suspect that prioritisation and planning given the volunteer time constraints will be key, but even when working with full-timers it’s often the case that there’s more work than time.

Indeed, that was pretty much what happened. While my original intent was to act as an individual contributor, I find it hard to hold back when I see ways to improve processes and systems. As promised in the ad, my suggestions were well received – but the response went beyond what I had expected: Xanthe suggested we become co-leads of the Metrics & Data team. This made sense given that our backgrounds are complementary – she has extensive product management experience, while I have more hands-on exposure to the data world. Therefore, we settled into a new team structure, with her retaining the team lead position and business-facing / product management responsibilities, and me setting the technical direction for the data platform.

                                                                                                                              This pattern matches what had happened before in some of my full-time roles. For example, in my work with Automattic, I became the tech lead for the experimentation platform after organically spotting the need for better experimentation processes when working on machine learning for marketing applications. With Automattic, as with Work on Climate, working as a tech lead alongside a capable product manager and team lead allowed us both to capitalise on our strengths.

                                                                                                                              A brief example to illustrate the need for a data tech lead: As promised in the job ad, one of the challenges with Work on Climate is that the data stack has evolved “organically”. This included data coming from scheduled Jupyter notebooks, various APIs (via a JavaScript codebase), and some transformations with dbt. As the notebooks weren’t well-maintained, it made sense to absorb them into other parts of the stack and reduce the maintenance load. As a tech lead, I help spot and prioritise such work, moving Work on Climate towards a minimum viable data stack that serves the organisation’s needs.
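
As an illustration of what “absorbing” a notebook can look like, here is a minimal Python sketch of turning an ad-hoc notebook transformation into a small, testable function that any scheduler or orchestrator can call. The function, table, and column names are made up for the example – it is not the actual Work on Climate code.

```python
import pandas as pd

def clean_signups(raw: pd.DataFrame) -> pd.DataFrame:
    """Normalise and deduplicate raw signup records (hypothetical example).

    Logic like this often starts life in a scheduled notebook; extracting it
    into a plain function makes it easy to unit test and run anywhere.
    """
    return (
        raw
        .assign(email=lambda df: df["email"].str.strip().str.lower())
        .drop_duplicates(subset="email")
        .dropna(subset=["signed_up_at"])
    )

def test_clean_signups_normalises_and_deduplicates():
    raw = pd.DataFrame({
        "email": ["A@x.com ", "a@x.com", "b@y.com"],
        "signed_up_at": ["2024-01-01", "2024-01-02", None],
    })
    cleaned = clean_signups(raw)
    assert list(cleaned["email"]) == ["a@x.com"]
```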

                                                                                                                              Challenges and opportunities

Much has been happening concurrently with my work with Work on Climate. My exploration of product ideas that I could bootstrap by myself led me to the realisation that the lines between solo consulting and product building are blurry. Borrowing from Jonathan Stark, my business strategy has turned into helping people I like get what they want, which seems easier to achieve as a solo consultant than by building a software product. While concise positioning remains a challenge, I started taking my consulting practice more seriously rather than seeing it as a side gig. My goal is to help climate tech and nature-positive startups with the sort of problems I’ve been helping Work on Climate and various companies with throughout my career, i.e., shipping Data & AI solutions. However, like many solo consultants, I’ve discovered that generating a pipeline of qualified leads is a key challenge – harder than the technical aspects of my work.

One unexpected challenge with my climate focus has been the October 7th attacks on Israel. While I’ve been living in Australia since 2009, I am a Jew from Israel, so I’ve been deeply affected by October 7th and its aftermath. Beyond the horror of the massacres, I was horrified by the response of some politicians and activists who claim to be “green”. Fortunately, I haven’t witnessed such responses within Work on Climate, where the internal reaction was one of support and understanding of the human suffering caused by wars. However, these events have led me to revise the criteria of “people I like” and want to help in the climate space. I definitely dislike those who promote Jew hatred or support calls for the destruction of Israel (which would result in the death of my family). Fortunately, this doesn’t exclude everyone, as there’s a fair number of Jews and generally-decent people who are focused on building climate solutions – truly hateful people are a loud minority.

                                                                                                                              Anyway, by the beginning of 2024 I managed to find a new balance and a professional direction as an independent consultant. Further, volunteering with Work on Climate has highlighted an opportunity for consulting engagements: I learned that it is possible to provide value as a data tech lead even with five hours per week. The trendy name for this is a fractional chief data/analytics/AI officer. My ideal clients for such engagements are startups around the stage of getting their first data hire, with a similar level of data maturity to Work on Climate.

That said, Work on Climate has some unique challenges and opportunities that don’t show up in startups with similar data maturity. On the one hand, given the large number of volunteers, keeping everyone in sync and breaking down silos is harder than with a smaller group of employees. On the other hand, the cost of “hiring” volunteers is in recruitment and onboarding rather than in ongoing salaries, which gives the organisation access to “free” fractional talent that most startups can only dream of. In any case, it is an interesting organisation to volunteer with if you come in with the right mindset. From a logistical viewpoint, it’s a good fit if you know you’ll have about five hours per week for at least six months, keeping in mind that the hours may be spread throughout the week (e.g., for calls and discussions with other volunteers).

                                                                                                                              Future moves

My original intention was to help Work on Climate for at least six months. I am now about nine months in. With my newfound consulting focus, I find that I have quite a bit to juggle outside Work on Climate. While I would love to do it all, I have less mental space to contribute to the organisation. Therefore, we are looking for someone to replace me in the coming months. However, I will likely still contribute on a more limited advisory basis. If you are interested in volunteering with Work on Climate, or know someone who would be a suitable data tech lead, please get in touch!


\ No newline at end of file
diff --git a/2024/04/15/ai-does-not-obviate-the-need-for-testing-and-observability/index.html b/2024/04/15/ai-does-not-obviate-the-need-for-testing-and-observability/index.html
index 059e63daf..ac646ebf2 100644
--- a/2024/04/15/ai-does-not-obviate-the-need-for-testing-and-observability/index.html
+++ b/2024/04/15/ai-does-not-obviate-the-need-for-testing-and-observability/index.html
@@ -1,9 +1,11 @@
AI does not obviate the need for testing and observability | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                                AI does not obviate the need for testing and observability

                                                                                                                                clunky untested bot on the left, slicker bot on the right

                                                                                                                                The excitement sparked by ChatGPT has led to a flood of funding for building AI applications, especially around large language models (LLMs). The ease of getting started with AI can lead to excessive enthusiasm, to the point of believing that we have entered a new regime of software development where old best practices no longer apply. The goal of this post is to demonstrate that we are still in the old regime: Testing and observability remain key to AI success beyond initial prototypes.

                                                                                                                                Bookmark and reuse if anyone tries to claim otherwise.

                                                                                                                                First, let’s acknowledge the fact that prototyping AI applications is now easier than ever. For example, I recently watched this video by Hrishi Olickel, which demonstrates how to go from zero to a working AI-powered app in about thirty minutes. Examples like this abound, but I have a feeling that people might miss two key messages from the video:

                                                                                                                                1. 99% of the time, the problem is with your data.
                                                                                                                                2. The app isn’t ready for production.

                                                                                                                                Two elements that solid production-level apps include are testing and observability. This is highlighted in recent posts by two consultants who are helping companies ship LLM-powered applications:

                                                                                                                                1. Your AI Product Needs Evals by Hamel Husain. Key quote: “Unsuccessful products almost always share a common root cause: a failure to create robust evaluation systems.”
                                                                                                                                2. Levels of Complexity: RAG Applications by Jason Liu. Level 3 is observability. Level 4 is evaluations.

                                                                                                                                The use of the word evaluations (or evals) by both authors is intentional. This is the common term for testing that deals with the challenges of working with LLMs (essentially a complex mapping from any text input to any text output). As noted in the OpenAI Evals repository:

                                                                                                                                If you are building with LLMs, creating high quality evals is one of the most impactful things you can do. Without evals, it can be very difficult and time intensive to understand how different model versions might affect your use case.

That is, far from being in a new regime where traditional software testing can be forgotten, production-level AI apps still require all the usual software tests, as well as AI-specific evaluations.
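
To make “evals” concrete, here is a minimal Python sketch of a property-based check on an LLM output – the kind of test that runs against every prompt or model change. The functions and example strings are made up for illustration; real eval suites are far richer (graded examples, regression sets, LLM-as-judge, and so on).

```python
def eval_summary(source: str, summary: str) -> list[str]:
    """Return a list of failed checks for a single (input, output) pair."""
    failures = []
    if not summary.strip():
        failures.append("empty summary")
    if len(summary.split()) > 50:
        failures.append("summary longer than 50 words")
    # Cheap grounding check: at least one long word from the source should appear.
    salient = {word.lower().strip(".,") for word in source.split() if len(word) > 6}
    if salient and not any(word in summary.lower() for word in salient):
        failures.append("summary shares no long words with the source")
    return failures

# Example: checking a candidate summary produced by a hypothetical LLM call.
source = (
    "Reef Life Survey divers have recorded fish and invertebrates "
    "at Lord Howe Island since 2006."
)
candidate = "Divers have recorded reef life at Lord Howe Island since 2006."
print(eval_summary(source, candidate))  # an empty list means all checks passed
```

Tracking the failure rate of such checks over a fixed input set, across model and prompt versions, is where evaluations meet observability.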

                                                                                                                                In a way, this is nothing new. Before ChatGPT drew significant attention to LLMs, much of the buzz was around traditional machine learning (ML) apps. And many of the best practices from ML engineering apply to LLM / AI engineering.

                                                                                                                                If you are inexperienced with shipping production-grade AI/ML/LLM applications, please don’t let it stop you from prototyping. But if you are getting serious about going beyond a prototype, it’s time to either get help from experienced AI engineers, or to become one yourself (experience is a great teacher). Just remember that there is no way around testing and observability if you want to ship a quality product.


\ No newline at end of file
diff --git a/2024/04/22/assessing-a-startups-data-to-ai-health/index.html b/2024/04/22/assessing-a-startups-data-to-ai-health/index.html
index 8314c35bd..87494967c 100644
--- a/2024/04/22/assessing-a-startups-data-to-ai-health/index.html
+++ b/2024/04/22/assessing-a-startups-data-to-ai-health/index.html
@@ -1,9 +1,11 @@
Assessing a startup's data-to-AI health | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                                  Assessing a startup's data-to-AI health

                                                                                                                                  a person dressed as a doctor conducting a health check on a screen

                                                                                                                                  In the past year, I went from exploring product ideas to committing to my current consulting practice. One thing that became apparent was that I needed to get better at communicating my unique value proposition: who I serve, and how I can help them. The circus that is data (/AI/ML/BI/analytics/…) titles and terminology definitely doesn’t help. Sprinkle a couple of decades of hype cycles on top, and you end up where we are today: a mess of inflated expectations followed by disappointments. But also a wealth of opportunities to generate genuine value.

                                                                                                                                  Anyway, I’m now fairly clear on whom I’m actively targeting: funded Australian startups (around seed to series A) in the climate & nature tech space, who can use help on their data-to-AI journey. I started calling it data-to-AI rather than data & AI or data/AI/ML because anything AI (/ML/data science/…) or analytics starts with data – and keeps going back to data.

                                                                                                                                  How I can help isn’t as clearly communicated as I’d like it to be, so I’ve been working on that. Offerings at the cheap and expensive ends of the spectrum are easy to explain: one-off advisory calls include bespoke on-the-spot advice, while fractional chief data & AI officer engagements include similar responsibilities to those of a full-timer with the same title. However, it’s in nobody’s best interest to jump straight into a fractional relationship. To address this, I’ve been working on a standard offering that’d be more structured than advisory calls, deliver value to the client, and allow both parties to uncover opportunities and see how we work together.

                                                                                                                                  My working title for the offering is Data-to-AI Health Check (better suggestions welcome). The idea is to assess where the startup stands with their data/AI/ML stack and capabilities, and identify the top opportunities for improvement.

                                                                                                                                  This has been on my mind for a while, so I’ve collected a heap of documents and questions for inspiration. I’m now at the “too overwhelmed” phase of turning it into something I can present, but hopefully I’ll have it all sorted in the coming weeks.

                                                                                                                                  In the meantime (and in the spirit of building in public), the rest of this post describes the areas I think are most important to assess. Suggestions for areas I might have missed are welcome. In future posts, I’ll add more detail on performing the assessment, which will undoubtedly evolve as I offer it to more clients.

                                                                                                                                  Assessment areas

                                                                                                                                  Product and business model. Understanding what the startup is about and where it’s going is key to understanding where data/AI/ML fit in. One useful lens is determining whether the product is ML-centric or non-ML, with non-ML products varying in their data intensity from data-centric to data-supported. It’s also important to understand key metrics and how they’re measured.

                                                                                                                                  People. Who’s working for the company and what is the team structure? In particular, what are the current data/AI/ML capabilities and experience? Can the current staff deliver what the business needs? If there are skill gaps (e.g., they haven’t yet made their first data hire), what’s the plan to address them? Can the current team adequately assess the skills of data people?

                                                                                                                                  Processes and project management. The best people will fail to deliver projects if the company’s processes have deep flaws. My general opinion is that all the best practices from software development can and should be applied to data projects (e.g., see posts from 2023 and 2018). However, data entropy and the probabilistic nature of AI/ML require extra care and practices in addition to traditional software development.

Culture. Knowing who is on the team and what processes are in place isn’t enough to assess how well the team can deliver the product vision. Culture – the unwritten norms and beliefs of the company – matters. A lot. For example, if the founder doesn’t tolerate data-backed evidence that contradicts their preconceived notions, it’s likely to be an impediment to data/AI/ML project delivery. Similarly, it’s worth paying attention to how experiments are treated: If a hypothesis behind an experiment turns out to be unsupported, it’s not a failure. Failing to learn from experiments is the true failure.

                                                                                                                                  Data. What data is the company dealing with? What are the data’s volume, velocity, and variety? Is all the necessary data being captured? How clean is it? Where is it stored and how is it processed? What data management practices are in place, both explicitly and implicitly?

                                                                                                                                  Tech. Closely related to data is the tech architecture, systems, and software. Tech includes where the data lives and how it flows, particularly how it feeds into AI/ML/analytics applications. Of particular interest is the allocation of innovation tokens. Innovation tokens should be spent on tech that makes the startup meaningfully unique to its customers. Everything else should be boring and standard, i.e., proven to work and fit for purpose.

                                                                                                                                  Security and compliance. Security is interwoven through all of the above. For example, you want a culture where any person can flag security risks – some of which may only be visible if you’re close to the code and data. Security breaches and data leaks can destroy companies, especially young startups that haven’t earned customer trust yet. Particular attention should be paid to compliance issues that arise with data collection, e.g., around personal and regulated data.

                                                                                                                                  Other opportunities and risks. In exploring the above areas, issues that don’t fit neatly into any bucket are likely to be uncovered. These may be new opportunities or risks. It’s important to keep an eye out for such cases and flag them accordingly.
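
For illustration, the areas above could be captured as a simple structured checklist that travels from one assessment to the next. The Python sketch below is indicative only – the area names come from this post, but the questions are placeholders, and the real question set is still being distilled (as discussed next).

```python
from dataclasses import dataclass

@dataclass
class AssessmentArea:
    name: str
    questions: list[str]
    notes: str = ""  # filled in during the health check

HEALTH_CHECK = [
    AssessmentArea("Product and business model", [
        "Is the product ML-centric, data-centric, or data-supported?",
        "What are the key metrics and how are they measured?",
    ]),
    AssessmentArea("People", [
        "Can the current team deliver what the business needs?",
        "Who assesses the skills of data hires?",
    ]),
    AssessmentArea("Data", [
        "What are the data's volume, velocity, and variety?",
        "Where is the data stored and how is it processed?",
    ]),
    # ...plus processes, culture, tech, security & compliance, and other risks.
]

for area in HEALTH_CHECK:
    print(f"{area.name}: {len(area.questions)} starter questions")
```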

                                                                                                                                  Closing thoughts

In my experience, it’s easy to find a thousand areas for improvement once you become familiar with a startup or a large company division. It’s harder to identify the top three items to work on next – doing so is a bet on the highest-impact items that are feasible to deliver.

                                                                                                                                  It is also a challenge to distill the Data-to-AI Health Check to a set of questions that would probe the right areas without burdening the startup too much. I’ll report back once I’ve figured it out. In the meantime, comments are welcome!

                                                                                                                                  Subscribe +

                                                                                                                                  Assessing a startup's data-to-AI health

                                                                                                                                  a person dressed as a doctor conducting a health check on a screen

                                                                                                                                  In the past year, I went from exploring product ideas to committing to my current consulting practice. One thing that became apparent was that I needed to get better at communicating my unique value proposition: who I serve, and how I can help them. The circus that is data (/AI/ML/BI/analytics/…) titles and terminology definitely doesn’t help. Sprinkle a couple of decades of hype cycles on top, and you end up where we are today: a mess of inflated expectations followed by disappointments. But also a wealth of opportunities to generate genuine value.

                                                                                                                                  Anyway, I’m now fairly clear on whom I’m actively targeting: funded Australian startups (around seed to series A) in the climate & nature tech space, who can use help on their data-to-AI journey. I started calling it data-to-AI rather than data & AI or data/AI/ML because anything AI (/ML/data science/…) or analytics starts with data – and keeps going back to data.

                                                                                                                                  How I can help isn’t as clearly communicated as I’d like it to be, so I’ve been working on that. Offerings at the cheap and expensive ends of the spectrum are easy to explain: one-off advisory calls include bespoke on-the-spot advice, while fractional chief data & AI officer engagements include similar responsibilities to those of a full-timer with the same title. However, it’s in nobody’s best interest to jump straight into a fractional relationship. To address this, I’ve been working on a standard offering that’d be more structured than advisory calls, deliver value to the client, and allow both parties to uncover opportunities and see how we work together.

                                                                                                                                  My working title for the offering is Data-to-AI Health Check (better suggestions welcome). The idea is to assess where the startup stands with their data/AI/ML stack and capabilities, and identify the top opportunities for improvement.

                                                                                                                                  This has been on my mind for a while, so I’ve collected a heap of documents and questions for inspiration. I’m now at the “too overwhelmed” phase of turning it into something I can present, but hopefully I’ll have it all sorted in the coming weeks.

                                                                                                                                  In the meantime (and in the spirit of building in public), the rest of this post describes the areas I think are most important to assess. Suggestions for areas I might have missed are welcome. In future posts, I’ll add more detail on performing the assessment, which will undoubtedly evolve as I offer it to more clients.

                                                                                                                                  Assessment areas

                                                                                                                                  Product and business model. Understanding what the startup is about and where it’s going is key to understanding where data/AI/ML fit in. One useful lens is determining whether the product is ML-centric or non-ML, with non-ML products varying in their data intensity from data-centric to data-supported. It’s also important to understand key metrics and how they’re measured.

                                                                                                                                  People. Who’s working for the company and what is the team structure? In particular, what are the current data/AI/ML capabilities and experience? Can the current staff deliver what the business needs? If there are skill gaps (e.g., they haven’t yet made their first data hire), what’s the plan to address them? Can the current team adequately assess the skills of data people?

                                                                                                                                  Processes and project management. The best people will fail to deliver projects if the company’s processes have deep flaws. My general opinion is that all the best practices from software development can and should be applied to data projects (e.g., see posts from 2023 and 2018). However, data entropy and the probabilistic nature of AI/ML require extra care and practices in addition to traditional software development.

                                                                                                                                  Culture. Knowing what people are on the team and what processes are in place isn’t enough to assess how well the team can deliver the product vision. Culture – the unwritten norms and beliefs of the company – matters. A lot. For example, if the founder doesn’t tolerate data-backed evidence that contradicts their preconceived notions, it’s likely to be an impediment to data/AI/ML project delivery. Similarly, it’s worth paying attention to how experiments are treated: If a hypothesis behind an experiment turns out to be unsupported, it’s not a failure. Failing to learn from experiments is the true failure.

                                                                                                                                  Data. What data is the company dealing with? What are the data’s volume, velocity, and variety? Is all the necessary data being captured? How clean is it? Where is it stored and how is it processed? What data management practices are in place, both explicitly and implicitly?
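
As an illustration of how the “how clean is it?” question might be approached, the sketch below profiles a single table with pandas. The file path and column names are hypothetical placeholders:

    # Hypothetical first-pass data profiling (assumes pandas is available).
    import pandas as pd

    events = pd.read_csv("events.csv")  # placeholder extract of an events table

    profile = {
        "rows": len(events),
        "duplicate_rows": int(events.duplicated().sum()),
        "null_rate_per_column": events.isna().mean().round(3).to_dict(),
    }
    print(profile)

    # Check a timestamp column for gaps in capture (column name is a placeholder).
    events["created_at"] = pd.to_datetime(events["created_at"])
    daily_counts = events.set_index("created_at").resample("D").size()
    print("days with zero captured events:", int((daily_counts == 0).sum()))

None of this replaces proper data quality tooling, but it quickly surfaces obvious gaps in capture and cleanliness.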

Tech. Closely related to data are the tech architecture, systems, and software. Tech includes where the data lives and how it flows, particularly how it feeds into AI/ML/analytics applications. Of particular interest is the allocation of innovation tokens. Innovation tokens should be spent on tech that makes the startup meaningfully unique to its customers. Everything else should be boring and standard, i.e., proven to work and fit for purpose.

                                                                                                                                  Security and compliance. Security is interwoven through all of the above. For example, you want a culture where any person can flag security risks – some of which may only be visible if you’re close to the code and data. Security breaches and data leaks can destroy companies, especially young startups that haven’t earned customer trust yet. Particular attention should be paid to compliance issues that arise with data collection, e.g., around personal and regulated data.

Other opportunities and risks. In exploring the above areas, you’re likely to uncover issues that don’t fit neatly into any bucket. These may be new opportunities or risks. It’s important to keep an eye out for such cases and flag them accordingly.

                                                                                                                                  Closing thoughts

In my experience, it’s easy to find a thousand areas for improvement once you become familiar with a startup or a large company division. It’s harder to identify the top three items to work on next – a bet on the highest-impact items that are feasible to deliver.

                                                                                                                                  It is also a challenge to distill the Data-to-AI Health Check to a set of questions that would probe the right areas without burdening the startup too much. I’ll report back once I’ve figured it out. In the meantime, comments are welcome!

                                                                                                                                  Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                                                                    Mentorship and the art of actionable advice

                                                                                                                                    ChatGPT's depiction of a robot mentoring a robot

                                                                                                                                    One of my challenges with the transition to solo consulting is learning to deliver timely, actionable advice. It’s usually easy for me to identify many areas for improvement. Distilling a long list of “obvious” opportunities to the top items that would make a difference is harder. And the hardest thing is packaging it all up as timely advice that people can act on.

                                                                                                                                    To help address this challenge, I recently joined EnergyLab and GrowthMentor as a mentor. EnergyLab is Australia’s largest climate tech startup accelerator, while GrowthMentor is an international platform for mentorship around startup growth. Both are relevant to my focus on helping leaders of climate/nature tech startups ship data-intensive solutions (including AI/ML, data science, and advanced analytics – this stuff goes by many confusing and hyped-up names).

                                                                                                                                    The rest of this post presents some of my reflections on packaging advice and expertise. I’m always happy to discuss these topics and connect directly with people I may be able to help, so please feel free to reach out with feedback.

                                                                                                                                    Actionable and timely advice by example

                                                                                                                                    We all know we should get enough sleep. But telling a busy insomniac with young children to “sleep more” isn’t actionable. It’s more helpful to provide them with specific strategies for improving their sleep hygiene, like keeping screens out of their bed. And the more specific, the better: “At 10pm, put your phone to charge in another room” gives them exactly one thing they can do tonight.

                                                                                                                                    There is more to it, though. If your insomniac friend comes to you complaining about having a bad night, they’re probably not expecting advice on where to charge their phone – at least not in that specific moment. The timing of the advice can make all the difference between them following it and them doing nothing (or worse – getting annoyed by your lack of empathy).

                                                                                                                                    The same goes for advising anyone about anything.

In my case, advice of a general nature like “your data should be clean, relevant, and plentiful” is nice – but it’s also kinda useless. Getting more specific on strategies and tools is better, e.g., “consider dbt to manage and test data transformations”. Getting to the root of what they want to achieve may yield completely different advice, like “don’t worry about dbt for now; if you want that ML project you mentioned to succeed, you need to instrument and start collecting data around feature X of your product as soon as possible.”

                                                                                                                                    On listening and packaging expertise

To get to a point of giving timely, actionable advice, you need more than functional expertise. It’s important to listen to what the other person is saying (and not saying), and to figure out what they’re most likely to respond to. This is easier with people with whom you’ve already built a relationship than with new acquaintances – which makes the challenge of mentoring at scale all the more interesting.

                                                                                                                                    One key aspect is aligning on expectations. Coming at it from the mentor side, I aim to be transparent about where I can and cannot help, so as to only attract mentees who are likely to be a good fit. However, after almost twenty years in the tech industry and over a decade in data / AI / engineering roles with startups & scaleups, it’s hard to succinctly describe my area of expertise. For example, I liked the label data scientist when it became popular around 2012, but both the label and I have changed over the years. There are major differences between my experience and that of a new data scientist who is fresh out of university. Me using a commodity label like data scientist is not in anyone’s best interest.

                                                                                                                                    Aligning on expectations is easier in close long-term relationships. In our professional lives, such relationships are commonly formed when working for one employer at a time. Indeed, most of my work experience was that of an employee. And like many employees with long-term roles, it was easy for me to identify opportunities for improvement and provide actionable advice to my colleagues. There is a lot of implicit listening going on when you are dedicated to a single employer!

                                                                                                                                    In the absence of a long-term relationship, it’s important to communicate expectations ahead of time. For example, this is what I put in as my “support offered” for EnergyLab founders:

                                                                                                                                    Advice on data strategy, data hiring, AI/ML projects, data science, advanced analytics, and data-intensive solutions.

                                                                                                                                    I have over a decade of experience in data / AI / engineering roles with Australian startups (most famous: Car Next Door / Uber Carshare & Orkestra), international scaleups (Automattic / WordPress.com), and big tech (Intel / Qualcomm / Google). This means I also have many opinions on tech and startups beyond my specific expertise, which may be of use to some founders. :)

If there’s a potential fit, the next step on my end is to listen. My aim is to only offer mentorship in situations where I add value. Redirecting founders to others in my network who may be a better fit than me is a better outcome than attempting to give advice on topics that fall outside my area of expertise.

                                                                                                                                    True experts are always learning

                                                                                                                                    Another key aspect of providing advice as a mentor/expert is recognising that no one knows everything. Even within narrow areas of Data & AI, things are moving so fast that even the most knowledgeable people have no chance of keeping up.

                                                                                                                                    However, expertise is a relative term. I know more about shipping data-intensive solutions than a non-technical CEO, so I can probably help them (especially if they don’t have in-house data experts). I know less about PyTorch internals than an ML engineer who has been focused solely on deep learning for the past decade, so I’ll defer to such experts when deep PyTorch expertise is needed.

As another analogy, consider a general practice doctor named Amy – she is a medical expert in comparison to most of the population. But Amy wouldn’t try to perform brain surgery – she’d refer you to a neurosurgeon (Barbara?), who is an expert in comparison to Amy.

                                                                                                                                    Things are fuzzier in the unregulated software and data worlds. Memorably, the young child of a past manager one day announced: “My computer has data on it! I am a data scientist!” The equivalent of such pronouncements in the adult world was the swift shift of LinkedIn titles in the years after 2012 – peak data science hype. By contrast, declaring yourself a medical doctor will land you in prison in many countries.

                                                                                                                                    In the absence of regulated data expertise (which is probably undesirable), we are left with heuristics for determining who should be providing data advice. One of my favourite heuristics aligns with GrowthMentor’s core value of humility. In their words: “Nobody knows everything and we should all be open to hearing a different perspective on what we are working on. […] Opening yourself up to feedback from your peers will not only make you a stronger person, but also lead to more confidence in your professional life.”

                                                                                                                                    To me, this is the sign of a true expert: Knowing that you still have a lot to learn. And this brings me back to what I’m aiming to learn and improve through mentorship: Giving timely, actionable advice outside the context of employee-employer relationships.

                                                                                                                                    I’ll report back on how it goes in the future.

Subscribe


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                                                                      Business questions to ask before taking a startup data role

                                                                                                                                      black box with a question mark

                                                                                                                                      If you join a startup as an early employee, you’re essentially an investor. But unlike capital investors, you can’t diversify your portfolio as a full-timer. You need assurances that your time investment is likely to yield a positive return. Ideally, this would be a better return than the return on any other use of your work time.

                                                                                                                                      Good startups guarantee a return for early employees by paying a competitive base salary. Typically, compensation also includes equity in the company, which vests over time. However, unlike equity in publicly-traded companies, there’s a good chance that your startup equity will be worthless.

                                                                                                                                      To help you assess the value of your startup equity, this post presents questions from the Product & Business Model section of my Data-to-AI Health Check for Startups. In creating the health check, I realised I could have titled it “questions I should have asked past employers”, i.e., you get to learn from my mistakes! The health check has seven other areas, which I may cover in future posts.

                                                                                                                                      Before proceeding, note these assumptions:

                                                                                                                                      1. You don’t need to take a specific job offer urgently, i.e., you have multiple options or time to search for better options. If there’s urgency, you can take a suboptimal role, build your skills and savings, and aim to generate better options on your next search.
2. You’re not seen as too junior to be asking deep questions. If you’re early in your career, consider whether early-stage startups are for you: My view is that you may be better off working with an established company, where there are more structured mentorship opportunities.
                                                                                                                                      3. The startup isn’t proposing an unpaid equity-only position. If you’re expected to work full-time without a salary, you’re a founder.
                                                                                                                                      4. The startup is small enough for you to speak with the founders as part of the recruitment process (maybe <100 employees).
                                                                                                                                      5. The founders aren’t refusing to answer your questions. If they are, move on.

                                                                                                                                      Investor-level product & business questions

                                                                                                                                      Most of the following questions are typically answered by a pitch deck, so asking the founders to take you through the pitch may be the most time-efficient way of getting answers. Depending on the stage of product development, you may also be able to gather some answers yourself from the company’s website.

                                                                                                                                      While these are investor-level questions, your assessment of the answers should be different from that of an investor. You don’t have dozens of other startups in your portfolio – only one. Better make sure it’s a good one.

                                                                                                                                      Q1: What is your company’s purpose? What problem are you solving and why? You’re looking for a plausible story and solution. It’s important to understand founder motivations, and assess whether you want to spend many of your waking hours bringing their vision to life.

                                                                                                                                      Q2: What does your product do? If the product is already live, a demo would be best to answer this. Otherwise, wireframes and other plans would be good enough. If the product isn’t live yet, watch out for unrealistic plans. You don’t want to work on a product that will never get released.

                                                                                                                                      Q3: What are the relevant market sizes (TAM/SAM/SOM)? TAM is total addressable market – the total market demand for the product, which helps assess the growth potential. SAM is serviceable addressable market – the market demand the product can plausibly fulfill, which helps assess revenue targets. SOM is serviceable obtainable market – the part of the market that the startup’s product can capture, which helps assess short-term growth potential.

                                                                                                                                      Q4: Where do the problem, market, and solution sit on Jason Cohen’s problem flowchart? While TAM/SAM/SOM are useful high-level metrics, the problem flowchart goes deeper into assessing the viability of a startup. A surprising number of startups skip such assessments, and fail as a result. You may regret joining startups that make such preventable blunders.

                                                                                                                                      Q5: What is on the product roadmap for the next 6-12-24 months? In startup-land, plans for 12-24 months are in the realm of wishful thinking, but it’s good to have an idea of the general direction. Knowing what’s on the roadmap for the next six months will help you assess whether you want to come on board.

                                                                                                                                      Q6: What is the business model, i.e., how do you make money? Together with the other questions, this will help you assess the viability of the business. You should be especially wary if the founders haven’t figured out how to generate revenue yet, which means they’ll have to raise money to keep paying you. If they’re not seeing healthy growth in other key metrics (e.g., number of active users), they’ll struggle to raise more funding.

                                                                                                                                      Q7: What is the competition? How’s your product differentiated in the eyes of customers? How hard is it for competitors to copy you? Founders should have solid knowledge of the competitive landscape, and be able to explain why customers choose their product over the competition – and why they’ll continue to do so. Steer clear of founders who exhibit a low understanding of customer wants and needs. The company’s value ultimately comes from making something people want.

                                                                                                                                      Q8: What are the key business metrics (definitions, values, and trajectories)? This is especially pertinent if you’re the type of data person who’s going to get deep into business metrics as part of your job (a data scientist/analyst, as opposed to a data/AI/ML engineer). But regardless of role, it’s important for you to know how the business is performing. You should be confident that startup executives are measuring the right things.

                                                                                                                                      Q9: Since the last raise, how has the company performed against its goals? This includes goals that are covered by the key business metrics, as well as product development milestones. Repeatedly failing to achieve self-imposed goals is often a red flag – the goals may be unrealistic, and the business may not be viable.

                                                                                                                                      Q10: How much runway is left before another raise is needed? This is critical for employees to know. For example, if there are only three months left before the startup runs out of money, you may be out of a job pretty quickly. Note that the question still applies if the startup is bootstrapped (i.e., self-funded or funded by revenue) – money needs to come from somewhere to cover your salary.
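
The arithmetic behind runway is simple enough to sanity-check on the spot: cash divided by net monthly burn. Here is a back-of-the-envelope sketch with all figures hypothetical:

    # Rough runway check (all numbers are made up for illustration).
    cash_in_bank = 900_000      # current cash balance
    monthly_costs = 120_000     # salaries, cloud, office, etc.
    monthly_revenue = 45_000    # recurring revenue, if any

    net_monthly_burn = monthly_costs - monthly_revenue
    runway_months = cash_in_bank / net_monthly_burn
    print(f"~{runway_months:.1f} months of runway")  # prints ~12.0 months

If the founders’ stated runway and the numbers they share don’t roughly reconcile, that’s worth probing further.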

                                                                                                                                      Data-to-AI product & business questions

                                                                                                                                      While the above questions should be asked by any early startup employee, you should also get answers for the following questions if you’re considering a data/AI/ML role. If you’re the first data hire, pay specific attention to answers that indicate that the startup isn’t ready for a data hire, or that you may have to wear hats you’re unwilling to wear. For example, if you’re passionate about advanced AI/ML modelling but there are gaps in data engineering and basic analytics, you’re likely to be the one doing the data work to address those gaps.

                                                                                                                                      Q11: What is the data intensity of the product on a scale of 1-5? High data intensity typically requires low-latency processing of large volumes of data with more than one database server. With high intensity, a slowdown in data processing would noticeably affect key business metrics. High data intensity means that solid data engineering skills are required for success – it’s important to ascertain that founders are aware of this requirement.

                                                                                                                                      Q12: Is advanced AI/ML core to the product? What if you used simple heuristics? One issue with AI/ML is the hype. AI is indeed transformative and exciting, but using AI isn’t always required for the product to succeed. In the words of Google’s first rule of ML: “Don’t be afraid to launch a product without machine learning”. As a data professional and an outsider, you are in a good position to assess whether advanced AI/ML has to be core to the product. The answer should only be yes if it would make a difference in the eyes of the customers. Using AI/ML too early is often a premature optimisation. You should assess whether the added complexity of dealing with MLOps is justified.
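
To illustrate what a “simple heuristics” alternative can look like, here is a hypothetical popularity baseline for a recommendation feature that requires no model training. The data structures and names are made up for illustration:

    # Hypothetical heuristic baseline: recommend popular items the user
    # hasn't interacted with yet, instead of training an ML model.
    from collections import Counter

    purchases = [  # (user_id, item_id) pairs; stand-in for real interaction data
        ("u1", "A"), ("u1", "B"), ("u2", "A"), ("u3", "A"), ("u3", "C"),
    ]

    def recommend(user_id, purchases, top_n=2):
        seen = {item for uid, item in purchases if uid == user_id}
        counts = Counter(item for _, item in purchases)
        ranked = [item for item, _ in counts.most_common()]
        return [item for item in ranked if item not in seen][:top_n]

    print(recommend("u1", purchases))  # ['C'], the only popular item u1 hasn't bought

If a baseline like this is already good enough in the eyes of customers, the case for immediate investment in advanced AI/ML (and the MLOps that comes with it) is much weaker.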

                                                                                                                                      Q13: Are you planning to increase data intensity or advanced AI/ML use? Why? This question is similar to the one about the product roadmap, but specific to data/AI/ML. Again, the Why is key – ensure that there’s a solid business case for increased data/AI/ML complexity. In a healthy startup, increased complexity is driven by customer need, not by excitement about shiny tech.

                                                                                                                                      Q14: Are any decisions routinely blocked or delayed by limited access to data? This question helps assess gaps in data collection and quality, as well as the company’s culture around the use of data. It should also help you understand what sort of work is likely to be needed, e.g., even if there are plans to use more advanced AI/ML, the reality of data gaps may mean that plenty of data engineering work is needed.

                                                                                                                                      Feedback welcome

                                                                                                                                      If you found the above questions helpful or if you have any other feedback, I’d love to hear from you. I’m planning to evolve my Data-to-AI Health Check over time and post more on the other areas you should ask about. Subscribing for updates is the best way to get notified when it happens.

Subscribe


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                                                                        Probing the People aspects of an early-stage startup

                                                                                                                                        startupy people in a startupy space around a massive question mark (good old ChatGPT...)

                                                                                                                                        A successful startup is fundamentally a group of people who execute well together. Building a viable product is key, but changing the product is easier than changing the founders. If you’re considering a role at an early-stage startup, you will be investing your best waking hours in the company. It’s important to assess the team before making such an investment.

                                                                                                                                        To learn about the team, ask questions from the People section of my Data-to-AI Health Check for Startups. Some questions emphasise probing Data/AI/ML capabilities – an area that’s often misunderstood by non-specialists. However, this emphasis can be shifted to different functional areas as needed. Similarly to my previous post on scrutinising the Product & Business Model, the rest of this post lists my questions along with brief opinionated explanations.

                                                                                                                                        People questions

Q1: Who are the founders? What are their skills and experience? Founders make or break a startup. It’s important to gain confidence that they have the skills and experience required to build the company, along with the mindset needed to keep learning and developing relevant skills. Previously successful founders are an especially positive sign – prior success indicates the persistence and flexibility needed to build a business.

                                                                                                                                        Q2: What motivates the founders? How passionate are they about the startup’s problem space? My favourite founders are those who build a business based on their deep understanding of customer problems in an area they deeply care about. For example, I previously worked with Orkestra – a software-as-a-service startup that grew directly out of the founders’ experience as consultants. In Orkestra’s case, the founders had already spent years working together solving customer problems prior to founding the company. By contrast, some startups are founded by near-strangers just because the founders want to build something – a red flag.

Q3: Have any founders left? Why and how? Startups can turn friends into foes, and foes holding a significant share of the company may lead to its destruction. But even when founders leave on good terms without significant equity, their departure stories help you understand the founders’ personalities and the trajectory of the company. For example, if you’re considering a full-time position and the story of how the remaining founders treated departing founders gives you pause, you’re better off working elsewhere.

                                                                                                                                        Q4: Who are the key employees? Early employees are almost as important to startup success as founders. In fact, I know of multiple cases where employees “became” founders even though they weren’t there from day one. For example, after my PhD I joined Giveable as a founding data scientist. As the first employee, I was in charge of building the backend for Giveable’s B2C gift recommendation web app. Due to market conditions, we pivoted to a B2B recommender-as-a-service offering – not what the original founder had envisioned. He decided to move on, and I was left with much more equity than originally planned, along with the rights to the code. While I could have kept going as a “founder”, I decided to use the codebase to continue building the same B2B product as part of a more established ecommerce startup.

                                                                                                                                        Q5: Have any key employees left (including involuntarily)? Why and how? With early employees being almost as important to startup success as founders, stories of their departures can be as informative as stories of founder departures. If you’re considering a startup job, these stories can tell you a lot about founder-employee dynamics, before you become an employee. If you’re especially thorough, you can even reach out to the former employees to get their side of the story. Positive signs include low employee turnover and founders who are comfortable with you speaking to their former employees.

                                                                                                                                        Q6: How committed are the founders and key employees (partly measured by work time spent on the startup)? Early on, it’s common for founders and employees to be involved on a part-time basis. This is fine, but if you’re going to commit a significant chunk of your time to the startup, you need to know who you’ll be working with. As an employee, you can usually ignore the big names that are listed as advisors on the startup’s website – their involvement is typically minimal. That said, both advisors and fractional contractors provide access to expertise and connections that may not be necessary on a full-time basis. In fact, fractional help is much better than premature hiring, which unnecessarily burns through funding. The main things to look at in an answer to the commitment question are: (1) transparency; and (2) that committed staff have the skills needed to achieve the next milestones.

                                                                                                                                        Q7: What hiring practices do you follow? How do you assess the skills of new experts (e.g., first data hire)? Given the importance of early employees, a loose hiring process is a cause for alarm. However, thoughtlessly borrowing hiring practices from the likes of Google is also problematic, as such processes are laughably hackable and tedious to everyone involved. Startups can and should move faster on hiring than established players: My favourite hiring processes include paid work on real problems after an initial low-cost filter. These are hard to scale, but there’s no need to scale hiring in the early days. Paying for work on real problems also helps address the challenge of assessing the skills of new experts – they are judged on real work output rather than on confidence, pedigree, and performance on convoluted tasks.

Q8: Do you pay market rates? Startups that don’t pay market rates are best avoided. They’re unlikely to attract and retain quality employees. Founders of such startups may also fall victim to classic fallacies like the mythical man-month (the subject of Fred Brooks’s 1975 book), and make expensive mistakes like hiring two mediocre engineers in place of one excellent engineer. When it comes to software (and data) development, higher quality often incurs a lower overall cost. Paying market rates and hiring great people is the way to go, especially in the age of AI-powered interns.

Q9: Are there any critical skill gaps among current personnel (especially around data/AI/ML)? If you’re asking this question as a candidate, you’re probably going to fill one of the gaps. However, gaps are relative to what the startup is trying to do. For example, if they have ambitious AI/ML plans that require a range of data skills they don’t have on the current team (from data engineering through data science to AI/ML engineering), they had better be planning to hire more than one junior data generalist.

                                                                                                                                        Q10: What’s the hiring roadmap for the next 6-12-24 months? How will it affect the runway? Is it dependent on new funding or revenue growth? Startup founders usually have grand plans – that’s what you want from founders! But plans for 12-24 months are often in the realm of wishful thinking, and a lot can change even in six months. As a candidate, try to get a realistic view of the hiring that is highly likely to happen, along with the hiring that is dependent on new money coming in. Assuming that the latter doesn’t happen due to a cashflow crunch, would you still take the job?

                                                                                                                                        Even more questions?

                                                                                                                                        This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                        You can download a guide containing all the questions as a PDF. The next area I’ll cover is Culture – how people work together. Feedback is always welcome!


\ No newline at end of file
diff --git a/2024/05/20/question-startup-culture-before-accepting-a-data-to-ai-role/index.html b/2024/05/20/question-startup-culture-before-accepting-a-data-to-ai-role/index.html
index 627aef3e6..9d4c9f2e8 100644
--- a/2024/05/20/question-startup-culture-before-accepting-a-data-to-ai-role/index.html
+++ b/2024/05/20/question-startup-culture-before-accepting-a-data-to-ai-role/index.html
@@ -1,9 +1,11 @@
Question startup culture before accepting a data-to-AI role | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                                          Question startup culture before accepting a data-to-AI role

                                                                                                                                          an illustration of the data 'garbage in, garbage out' concept

                                                                                                                                          AI shmAI. If you’ve been paying any attention, you’d know that for the vast majority of AI/ML projects, the real value comes from the data.

                                                                                                                                          And if you’re considering a role with a startup that has grand AI plans, you’d better learn about the startup’s work culture – which includes its data culture.

                                                                                                                                          To help you, this post discusses the questions from the Culture section of my Data-to-AI Health Check for Startups. Let’s jump right into it.

                                                                                                                                          Q1: How often are people expected to work outside normal business hours (founders included)? Is unreasonable overtime compensated? “It’s a startup” isn’t a valid excuse for constant overwork. Most successful startups require a sustained effort over many years, and working at capacity reduces productivity over time. That said, high-effort spikes are inevitable – but unusual efforts should be recognised and compensated.

Q2: Do people go on leave regularly (founders included)? This probes a cultural norm around overwork similar to the one in Q1. Stay away from places where people never go on leave. It leads to burnout and collective stupidity: Knowledge workers need downtime to take a step back and come up with new creative ideas. Humans are not AIs.

                                                                                                                                          Q3: How do employees view the leadership team and founders? A small startup won’t have significant quantitative data on employee views. Even at larger companies, employee surveys are often designed and administered in a way that masks problems. If you’re considering a role with a startup, ask to speak with current employees to learn about their views on the culture, founders, and the company’s prospects. If the company is established, sites like Glassdoor and Blind can help you probe issues beyond the current employee base. In any case, remember that there’s an inherent selection bias in only questioning current staff, which is why it’s worth learning about former employees and ex-founders.

                                                                                                                                          Q4: How are wins celebrated? How are failures and mistakes analysed? The unfortunate reality is that many startup founders have little experience running or working at a startup. Therefore, they may not appreciate the need to celebrate wins or to take time to learn from failures. Rather than asking about general rituals, you could ask for examples: What did you do after the last big release? What did you learn from the latest outage? How will you mitigate similar outages?

Q5: How is excellent/poor individual performance evaluated and handled? If you’ve worked anywhere in any capacity, you’d know that the following is true: (1) underappreciated excellent employees may leave; and (2) poor performers may drag an entire team down. Before you join a growing startup, it pays to check that the founders have put some thought into performance management – especially if excellence is one of your core values.

                                                                                                                                          Q6: Does the company run data-informed experiments (like A/B tests)? If so, what is considered a successful experiment? For example, what happens if a well-run experiment produces results that contradict the CEO’s opinion? Finally, a question that directly addresses data culture! If you are considering a data role, a culture of intelligent experimentation is a positive sign that the startup is right for you. The correct definition of a successful experiment is “an experiment that taught us something new”. The common answer of “an experiment that confirmed our preconceived notions” (or in A/B testing terms: “an experiment where we shipped the test variation”) is absolutely wrong.
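
To make the “what did we learn” framing concrete, here’s a minimal sketch of summarising an A/B test with a simple Beta-Binomial model. The counts, prior, and variable names are made up for illustration – this isn’t part of the health check itself, just one way a data-informed team might report an experiment regardless of whether the variant ships.

```python
import numpy as np

# Hypothetical A/B test counts – not from any real experiment.
control_conversions, control_visitors = 210, 5_000
variant_conversions, variant_visitors = 245, 5_000

rng = np.random.default_rng(42)
n_samples = 100_000

# Beta(1, 1) prior updated with observed successes and failures.
control_posterior = rng.beta(
    1 + control_conversions, 1 + control_visitors - control_conversions, size=n_samples
)
variant_posterior = rng.beta(
    1 + variant_conversions, 1 + variant_visitors - variant_conversions, size=n_samples
)

prob_variant_better = (variant_posterior > control_posterior).mean()
expected_relative_lift = (variant_posterior / control_posterior - 1).mean()

print(f"P(variant beats control): {prob_variant_better:.1%}")
print(f"Expected relative lift: {expected_relative_lift:.1%}")
# A "successful" experiment is one that teaches you something,
# even if the decision is to keep the control.
```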

                                                                                                                                          Q7: Do leaders at the company explicitly seek truthful data, even when the truth may expose their mistakes? As with Q4, this may be best probed by asking leaders for examples of cases where they uncovered data that proved them wrong. Startups that harbour a culture of hiding from bad news are best avoided by excellent data people. In my experience and based on countless stories by friends, avoidance of bad news and truthful data becomes more common as companies grow. Great startup leaders care about the success of their business and know that hiding from the truth isn’t going to make it disappear.

                                                                                                                                          Q8: How is uncertainty quantified and communicated? How does it affect decisions? Common sources of uncertainty include sampling biases and missing or wrong data. Marketers are especially notorious for ignoring uncertainty for the sake of memorability (“nine out of ten doctors agree…”). But ignoring uncertainty has long been a way of getting data driven off a cliff. This is at the core of why I recommend the Calling Bullshit book and course to any aspiring data professional. Don’t work with startups that exhibit bullshit failure modes and ignore uncertainty – unless you have the mandate to shape the data culture for the better.
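
As one small illustration of quantifying and communicating uncertainty, the sketch below reports a bootstrap interval alongside a point estimate. The order values are simulated purely for the example and don’t come from the post – the point is that an interval says how much a metric could move due to sampling noise alone.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical per-customer order values – replace with real data.
order_values = rng.lognormal(mean=3.5, sigma=0.8, size=400)

# Percentile bootstrap: resample with replacement and recompute the mean.
boot_means = np.array([
    rng.choice(order_values, size=order_values.size, replace=True).mean()
    for _ in range(10_000)
])
low, high = np.percentile(boot_means, [2.5, 97.5])

print(f"Mean order value: {order_values.mean():.1f}")
print(f"95% bootstrap interval: [{low:.1f}, {high:.1f}]")
# Reporting the interval, not just the point estimate, communicates
# the uncertainty that decision makers should account for.
```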

                                                                                                                                          Data-to-AI health beyond culture

                                                                                                                                          This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                          You can download a guide containing all the questions as a PDF. The next area I’ll cover is Processes & Project Management – aspects of delivery that are more formal than the somewhat-intangible Culture. Feedback is always welcome!


\ No newline at end of file
diff --git a/2024/05/27/plumbing-decisions-and-automation-de-hyping-data-and-ai/index.html b/2024/05/27/plumbing-decisions-and-automation-de-hyping-data-and-ai/index.html
index 9880f51c1..82deadbdd 100644
--- a/2024/05/27/plumbing-decisions-and-automation-de-hyping-data-and-ai/index.html
+++ b/2024/05/27/plumbing-decisions-and-automation-de-hyping-data-and-ai/index.html
@@ -1,9 +1,11 @@
Plumbing, Decisions, and Automation: De-hyping Data & AI | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                                            Plumbing, Decisions, and Automation: De-hyping Data & AI

                                                                                                                                            contrasting an amateur and a professional otter; the amateur asks about tools, the professional asks about plumbing, decisions, and automation

                                                                                                                                            Data & AI health is hard to define. Recently, it occurred to me that its essence can be distilled with three questions:

                                                                                                                                            1. Plumbing: What’s the state of your data engineering lifecycles?
                                                                                                                                            2. Decisions: How do you use descriptive, predictive, and causal modelling to support decisions?
                                                                                                                                            3. Automation: How do you use AI to automate processes?

                                                                                                                                            These questions help identify gaps and opportunities. While each question focuses on the present state, it’s natural to follow up with plans for a brighter future.

                                                                                                                                            In practice, you would go deep on each area. Each question is a door that leads to a corridor with many more doors.

                                                                                                                                            Amateurs versus professionals

                                                                                                                                            If you’ve ever worked with data, you’d have a sense of what amateur and professional answers to the above questions may look like. In practice, answers are multifaceted and fall on a continuum. But here are some simplified examples from each end of the continuum:

           | Amateur                                                                                                   | Professional
Plumbing   | Rudimentary pipelines, manually-populated spreadsheets                                                    | All necessary data is trustworthy and available on tap
Decisions  | Relying on one-off charts and models, along with the intuition of HiPPOs (highest-paid persons’ opinions) | Relying on relevant data and modelling efforts that are proportional to the gravity of each decision
Automation | Superficial use of off-the-shelf tools                                                                    | Deep, mindful integration of tech to replace manual work where it delivers the most value

                                                                                                                                            Going down the rabbit hole

                                                                                                                                            The three areas pretty much define my career, but there is always much more to learn. The main message of this post is that little has changed since Harrington Emerson uttered these words in 1911:

                                                                                                                                            As to methods, there may be a million and then some, but principles are few. The person who grasps principles can successfully select their own methods. The person who tries methods, ignoring principles, is sure to have trouble.

                                                                                                                                            (OK, one thing did change – Emerson used man rather than person, but I fixed it for him.)

                                                                                                                                            You can explore further with these posts:

                                                                                                                                            1. Plumbing: Fully understanding the data engineering lifecycle is more important than mastering a single tool.
                                                                                                                                            2. Decisions: According to my 2018 definition, this is what data science is all about. There’s endless depth to building descriptive, predictive, and causal models. But the key to rising above tool hype is understanding the why of data science, which is to support decisions.
                                                                                                                                            3. Automation: The term AI is around peak hype right now. This makes it easy for cynics to dismiss the over-excited claims of AI proponents. Avoid cynicism – simply think of AI as automation and understand that relentless but mindful automation is key to success in our world.

                                                                                                                                            More questions to probe the Data-to-AI health of startups

                                                                                                                                            This post is a slight detour from the series on my Data-to-AI Health Check for Startups. I figured it’s a valuable detour since I now see the triad of Plumbing, Decisions, and Automation as the essence of Data & AI health for any organisation.

                                                                                                                                            Previous posts in the series:

                                                                                                                                            You can download a guide containing all the questions as a PDF. I’m still planning to cover Processes & Project Management next – hopefully I won’t get detoured again. Feedback is always welcome!


\ No newline at end of file
diff --git a/2024/06/03/how-to-avoid-startups-with-poor-development-processes/index.html b/2024/06/03/how-to-avoid-startups-with-poor-development-processes/index.html
index 53aeebb42..f5610e8ff 100644
--- a/2024/06/03/how-to-avoid-startups-with-poor-development-processes/index.html
+++ b/2024/06/03/how-to-avoid-startups-with-poor-development-processes/index.html
@@ -1,9 +1,11 @@
How to avoid startups with poor development processes | Yanir Seroussi | Data & AI for Startup Impact

                                                                                                                                              How to avoid startups with poor development processes

                                                                                                                                              minimalist image of the phrase RERO: release early, release often

                                                                                                                                              Avoid those who release late, release seldom, and don’t listen to their customers.

                                                                                                                                              Many founders have never worked at a startup. This may make them oblivious to failure modes that arise from poor development processes. With poor processes, even the most brilliant people are ineffective, as they’re constantly wasting time and fighting fires.

                                                                                                                                              You don’t want to join a startup that’s mired in intractable chaos. To avoid such a place, ask questions from the Processes & Project Management section of my Data-to-AI Health Check for Startups. Do this even if you’re a data scientist or a junior engineer – because everyone suffers in an environment with poor processes.

                                                                                                                                              This post briefly explains each question, along with expected answers and suggestions for eliciting informative responses.

                                                                                                                                              The questions

                                                                                                                                              Q1: How often are changes shipped to production? Even in 2024, there are software startups that don’t follow the RERO philosophy: “Release early. Release often. And listen to your customers.” Sporadic releases are often indicative of poor processes. Stay away from such startups, unless you have a mandate to improve things.

Q2: How is the impact of new features quantified before and after their release? Another classic quote in the spirit of RERO is the first principle behind the Agile Manifesto: “Our highest priority is to satisfy the customer through early and continuous delivery of valuable software.” And you can’t know which software is valuable if you don’t collect data from your users and aggregate it meaningfully. Helping with this is likely to be your responsibility if you’re the first data hire – ensure that the founders understand that.
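
As a rough illustration of before/after quantification, here’s a minimal sketch that aggregates a hypothetical events table around an assumed release date. The table, column names, and dates are invented for the example; a naive comparison like this can be confounded by seasonality and concurrent changes, so treat it as a starting point rather than a definitive measurement approach.

```python
import pandas as pd

# Hypothetical product events table – names and values are illustrative only.
events = pd.DataFrame({
    "user_id": [1, 2, 3, 1, 2, 4, 5, 3],
    "event": ["checkout"] * 8,
    "timestamp": pd.to_datetime([
        "2024-05-01", "2024-05-02", "2024-05-03", "2024-05-07",
        "2024-05-10", "2024-05-11", "2024-05-12", "2024-05-14",
    ]),
})

release_date = pd.Timestamp("2024-05-08")  # assumed feature release date
events["period"] = events["timestamp"].lt(release_date).map(
    {True: "before release", False: "after release"}
)

# Compare simple usage metrics before and after the release.
summary = events.groupby("period").agg(
    checkouts=("event", "count"),
    active_users=("user_id", "nunique"),
)
print(summary)
```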

Q3: What processes and systems are in place to collect qualitative and quantitative feedback from users (both internal and external)? This takes a more holistic view of user feedback and includes internal users. A common failure mode for internal-facing data teams is spending too much time satisfying low-value requests, e.g., building a dashboard that is barely used. Given the opportunity cost, it’s important to collect truthful feedback from all users.

                                                                                                                                              Q4: Were there any outages in recent months? How did they affect users? What changes did you implement to reduce the chance of outages and impact on users? Some outages are unavoidable, especially early on. If the answers reveal that there are repeated issues that go unaddressed, the team may be too busy fighting fires rather than shipping valuable software. This applies to internal systems as well, e.g., data ingestion pipelines that keep breaking shouldn’t be seen as normal.

                                                                                                                                              Q5: What system do you use for prioritising and tracking work across the company? This includes project management tools, and their use in practice – but any tool can be misused. The important things to look for are that: (1) a system exists; and (2) the system will improve over time.

                                                                                                                                              Q6: How do you balance paying down tech debt and shipping new features? Tech debt is unavoidable, especially in a fast-growing startup. You’re looking for an acknowledgement of this fact, along with an understanding that accruing too much tech debt reduces the ability to ship new features. While the concept comes from software engineering, it also applies to product analytics: if the data is a tangled mess and pipelines are unreliable (i.e., data tech debt is high), then analytics can’t be trusted to help improve the product.

                                                                                                                                              Q7: What proportion of the engineering and data staff time is spent on: (1) dealing with bugs and incidents; (2) meetings and admin overheads; and (3) shipping new features? This question takes Q5 & Q6 from philosophy to practice – “don’t tell me your priorities; show me your calendar”. If individual contributors don’t spend most of their time shipping new features (including research into the necessity of these features), there may be too much tech debt or too many overheads.

                                                                                                                                              Q8: What are the key team rituals (e.g., recurring meetings, sprint planning, demos, postmortems, standups)? Again, the calendar is a source of insights. For early-stage startups, awareness of the need for rituals may be low, and they may be implemented poorly. For example, I believe that daily synchronous standups are a bad idea for remote teams. A better approach is using a bot that asks everyone for their progress, upcoming tasks, and blockers – and following up to ensure that everyone is on track as a team.

                                                                                                                                              Q9: What processes are in place for code, design, and architecture reviews? The depth and time of each process should be proportional to the magnitude and impact of the change. For example, a trivial code improvement in a product that barely has any users can be shipped with a post-commit review. In contrast, migrating a live product to a different database system requires deeper consultation.

                                                                                                                                              Q10: Are there different processes for internal-facing data products (e.g., custom admin dashboards)? The unfortunate reality of many organisations is that data analytics and software engineering live in different silos. This can easily happen at small startups as well, with a single analyst working in isolation from product teams. I believe that the ideal situation is that internal-facing data products are treated like the software products that they are. They may not need to be as pretty as external-facing products, but their development should follow processes that ensure high quality, trustworthiness, and satisfaction of user needs.

                                                                                                                                              Q11: Give an example of how the above items manifest in a big project that was recently completed. It’s often hard to speak of abstract processes. An example of a big project may be the best way of moving from the abstract to the everyday reality of the startup.

                                                                                                                                              Q12: Are there any gaps in the current processes or changes you’d like to introduce? This probes for a growth mindset. If the answer is no, there’s probably something wrong.

                                                                                                                                              What if you can’t fit in all the questions?

                                                                                                                                              If you’re simply trying to avoid a dysfunctional startup rather than dive deep into development processes, a subset of the questions will suffice:

                                                                                                                                              • Q1 + Q2: Ensure that products are continuously improving based on user feedback.
                                                                                                                                              • Q7: Ensure that time spent by current staff aligns with how you want to spend your time.
                                                                                                                                              • Q11 + Q12: Ensure that leaders are conscious of the need to implement and improve processes.

                                                                                                                                              Data-to-AI health beyond processes

                                                                                                                                              This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                              You can download a guide containing all the questions as a PDF. The next area of the health check is Data (finally!). Feedback is always welcome!

                                                                                                                                              Subscribe +

                                                                                                                                              How to avoid startups with poor development processes

                                                                                                                                              minimalist image of the phrase RERO: release early, release often

                                                                                                                                              Avoid those who release late, release seldom, and don’t listen to their customers.

                                                                                                                                              Many founders have never worked at a startup. This may make them oblivious to failure modes that arise from poor development processes. With poor processes, even the most brilliant people are ineffective, as they’re constantly wasting time and fighting fires.

                                                                                                                                              You don’t want to join a startup that’s mired in intractable chaos. To avoid such a place, ask questions from the Processes & Project Management section of my Data-to-AI Health Check for Startups. Do this even if you’re a data scientist or a junior engineer – because everyone suffers in an environment with poor processes.

                                                                                                                                              This post briefly explains each question, along with expected answers and suggestions for eliciting informative responses.

                                                                                                                                              The questions

                                                                                                                                              Q1: How often are changes shipped to production? Even in 2024, there are software startups that don’t follow the RERO philosophy: “Release early. Release often. And listen to your customers.” Sporadic releases are often indicative of poor processes. Stay away from such startups, unless you have a mandate to improve things.

                                                                                                                                              Q2: How is the impact of new features quantified before and after their release? Another classic quote in the spirit of RERO is the first principle behind the Agile Manifesto: “Our highest priority is to satisfy the customer through early and continuous delivery of valuable software.” And you can’t know what’s valuable software if you don’t collect data from your users and meaningfully aggregate it. Helping with this is likely to be your responsibility if you’re the first data hire – ensure that the founders understand that.
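
For instance, here’s a minimal sketch of a naive before/after comparison in Python, assuming product events land in a pandas DataFrame with illustrative columns user_id, event_name, and timestamp (a proper evaluation would use an experiment or another controlled comparison rather than a simple pre/post split):

import pandas as pd

def conversion_rate(events: pd.DataFrame) -> float:
    # Share of active users who triggered a "purchase" event.
    active_users = events["user_id"].nunique()
    purchasers = events.loc[events["event_name"] == "purchase", "user_id"].nunique()
    return purchasers / active_users if active_users else float("nan")

def before_after_impact(events: pd.DataFrame, release_ts: pd.Timestamp) -> dict:
    # Naive comparison of the metric before and after a release timestamp.
    before = events[events["timestamp"] < release_ts]
    after = events[events["timestamp"] >= release_ts]
    return {"before": conversion_rate(before), "after": conversion_rate(after)}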

                                                                                                                                              Q3: What processes and systems are in place to collect qualitative and quantitative feedback from users (both internal and external)? This takes a more holistic view of user feedback and includes internal users. A common failure mode for internal-facing data teams is to spend too much time satisfying low-value requests, e.g., build a dashboard that is barely used. Given the opportunity cost, it’s important to collect truthful feedback from all users.

                                                                                                                                              Q4: Were there any outages in recent months? How did they affect users? What changes did you implement to reduce the chance of outages and impact on users? Some outages are unavoidable, especially early on. If the answers reveal that there are repeated issues that go unaddressed, the team may be too busy fighting fires rather than shipping valuable software. This applies to internal systems as well, e.g., data ingestion pipelines that keep breaking shouldn’t be seen as normal.

Q5: What system do you use for prioritising and tracking work across the company? This includes project management tools and their use in practice – but any tool can be misused. The important things to look for are that (1) a system exists, and (2) the system will improve over time.

                                                                                                                                              Q6: How do you balance paying down tech debt and shipping new features? Tech debt is unavoidable, especially in a fast-growing startup. You’re looking for an acknowledgement of this fact, along with an understanding that accruing too much tech debt reduces the ability to ship new features. While the concept comes from software engineering, it also applies to product analytics: if the data is a tangled mess and pipelines are unreliable (i.e., data tech debt is high), then analytics can’t be trusted to help improve the product.

                                                                                                                                              Q7: What proportion of the engineering and data staff time is spent on: (1) dealing with bugs and incidents; (2) meetings and admin overheads; and (3) shipping new features? This question takes Q5 & Q6 from philosophy to practice – “don’t tell me your priorities; show me your calendar”. If individual contributors don’t spend most of their time shipping new features (including research into the necessity of these features), there may be too much tech debt or too many overheads.

                                                                                                                                              Q8: What are the key team rituals (e.g., recurring meetings, sprint planning, demos, postmortems, standups)? Again, the calendar is a source of insights. For early-stage startups, awareness of the need for rituals may be low, and they may be implemented poorly. For example, I believe that daily synchronous standups are a bad idea for remote teams. A better approach is using a bot that asks everyone for their progress, upcoming tasks, and blockers – and following up to ensure that everyone is on track as a team.
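
As a rough sketch of the bot idea (the channel name and question wording are hypothetical), something like this could be scheduled daily using Slack’s Web API via the slack_sdk package:

import os
from slack_sdk import WebClient

QUESTIONS = [
    "What did you get done since your last update?",
    "What are you working on next?",
    "Is anything blocking you?",
]

def post_standup_prompt(channel: str = "#team-standup") -> None:
    # Post the standup questions; team members reply asynchronously in a thread.
    client = WebClient(token=os.environ["SLACK_BOT_TOKEN"])
    text = "Async standup – please reply in a thread:\n" + "\n".join(
        f"{i}. {q}" for i, q in enumerate(QUESTIONS, start=1)
    )
    client.chat_postMessage(channel=channel, text=text)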

                                                                                                                                              Q9: What processes are in place for code, design, and architecture reviews? The depth and time of each process should be proportional to the magnitude and impact of the change. For example, a trivial code improvement in a product that barely has any users can be shipped with a post-commit review. In contrast, migrating a live product to a different database system requires deeper consultation.

                                                                                                                                              Q10: Are there different processes for internal-facing data products (e.g., custom admin dashboards)? The unfortunate reality of many organisations is that data analytics and software engineering live in different silos. This can easily happen at small startups as well, with a single analyst working in isolation from product teams. I believe that the ideal situation is that internal-facing data products are treated like the software products that they are. They may not need to be as pretty as external-facing products, but their development should follow processes that ensure high quality, trustworthiness, and satisfaction of user needs.

                                                                                                                                              Q11: Give an example of how the above items manifest in a big project that was recently completed. It’s often hard to speak of abstract processes. An example of a big project may be the best way of moving from the abstract to the everyday reality of the startup.

                                                                                                                                              Q12: Are there any gaps in the current processes or changes you’d like to introduce? This probes for a growth mindset. If the answer is no, there’s probably something wrong.

                                                                                                                                              What if you can’t fit in all the questions?

                                                                                                                                              If you’re simply trying to avoid a dysfunctional startup rather than dive deep into development processes, a subset of the questions will suffice:

                                                                                                                                              • Q1 + Q2: Ensure that products are continuously improving based on user feedback.
                                                                                                                                              • Q7: Ensure that time spent by current staff aligns with how you want to spend your time.
                                                                                                                                              • Q11 + Q12: Ensure that leaders are conscious of the need to implement and improve processes.

                                                                                                                                              Data-to-AI health beyond processes

                                                                                                                                              This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                              You can download a guide containing all the questions as a PDF. The next area of the health check is Data (finally!). Feedback is always welcome!


                                                                                                                                                \ No newline at end of file diff --git a/2024/06/10/startup-data-health-starts-with-healthy-event-tracking/index.html b/2024/06/10/startup-data-health-starts-with-healthy-event-tracking/index.html index 7a0b618c9..223a20cbb 100644 --- a/2024/06/10/startup-data-health-starts-with-healthy-event-tracking/index.html +++ b/2024/06/10/startup-data-health-starts-with-healthy-event-tracking/index.html @@ -1,5 +1,5 @@ Startup data health starts with healthy event tracking | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                                                                Startup data health starts with healthy event tracking

                                                                                                                                                Kafka-inspired startup data pipelines

                                                                                                                                                The first question in the Data section of my Data-to-AI Health Check for Startups is:

                                                                                                                                                Do you track Kukuyeva’s five business aspects as wide events?

                                                                                                                                                Whole books can be written about this question, so I figured it’s worth its own post. Let’s dive in.

                                                                                                                                                Why track and when?

                                                                                                                                                In my past decade of work with startups and scaleups, one thing has remained a constant: There are always gaps in the data.

                                                                                                                                                Data gaps may take the form of missing data or low-quality data (from partial to deceptively wrong). The feasibility of any AI/ML automation or data-informed decisions depends on the nature of the data gaps.

                                                                                                                                                Another important feature of data tracking is: You can’t go back in time to collect or fix most proprietary data. In other words, historical gaps in the data are there to stay.

                                                                                                                                                Therefore, the answers to the questions from the heading are:

                                                                                                                                                1. Why track? To make data-informed decisions, and support automation and optimisation of decision-making via AI/ML. The latter comes after the former, as you need to turn your data into trustworthy business metrics before you use algorithms to optimise for those metrics.
                                                                                                                                                2. When do you start tracking? As soon as you have interesting data to track, along with the commitment to ensuring that you’re not just tracking garbage. Typically, you should start once you move from a throwaway prototype to a product that has user traction.

                                                                                                                                                What are wide events?

                                                                                                                                                An event is essentially a timestamped key-value document. Wide events encourage liberal use of attributes to support downstream exploration.

                                                                                                                                                Here’s an example from Ivan Burmistrov’s post on the topic, of a wide event for ad impressions at Meta:

                                                                                                                                                {
                                                                                                                                                     "Timestamp": "1707951423",
                                                                                                                                                     "AdId": "542508c92f6f47c2916691d6e8551279”,
                                                                                                                                                     "UserCountry": "US",
                                                                                                                                                @@ -14,7 +14,9 @@
                                                                                                                                                 


                                                                                                                                                  \ No newline at end of file diff --git a/2024/06/17/ai-aint-gonna-save-you-from-bad-data/index.html b/2024/06/17/ai-aint-gonna-save-you-from-bad-data/index.html index 7ee937309..3e138d8d4 100644 --- a/2024/06/17/ai-aint-gonna-save-you-from-bad-data/index.html +++ b/2024/06/17/ai-aint-gonna-save-you-from-bad-data/index.html @@ -1,9 +1,11 @@ AI ain't gonna save you from bad data | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                                                                  AI ain't gonna save you from bad data

                                                                                                                                                  bad data monster with a helpless robot

                                                                                                                                                  Now that we have generative AI, we no longer need to worry about data, right? Well, we’re not quite there yet. On their own, ChatGPT, Gemini, and their friends can’t save us from bad decisions around data collection and modelling, or from poorly-designed metrics.

                                                                                                                                                  While we wait for better AI agents to replace data scientists and engineers, I propose we ask a standard set of six questions about the data health of any project. These questions come from my Data-to-AI Health Check for Startups, but they apply anywhere. You can use them as a starting point when joining a new initiative, or to assess the state of an existing project.

                                                                                                                                                  Before we start, note that the goal is to identify gaps as opportunities for improvement. It’s easy to see data issues as insurmountable, but despair isn’t a viable data strategy. Aim to adopt Stockdale-style optimism when dealing with data:

                                                                                                                                                  You must never confuse faith that you will prevail in the end – which you can never afford to lose – with the discipline to confront the most brutal facts of your current reality, whatever they might be.

                                                                                                                                                  As with other posts in the health check series, this post provides a brief explanation for every question.

                                                                                                                                                  Let’s jump in.

                                                                                                                                                  The questions

                                                                                                                                                  Q1: Do you track Kukuyeva’s five business aspects as wide events?

                                                                                                                                                  This question is foundational, as inadequate instrumentation often makes data-informed decisions impossible. Given its importance, I wrote a separate post on this question.

                                                                                                                                                  The short story is that you need event-based tracking and event data modelling of key aspects of the business and product. Events are essentially timestamped mappings. For example, a customer purchase is an event that should be logged along with metadata on the customer and platform at the time of purchase.
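
As an illustration, logging such a purchase as a single wide event might look like this in Python (the attribute names are hypothetical – the point is one timestamped, attribute-rich record rather than many narrow counters):

import json
import time

def log_purchase_event(order: dict, customer: dict, platform: dict, emit=print) -> None:
    # Emit one wide event per purchase; in practice, send it to your event pipeline.
    event = {
        "timestamp": int(time.time()),
        "event_name": "purchase",
        "order_id": order["id"],
        "order_total": order["total"],
        "currency": order["currency"],
        "customer_id": customer["id"],
        "customer_plan": customer.get("plan"),
        "customer_country": customer.get("country"),
        "platform_os": platform.get("os"),
        "app_version": platform.get("app_version"),
    }
    emit(json.dumps(event))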

                                                                                                                                                  Q2: Is there data you need that isn’t collected or is inaccessible? What is stopping you from obtaining it?

                                                                                                                                                  While Q1 covers high-level event tracking, Q2 brings it down to specific needs.

                                                                                                                                                  Sometimes, data can’t be collected due to practical or legal reasons. In some cases, other data can serve as a proxy. For example, companies are typically interested in measuring “customer satisfaction”, but asking directly about satisfaction suffers from a host of problems (e.g., not everyone responds, and the timing and phrasing of questions influence answers). Instead, customer satisfaction can be partly inferred from behaviour like repeat purchases – but that’s also not perfect.
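
For example, a repeat-purchase proxy can be computed in a few lines, assuming an orders table with illustrative customer_id and order_id columns:

import pandas as pd

def repeat_purchase_rate(orders: pd.DataFrame) -> float:
    # Share of customers with more than one order – an imperfect satisfaction proxy.
    orders_per_customer = orders.groupby("customer_id")["order_id"].nunique()
    return float((orders_per_customer > 1).mean())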

                                                                                                                                                  In any case, seeking perfection in data is a recipe for disappointment. You should start with business needs and find the data to best address them within a reasonable timeframe.

                                                                                                                                                  Q3: On a scale of 1 to 5, rate the quality of your key datasets. If you’re unsure due to limited observability and quality checks, it’s a 1.

                                                                                                                                                  This question should be answered by those who are closest to the data: usually data engineers, scientists, analysts, or one of the dozens of other titles data specialists go by. An experienced data specialist would have a subjective sense of data quality, so it’s worth agreeing on quality definitions if the rating is done by multiple people. Generally, a dataset is of high quality if it’s fit for its intended uses in decisions and automation.
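
To make the rating less subjective, it can be backed by simple checks. Here’s a minimal sketch (the column name and checks are illustrative, not a full observability setup):

import pandas as pd

def basic_quality_checks(df: pd.DataFrame, timestamp_col: str = "timestamp") -> dict:
    # A handful of signals that can inform a 1-5 quality rating.
    return {
        "row_count": len(df),
        "duplicate_rows": int(df.duplicated().sum()),
        "max_null_rate": float(df.isna().mean().max()),
        "latest_record": str(df[timestamp_col].max()),  # staleness signal
    }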

                                                                                                                                                  Again, absolute perfection is impossible: Data is a model of the world, and all models are wrong, but some are useful.

                                                                                                                                                  Q4: On a scale of 1 to 5, what is the confidence of stakeholders in the data and metrics that are used to make business decisions? Explain why.

                                                                                                                                                  Low data quality often leads to low trust and confidence in the outputs of data specialists. However, that’s not always the case. Sometimes, stakeholders may have high confidence in metrics because they’re unaware of underlying data issues. In other cases, confidence is low due to historical reasons: Trust takes time to build – it is a trailing indicator of consistently making and keeping promises. Data specialists are sometimes enamoured with fancy tech and tools, neglecting simple wins that are at the foundation of data’s hierarchy of needs. After decades of hype (from big data through data science to AI agents), I can see why many people treat anything that falls under Data & AI with suspicion. If you’re a data specialist, serving your customers with relevant trustworthy data can be a rare delight.

                                                                                                                                                  Q5: If you are currently using advanced AI/ML, do you have all the data you need for the models to perform as accurately as required?

                                                                                                                                                  By advanced AI/ML, I mean fine-tuning or building machine learning models from scratch. This is distinct from basic AI/ML, which relies on third-party models as black boxes. For example, calling a vision API to extract text from images is basic AI/ML. Training a model on your proprietary image data is advanced AI/ML. The latter requires data of sufficient quality for model accuracy, where sufficient accuracy depends on the application.
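
To make the distinction concrete, here’s a sketch in Python (the specific libraries are illustrative stand-ins, not the tools implied above):

import pytesseract
from PIL import Image
from sklearn.linear_model import LogisticRegression

def extract_text(image_path: str) -> str:
    # Basic AI/ML: use a third-party OCR model as a black box.
    return pytesseract.image_to_string(Image.open(image_path))

def train_classifier(features, labels) -> LogisticRegression:
    # Advanced AI/ML: train your own model on proprietary, labelled data.
    return LogisticRegression(max_iter=1000).fit(features, labels)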

                                                                                                                                                  Implicit in this question are satisfactory answers to Q1-Q3: You need to be tracking advanced AI/ML performance in the context where it’s used (Q1), have access to the data you need (Q2), and ensure that data quality is fit for advanced AI/ML (Q3). Advanced AI/ML is hard to do well, and failure can erode stakeholder trust (Q4). However, “failure” depends on expectations – AI/ML models are probabilistic, so setting the right expectations is key. For example, as ChatGPT has shown, it’s possible to build a useful consumer product on top of an AI/ML model that is often wrong.

                                                                                                                                                  Q6: If you are planning new advanced AI/ML projects, do you have all the data you need for them? If not, what is the effort required to obtain the data? Is it time-sensitive (e.g., ingesting a public dataset is less time-sensitive than starting to collect timestamped proprietary data)?

                                                                                                                                                  This is the future-oriented version of Q5. It’s best to think of data and metrics before kicking off advanced AI/ML projects. Further, it’s better to start without AI/ML than over-complicate things early on.

                                                                                                                                                  In short, user needs should inform project decisions on what to build. These decisions and plans then inform data collection. Don’t make the common mistake of starting with shiny tech as the proverbial hammer that’s looking for nail-shaped problems.

                                                                                                                                                  Don’t forget the opportunities!

                                                                                                                                                  Data is much like solar energy: It exists even if you don’t capture it, and most of it bounces back to space unused. Harnessed wisely, it can power business decisions and become a differentiator for your product.

                                                                                                                                                  However, when working closely with data, it’s easy to feel despair due to the never-ending stream of quality issues and stakeholder requests. I’ve felt this despair myself many times.

                                                                                                                                                  For me, the cure for data despair comes from shifting focus from gaps to opportunities:

                                                                                                                                                  1. Map the current state of data, including key gaps.
                                                                                                                                                  2. Learn about relevant business opportunities from internal/external customers and industry peers.
                                                                                                                                                  3. Create a plan to incrementally improve the state of data, and seize opportunities starting with the lowest-hanging fruit.
                                                                                                                                                  4. Execute the plan.
                                                                                                                                                  5. Repeat steps 1-4 periodically.

                                                                                                                                                  Data-to-AI health beyond abstract data

                                                                                                                                                  This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                                  You can download a guide containing all the questions as a PDF. Next, I’ll go into the questions from the Tech section, which are directly related to how the abstract Data questions manifest in practice. Feedback is always welcome!


                                                                                                                                                    \ No newline at end of file diff --git a/2024/06/24/is-your-tech-stack-ready-for-data-intensive-applications/index.html b/2024/06/24/is-your-tech-stack-ready-for-data-intensive-applications/index.html index e7be7bcb3..09493f705 100644 --- a/2024/06/24/is-your-tech-stack-ready-for-data-intensive-applications/index.html +++ b/2024/06/24/is-your-tech-stack-ready-for-data-intensive-applications/index.html @@ -1,9 +1,11 @@ Is your tech stack ready for data-intensive applications? | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                                                                    Is your tech stack ready for data-intensive applications?

                                                                                                                                                    a stack of computers, wires, and hay in an office area

                                                                                                                                                    Data-intensive projects fail when you treat them like traditional software projects. But they also fail when you don’t apply best practices from software engineering.

                                                                                                                                                    Why?

                                                                                                                                                    Because data-intensive systems are made of data, and also made of software. Therefore:

                                                                                                                                                    1. data changes can lead to failures; and
                                                                                                                                                    2. software changes can lead to failures.

                                                                                                                                                    In traditional software systems, you fully control the changes. Your software doesn’t change unexpectedly.

                                                                                                                                                    In data-intensive systems, you cede control to the data. The data changes constantly, and it affects the behaviour of your system.

To succeed, you need to manage both the data and the software aspects of your systems. Managing both well is the essence of the questions in the Tech section of my Data-to-AI Health Check for Startups. This post presents the questions, along with guidance on what constitutes healthy answers.

                                                                                                                                                    What do I mean by data intensity?

                                                                                                                                                    For the last few months, I have set my LinkedIn tagline to “helping startups ship data-intensive solutions (AI/ML for climate/nature tech)”. I landed on it after a bit of a struggle with succinctly defining exactly what it is I do.

                                                                                                                                                    The problem is that after over a decade of “data” roles, I don’t see the field of AI/ML (artificial intelligence and machine learning) as a sanctified sphere that’s separate from real-world data and humans. Further, while business intelligence (aka analytics) is seen by some as less “sexy” than AI/ML, I see it as a different lens of using data to drive business outcomes. Essentially, it all comes down to plumbing, decisions, and automation.

                                                                                                                                                    In the days of the Big Data hype, much attention was given to the three Vs of data: Volume, Velocity, and Variety – what flows through the plumbing. To me, data intensity goes beyond the three Vs. This is how I define it in the first section of my Data-to-AI Health Check:

                                                                                                                                                    High data intensity typically requires low-latency processing of large volumes of data with more than one database server. With high intensity, data processing issues noticeably affect key business metrics.

                                                                                                                                                    That is, in data-intensive settings, data issues affect decisions and automation in a way that hurts the business.

                                                                                                                                                    A couple of examples may help:

• Low intensity: A dashboard that doesn’t contain any actionable metrics. If the metrics change due to bugs in the data processing, no decisions are affected.
                                                                                                                                                    • High intensity: An ad-serving platform that personalises ads in real time based on numerous data points. If any model or system breaks, millions of dollars may be lost.

                                                                                                                                                    In short, the higher the data intensity, the more the flow of data affects the bottom line.

                                                                                                                                                    Understanding tech stacks and lifecycles

                                                                                                                                                    At 15 questions, the Tech section of my Data-to-AI Health Check for Startups is long and deep. To keep this post digestible, I won’t go into every question. Instead, I’ve grouped the questions by theme.

                                                                                                                                                    First up, on the tech stacks and lifecycles:

                                                                                                                                                    • Q1: Provide an architecture diagram for your tech systems (product and data stacks), including first-party and third-party tools and databases. If a diagram doesn’t exist, an ad hoc drawing would work as well.
• Q2: Zooming in on data stacks, what tools and pipelines do you use for the data engineering lifecycles (generation, storage, ingestion, transformation, and serving), and downstream uses (analytics, AI/ML, and reverse ETL)? A toy sketch of these stages follows this list.
                                                                                                                                                    • Q3: Zooming in further on the downstream uses of analytics and AI/ML, what systems, processes, and tools do you use to manage their lifecycles (discovery, data preparation, model engineering, deployment, monitoring, and maintenance)? Give specific project examples.
                                                                                                                                                    • Q4: Are there any tech choices you regret? Why?
                                                                                                                                                    • Q5: Are there any new tools you want to introduce to your stack? Why?
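
For readers unfamiliar with the lifecycle terms in Q2, here’s a toy sketch of ingestion, storage, transformation, and serving as plain Python over SQLite. It’s purely illustrative: generation is assumed to happen upstream in the product, all names are made up, and a real stack would use dedicated tools (a warehouse, an orchestrator, dbt-style transformation management) rather than a single script.

# pipeline_sketch.py – a toy rendering of the data engineering lifecycle
# stages named in Q2. Illustrative only; not a recommended architecture.
import json
import sqlite3


def ingest(raw_events_path: str, con: sqlite3.Connection) -> None:
    """Ingestion and storage: load raw product events into a local store."""
    with open(raw_events_path) as f:
        events = [json.loads(line) for line in f]
    con.execute("CREATE TABLE IF NOT EXISTS events (user_id TEXT, amount REAL)")
    con.executemany("INSERT INTO events VALUES (:user_id, :amount)", events)


def transform(con: sqlite3.Connection) -> None:
    """Transformation: derive an analytics-friendly table from raw events."""
    con.execute("DROP TABLE IF EXISTS revenue_by_user")
    con.execute(
        "CREATE TABLE revenue_by_user AS "
        "SELECT user_id, SUM(amount) AS revenue FROM events GROUP BY user_id"
    )


def serve(con: sqlite3.Connection) -> list[tuple]:
    """Serving: expose the transformed table to a dashboard, model, or reverse ETL job."""
    return con.execute(
        "SELECT user_id, revenue FROM revenue_by_user ORDER BY revenue DESC"
    ).fetchall()


if __name__ == "__main__":
    with open("events.jsonl", "w") as f:
        f.write('{"user_id": "u1", "amount": 10.0}\n{"user_id": "u1", "amount": 5.0}\n')
    con = sqlite3.connect(":memory:")
    ingest("events.jsonl", con)
    transform(con)
    print(serve(con))  # [('u1', 15.0)]

The point of Q2 is to find out which real tools play each of these roles in your stack, and where the seams between them are.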

                                                                                                                                                    To an extent, tech stacks and lifecycles follow the Anna Karenina principle: All healthy stacks are alike; each unhealthy stack is unhealthy in its own way.

                                                                                                                                                    By asking for their descriptions, I’m aiming to uncover gaps and opportunities.

Often, some gaps are known to the people in charge, but they haven’t been explicitly discussed. This is especially common in startups, where competing priorities and resource constraints require compromising on scope and quality to fuel growth. In addition, it’s impossible for small startups to have all the relevant experts on the founding team, so some best practices are skipped out of ignorance rather than as intentional compromises made to move fast. However, a lack of awareness of best practices often ends up slowing the startup down rather than speeding it up.

                                                                                                                                                    Two concrete examples:

                                                                                                                                                    • many people outside the data world are unaware of recent advances in tooling for management of data transformations (dbt and its competitors), and
                                                                                                                                                    • practitioners who’ve only built ML models in academia rarely appreciate the complexity of running ML in production (MLOps is much more than ML).

Beyond gaps that may be exposed by Q1-Q3, explicitly asking about regrettable and future tech choices (Q4 & Q5) helps surface evidence of an overreliance on unproven or exotic tech (aka wasted innovation tokens) and an underreliance on proven tech (aka reinvention of wheels). This is especially common with inexperienced operators who are too excited about playing with shiny tools. Use of unproven tech should be reserved for cases where it confers a competitive advantage (e.g., being first to market with the latest AI advances).

                                                                                                                                                    Basic quality assurance and delivery

                                                                                                                                                    The next set of questions covers what I consider to be the basics of quality assurance and continuous delivery:

                                                                                                                                                    • Q6: How do you test product code and infrastructure setup? How good is the coverage (formally – percentage of statements covered, and conceptually – confidence from 1 to 5 that tests capture faults prior to deployment)?
                                                                                                                                                    • Q7: Do all tests run automatically on every version of the code?
                                                                                                                                                    • Q8: Are deployments done as a single automated step (e.g., push new containers to production when the main branch is updated)?
                                                                                                                                                    • Q9: How faithful are development, testing, and staging environments to the production setup? Are there gaps that can be feasibly addressed? If so, what is stopping you from addressing them?

                                                                                                                                                    As I’m writing this in 2024, all the tooling exists to set things up with solid testing and deployment processes – and it’s constantly getting easier. The only place where such processes may be skipped is in throwaway prototypes, where testing unnecessarily slows things down.
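
To make Q6’s coverage numbers concrete, here’s a minimal sketch of a tested function whose statement coverage can be measured with pytest and pytest-cov. The function, file name, and threshold are illustrative assumptions, not part of the health check:

# test_pricing.py – a self-contained sketch of product code plus its tests.
import pytest


def apply_discount(price: float, discount_pct: float) -> float:
    """Return the discounted price, never going below zero."""
    if price < 0:
        raise ValueError("price must be non-negative")
    return max(price * (1 - discount_pct / 100), 0.0)


def test_apply_discount_caps_at_zero():
    # Even an oversized discount should not produce a negative price.
    assert apply_discount(price=10.0, discount_pct=150) == 0.0


def test_apply_discount_rejects_negative_price():
    with pytest.raises(ValueError):
        apply_discount(price=-1.0, discount_pct=10)

# Measure statement coverage and fail the run below an agreed threshold:
#   pytest test_pricing.py --cov=test_pricing --cov-fail-under=80

Q7 and Q8 then ask whether runs like this happen automatically on every change, and whether a green run flows to production without manual steps.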

Being a startup is also not an excuse. As Martin Fowler pointed out years ago, high internal software quality doesn’t come at a cost – it reduces cost. That is, by implementing solid systems and processes for automated testing and deployment, teams move faster. Teams that cut corners on internal quality may move faster in the very short term, but typically get overtaken by their higher-internal-quality counterparts within weeks.

                                                                                                                                                    No startup aims to be around only for a few weeks, so investing in internal quality is key to tech health.

                                                                                                                                                    In Fowler’s words:

                                                                                                                                                    • Neglecting internal quality leads to rapid build up of cruft
                                                                                                                                                    • This cruft slows down feature development
                                                                                                                                                    • Even a great team produces cruft, but by keeping internal quality high, is able to keep it under control
                                                                                                                                                    • High internal quality keeps cruft to a minimum, allowing a team to add features with less effort, time, and cost

Unfortunately, some software engineers never learn this lesson. Further, data professionals who don’t have a software background are even less likely to be exposed to the importance of internal quality and how it can be enforced.

                                                                                                                                                    That said, it’s never too late to learn and improve. This is key to avoiding failure modes that arise in data projects when best practices from software engineering aren’t applied.

                                                                                                                                                    Specific data-intensive failure modes

                                                                                                                                                    The next set of questions probes for failure modes that are specific to data-intensive work (data engineering, analytics, AI/ML, etc.):

• Q10: Do you apply the same testing and deployment standards to data as you do to product code? For example, is there untested SQL code hidden in dashboarding tools or the database layer, or is SQL treated like core product code (tracked in source control with isolated testing)?
                                                                                                                                                    • Q11: How are schema changes managed and tested in each data system?
                                                                                                                                                    • Q12: Do you rely on notebooks for production data code? If so, how do you ensure that notebook code meets the same quality standards as core product code (especially around testing and change management)?
                                                                                                                                                    • Q13: Do advanced AI/ML projects meet your performance expectations? If not, do you know how to improve performance without data changes?

                                                                                                                                                    Data-intensive work is essentially about building models with software:

                                                                                                                                                    • Raw data is a model of real-world entities and events, expressed in database schemas (even “schemaless” databases have a schema – it’s just unbounded).
                                                                                                                                                    • Dashboards present models of metrics that originate in raw data, with the goal of informing decisions.
                                                                                                                                                    • AI/ML models are essentially complex data transformations, e.g., from a matrix of pixels to a probability that the image modelled by the pixels is of a cat or a dog.

For historical and practical reasons, much of this modelling work is done by people with no training in software engineering. While the industry is maturing, Q10-Q13 often expose gaps. The ideal answer to each question is that all models are fully tested and managed – just like software, but with extra care for the complexity introduced by data.
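
As an illustration of the healthy end of Q10, here’s a minimal sketch of warehouse SQL kept alongside product code and exercised by an isolated test, with DuckDB standing in for the production warehouse. The table, column, and metric names are hypothetical:

# test_daily_revenue.py – a sketch of testing analytics SQL in isolation.
import datetime

import duckdb

# In a real repo this query would live in its own versioned .sql file (or a
# dbt model) rather than being pasted into a dashboarding tool.
DAILY_REVENUE_SQL = """
    SELECT order_date, SUM(amount) AS revenue
    FROM orders
    WHERE status = 'paid'
    GROUP BY order_date
"""


def test_daily_revenue_ignores_unpaid_orders():
    con = duckdb.connect()  # in-memory database, no external dependencies
    con.execute("CREATE TABLE orders (order_date DATE, amount DOUBLE, status VARCHAR)")
    con.execute(
        "INSERT INTO orders VALUES "
        "('2024-06-01', 100.0, 'paid'), ('2024-06-01', 50.0, 'refunded')"
    )
    rows = con.execute(DAILY_REVENUE_SQL).fetchall()
    assert rows == [(datetime.date(2024, 6, 1), 100.0)]

The same discipline applies to Q11 and Q12: schema changes and notebook-born code deserve the same review, versioning, and tests as anything else that ships.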

                                                                                                                                                    Maintaining long-term success

                                                                                                                                                    Finally, the last two questions cover monitoring and maintenance:

                                                                                                                                                    • Q14: On a scale of 1 to 5, how confident are you in detecting and addressing issues in production (including product, infra, data, and ML observability 1.0 & 2.0)? Do you have action plans to increase your level of confidence?
                                                                                                                                                    • Q15: What DevOps, DataOps, and MLOps practices do you follow that weren’t covered above? Are there known gaps and plans to address them?

                                                                                                                                                    Even if a data-intensive project is considered “done”, it still changes in production due to its dependence on data. The degree of likely change varies by project, but it needs to be actively managed for long-term success.
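
In the spirit of Q14, here’s a minimal sketch of the kind of scheduled data check that supports this active management. The metrics, thresholds, and alerting hook are illustrative assumptions; real setups would lean on observability tooling rather than hand-rolled scripts:

# drift_check.py – a deliberately simple production data check (sketch only).
from dataclasses import dataclass


@dataclass
class ColumnStats:
    null_rate: float  # fraction of rows where the column is null
    mean: float


def check_drift(
    current: ColumnStats,
    baseline: ColumnStats,
    max_null_rate: float = 0.05,
    max_mean_shift: float = 0.2,
) -> list[str]:
    """Return human-readable alerts; an empty list means the column looks healthy."""
    alerts = []
    if current.null_rate > max_null_rate:
        alerts.append(f"null rate {current.null_rate:.1%} exceeds {max_null_rate:.1%}")
    # Skip the relative check when the baseline mean is zero.
    if baseline.mean and abs(current.mean - baseline.mean) / abs(baseline.mean) > max_mean_shift:
        alerts.append(f"mean shifted by more than {max_mean_shift:.0%} versus the baseline")
    return alerts


if __name__ == "__main__":
    # Toy example: today's stats versus a trusted baseline snapshot.
    for alert in check_drift(ColumnStats(null_rate=0.12, mean=48.0),
                             ColumnStats(null_rate=0.01, mean=50.0)):
        print("ALERT:", alert)  # in production this would page someone or post to a channel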

                                                                                                                                                    Data-to-AI health beyond the tech

                                                                                                                                                    This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                                    You can download a guide containing all the questions as a PDF. Next, I’ll go into the questions from the Security & Compliance section. Feedback is always welcome!


                                                                                                                                                      Stay alert! Security is everyone's responsibility

                                                                                                                                                      a man putting on shoes to outrun other people (not the tiger)

                                                                                                                                                      Security. Trust. Compliance.

                                                                                                                                                      Have I lost you yet?

                                                                                                                                                      If you’re building a startup, you may find the above words boring and corporate. But security gaps can destroy your business.

                                                                                                                                                      While few startups have security experts on the founding team, everyone should have a basic level of security awareness these days.

I’m not a security expert myself, yet I’ve included a Security & Compliance section in my Data-to-AI Health Check for Startups. The questions from this section can’t replace a professional audit, as they only probe for common issues. However, it’s never a bad time to learn more and improve your security posture. This is the goal of the health check section and this post.

                                                                                                                                                      Throughout my career, I’ve spotted many security issues and have either addressed or escalated them – despite never having “security” as part of my formal job description. Still, I know I have a lot to learn about security – especially as the threat landscape keeps shifting.

                                                                                                                                                      If you’re a founder, you want to foster a culture where security issues are escalated and addressed promptly. Otherwise, you may end up with expensive data breaches that can destroy customer trust and your reputation. For example, the Optus data breach in 2022 occurred because of trivial mistakes. Meanwhile, Optus Glassdoor reviews highlight serious cultural issues that may have prevented such trivial mistakes from being escalated.

                                                                                                                                                      Unfortunately, such examples abound in the corporate world. But your startup can do better.

                                                                                                                                                      The rest of this post presents the questions from the Security & Compliance section, along with short explanations. If you think I’ve missed any critical questions, please let me know.

                                                                                                                                                      Basic hygiene for everyone

                                                                                                                                                      Any modern human should give an affirmative answer to the first three questions. Unfortunately, this still isn’t the case (as of 2024). Even if you don’t run a startup, you should practise basic hygiene and secure your digital life. It’s on the same level as brushing your teeth.

                                                                                                                                                      • Q1: Does everyone on the team use a password manager to generate unique long passwords and share credentials?
                                                                                                                                                      • Q2: Does everyone on the team use two-factor authentication on all key accounts?
                                                                                                                                                      • Q3: Is everyone on the team aware of phishing risks, including novel attacks that use AI for impersonation?

                                                                                                                                                      Basic hygiene for system admins

Early in a startup’s life, system administration is a hat, not a full-time role. This hat is often worn poorly: for example, it may seem easier to grant excessive access to speed up execution than to keep tight control on accounts and adhere to the principle of least privilege. However, it’s much easier to start with secure admin practices when there’s only a handful of people than to revoke access and disrupt the workflows of dozens of people later on.

                                                                                                                                                      You’d also sleep better at night if the answers to all the following questions imply that you’re securing things to the best of your abilities.

                                                                                                                                                      • Q4: Is the principle of least privilege enforced on all entities? That is, do human and non-human accounts have only the minimal privileges needed to do their job?
                                                                                                                                                      • Q5: Do you follow zero trust principles?
                                                                                                                                                      • Q6: Are all company devices encrypted? Are they kept up to date and protected from common threats like malware?
                                                                                                                                                      • Q7: Is any work done on non-company devices? If so, are they secured as well as company devices?

                                                                                                                                                      Least privilege is key. Last year, I participated in a reading group of the book Building Secure and Reliable Systems. The moderator – who has over 10 years of security experience with Google – referred to the chapter on least privilege as “the meat and potatoes” of the whole book. Ignore at your own risk.

                                                                                                                                                      Basic hygiene for developers

                                                                                                                                                      While system administration is an early-stage hat, tech startups typically have developers shipping software as their primary role. Unfortunately, basic security hygiene is frequently overlooked by developers. And even if developers know what they should do, they may cut corners under the pressure of deadlines.

                                                                                                                                                      Fortunately, early-stage startups are rarely an interesting target for hackers, so there’s time to improve your security posture around these four questions:

• Q8: Are all API keys stored appropriately (e.g., in a cloud provider’s secret store rather than in source code)? See the sketch after this list.
                                                                                                                                                      • Q9: Are developers aware of the OWASP top ten security risks?
                                                                                                                                                      • Q10: Do you keep up with vulnerability reports and promptly apply security patches to your product (including updates of third-party dependencies)?
                                                                                                                                                      • Q11: If you’re building with large language models, are you aware of the new class of threats they introduce (like prompt injections)? How do you mitigate those risks?
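
To make Q8 concrete, here’s a minimal sketch of the pattern it’s looking for: the key is injected into the environment by whatever secret manager your platform provides, and the code never contains the secret itself. The variable and function names are hypothetical:

# secrets_example.py – read an API key from the environment, never from source code.
import os


def get_payment_api_key() -> str:
    """Fetch the payment provider key injected at deploy time by the secret store."""
    key = os.environ.get("PAYMENT_API_KEY")  # hypothetical variable name
    if not key:
        raise RuntimeError(
            "PAYMENT_API_KEY is not set; configure it in your secret manager, "
            "not in source control."
        )
    return key

The anti-pattern Q8 is probing for is the opposite: a key pasted into a config file or notebook that ends up in the repository history.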

                                                                                                                                                      Confidence in current security posture

                                                                                                                                                      When it comes to security, what you don’t know can hurt you. Continuous learning on threats and best practices is key. However, it can’t replace professional audits and proactive improvement and monitoring of your systems. This is the focus of the next three questions:

                                                                                                                                                      • Q12: Were any security audits done by a third party?
                                                                                                                                                      • Q13: Were there any security incidents? How were they handled?
                                                                                                                                                      • Q14: Do you have the instrumentation in place to be confident that there isn’t a breach right now? How confident are you on a scale of 1 to 5?

                                                                                                                                                      Data-specific questions

                                                                                                                                                      The best protection from data breaches and compliance issues is to avoid collecting any sensitive data. However, as business value often comes from proprietary data, this isn’t a practical solution. Instead, keep the following questions in mind as you build and run your startup:

                                                                                                                                                      • Q15: Do you handle any sensitive data?
• Q16: Are you aware of, and do you comply with, all relevant legislation around the data you handle and retain?
• Q17: How do you secure data at rest and in flight? (A sketch of at-rest encryption follows this list.)
                                                                                                                                                      • Q18: What systems do you have for mitigating and recovering from data loss? How likely is a catastrophic data loss (e.g., reliant on one person or a single data centre)? How long will it take you to recover?
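
For the at-rest half of Q17, here’s a minimal sketch of symmetric encryption using the cryptography package’s Fernet recipe. Key management (which matters at least as much) and in-flight protection (typically TLS everywhere) are deliberately out of scope here:

# encrypt_at_rest.py – a sketch of encrypting a sensitive blob before storage.
from cryptography.fernet import Fernet

# In practice the key would come from a secret manager or KMS, not be
# generated ad hoc next to the data it protects.
key = Fernet.generate_key()
fernet = Fernet(key)

record = b'{"email": "jane@example.com", "plan": "pro"}'
token = fernet.encrypt(record)    # store this ciphertext, not the raw record
restored = fernet.decrypt(token)  # requires the same key
assert restored == record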

                                                                                                                                                      Data-to-AI health beyond security & compliance

                                                                                                                                                      This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                                      You can download a guide containing all the questions as a PDF. As this turned out to be an epic (and fun) project, I may also publish a post on distilling the top questions and running an efficient health check. Feedback is always welcome!

                                                                                                                                                      Subscribe +

                                                                                                                                                      Stay alert! Security is everyone's responsibility

                                                                                                                                                      a man putting on shoes to outrun other people (not the tiger)

                                                                                                                                                      Security. Trust. Compliance.

                                                                                                                                                      Have I lost you yet?

                                                                                                                                                      If you’re building a startup, you may find the above words boring and corporate. But security gaps can destroy your business.

                                                                                                                                                      While few startups have security experts on the founding team, everyone should have a basic level of security awareness these days.

                                                                                                                                                      I’m not a security expert myself, yet I’ve included a Security & Compliance section in my Data-to-AI Health Check for Startups. The questions from this section can’t replace a professional audit, as they only prod for common issues. However, it’s never a bad time to learn more and improve your security posture. This is the goal of the health check section and this post.

                                                                                                                                                      Throughout my career, I’ve spotted many security issues and have either addressed or escalated them – despite never having “security” as part of my formal job description. Still, I know I have a lot to learn about security – especially as the threat landscape keeps shifting.

                                                                                                                                                      If you’re a founder, you want to foster a culture where security issues are escalated and addressed promptly. Otherwise, you may end up with expensive data breaches that can destroy customer trust and your reputation. For example, the Optus data breach in 2022 occurred because of trivial mistakes. Meanwhile, Optus Glassdoor reviews highlight serious cultural issues that may have prevented such trivial mistakes from being escalated.

                                                                                                                                                      Unfortunately, such examples abound in the corporate world. But your startup can do better.

                                                                                                                                                      The rest of this post presents the questions from the Security & Compliance section, along with short explanations. If you think I’ve missed any critical questions, please let me know.

                                                                                                                                                      Basic hygiene for everyone

                                                                                                                                                      Any modern human should give an affirmative answer to the first three questions. Unfortunately, this still isn’t the case (as of 2024). Even if you don’t run a startup, you should practise basic hygiene and secure your digital life. It’s on the same level as brushing your teeth.

                                                                                                                                                      • Q1: Does everyone on the team use a password manager to generate unique long passwords and share credentials?
                                                                                                                                                      • Q2: Does everyone on the team use two-factor authentication on all key accounts?
                                                                                                                                                      • Q3: Is everyone on the team aware of phishing risks, including novel attacks that use AI for impersonation?

                                                                                                                                                      Basic hygiene for system admins

Early in a startup’s life, system administration is a hat, not a full-time role. This hat is often worn poorly: for example, it may seem easier to grant excessive access to speed up execution than to keep tight control of accounts and adhere to the principle of least privilege. However, it’s much easier to start with secure admin practices while the team is a handful of people than to revoke access and disrupt the workflows of dozens of people later on.

                                                                                                                                                      You’d also sleep better at night if the answers to all the following questions imply that you’re securing things to the best of your abilities.

                                                                                                                                                      • Q4: Is the principle of least privilege enforced on all entities? That is, do human and non-human accounts have only the minimal privileges needed to do their job?
                                                                                                                                                      • Q5: Do you follow zero trust principles?
                                                                                                                                                      • Q6: Are all company devices encrypted? Are they kept up to date and protected from common threats like malware?
                                                                                                                                                      • Q7: Is any work done on non-company devices? If so, are they secured as well as company devices?

                                                                                                                                                      Least privilege is key. Last year, I participated in a reading group of the book Building Secure and Reliable Systems. The moderator – who has over 10 years of security experience with Google – referred to the chapter on least privilege as “the meat and potatoes” of the whole book. Ignore at your own risk.
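
To make least privilege concrete, here’s a hypothetical sketch: a reporting service that only needs to read raw event files gets a policy scoped to exactly that prefix, rather than broad account-level access. It uses AWS IAM via boto3 with made-up bucket and policy names; every major cloud has an equivalent.

import json

import boto3

# Read-only access to a single S3 prefix – nothing else.
least_privilege_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": ["s3:GetObject"],
            "Resource": "arn:aws:s3:::acme-data-lake/raw-events/*",  # hypothetical bucket and prefix
        }
    ],
}

boto3.client("iam").create_policy(
    PolicyName="reporting-raw-events-read-only",  # hypothetical policy name
    PolicyDocument=json.dumps(least_privilege_policy),
)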

                                                                                                                                                      Basic hygiene for developers

                                                                                                                                                      While system administration is an early-stage hat, tech startups typically have developers shipping software as their primary role. Unfortunately, basic security hygiene is frequently overlooked by developers. And even if developers know what they should do, they may cut corners under the pressure of deadlines.

                                                                                                                                                      Fortunately, early-stage startups are rarely an interesting target for hackers, so there’s time to improve your security posture around these four questions:

• Q8: Are all API keys stored appropriately (e.g., in a cloud provider’s secret store rather than in source code)? See the sketch after this list.
                                                                                                                                                      • Q9: Are developers aware of the OWASP top ten security risks?
                                                                                                                                                      • Q10: Do you keep up with vulnerability reports and promptly apply security patches to your product (including updates of third-party dependencies)?
                                                                                                                                                      • Q11: If you’re building with large language models, are you aware of the new class of threats they introduce (like prompt injections)? How do you mitigate those risks?
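
As a minimal illustration of Q8, here’s a sketch of keeping a key out of source code. The names (PARTNER_API_KEY, prod/partner-api-key) are hypothetical, and AWS Secrets Manager is just one option – every major cloud provider offers an equivalent secret store.

import os

import boto3  # AWS SDK; used here only as an example of a managed secret store


def get_partner_api_key() -> str:
    """Fetch an API key without ever committing it to the repository."""
    # Simplest option: the key is injected as an environment variable at deploy time.
    if "PARTNER_API_KEY" in os.environ:
        return os.environ["PARTNER_API_KEY"]
    # Otherwise, pull it from a managed secret store at runtime.
    secrets = boto3.client("secretsmanager")
    return secrets.get_secret_value(SecretId="prod/partner-api-key")["SecretString"]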

                                                                                                                                                      Confidence in current security posture

When it comes to security, what you don’t know can hurt you. Continuous learning about threats and best practices is key. However, it can’t replace professional audits or proactive improvement and monitoring of your systems. This is the focus of the next three questions:

                                                                                                                                                      • Q12: Were any security audits done by a third party?
                                                                                                                                                      • Q13: Were there any security incidents? How were they handled?
                                                                                                                                                      • Q14: Do you have the instrumentation in place to be confident that there isn’t a breach right now? How confident are you on a scale of 1 to 5?

                                                                                                                                                      Data-specific questions

                                                                                                                                                      The best protection from data breaches and compliance issues is to avoid collecting any sensitive data. However, as business value often comes from proprietary data, this isn’t a practical solution. Instead, keep the following questions in mind as you build and run your startup:

                                                                                                                                                      • Q15: Do you handle any sensitive data?
• Q16: Are you aware of, and do you comply with, all relevant legislation around the data you handle and retain?
• Q17: How do you secure data at rest and in flight? (See the sketch after this list.)
• Q18: What systems do you have for mitigating and recovering from data loss? How likely is a catastrophic data loss (e.g., due to reliance on one person or a single data centre)? How long would it take you to recover?
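
On Q17, at-rest encryption is usually handled by the platform (encrypted volumes, database and bucket-level encryption with keys held in a key management service), and in-flight protection mostly means enforcing TLS on every connection. For completeness, here’s a minimal application-level sketch using Python’s cryptography library, with a made-up file name.

from cryptography.fernet import Fernet

key = Fernet.generate_key()  # keep the key in a secret manager, never next to the data
fernet = Fernet(key)

# Encrypt a sensitive file before writing it to disk or object storage.
with open("customer_export.csv", "rb") as plaintext_file:
    ciphertext = fernet.encrypt(plaintext_file.read())
with open("customer_export.csv.enc", "wb") as encrypted_file:
    encrypted_file.write(ciphertext)

# Later, fernet.decrypt(ciphertext) recovers the original bytes.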

                                                                                                                                                      Data-to-AI health beyond security & compliance

                                                                                                                                                      This post is part of a series on my Data-to-AI Health Check for Startups. Previous posts:

                                                                                                                                                      You can download a guide containing all the questions as a PDF. As this turned out to be an epic (and fun) project, I may also publish a post on distilling the top questions and running an efficient health check. Feedback is always welcome!

                                                                                                                                                      Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                                                                                        Exploring an AI product idea with the latest ChatGPT, Claude, and Gemini

                                                                                                                                                        screenshot of the top of ChatBot Arena leaderboard, showing GPT-4o, Claude 3.5 Sonnet, and Gemini 1.5

                                                                                                                                                        With the hype around new AI models and tools, I often feel like I’m missing out on something amazing. For example, the excitement around Claude 3.5 Sonnet has been especially notable, despite it being ranked below ChatGPT-4o on the LMSYS Chatbot Arena Leaderboard.

                                                                                                                                                        The best way to get a feel for a model is to play with it yourself (though you’d need more than that to take language models to production). To this end, I had near-identical chats with Claude 3.5 Sonnet, ChatGPT-4o, and Gemini 1.5 Pro.
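
(These chats were done interactively, but the same side-by-side comparison could also be scripted with the vendors’ Python SDKs. A minimal sketch, assuming the openai, anthropic, and google-generativeai packages are installed, API keys are set in the environment, and using the model names that were current in mid-2024:)

import os

import anthropic
import google.generativeai as genai
from openai import OpenAI

PROMPT = "Hey, have you heard of Reef Life Survey?"

# OpenAI reads OPENAI_API_KEY from the environment.
openai_reply = (
    OpenAI()
    .chat.completions.create(model="gpt-4o", messages=[{"role": "user", "content": PROMPT}])
    .choices[0]
    .message.content
)

# Anthropic reads ANTHROPIC_API_KEY; max_tokens is a required parameter.
claude_reply = (
    anthropic.Anthropic()
    .messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1024,
        messages=[{"role": "user", "content": PROMPT}],
    )
    .content[0]
    .text
)

# Google's SDK is configured explicitly with an API key.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
gemini_reply = genai.GenerativeModel("gemini-1.5-pro").generate_content(PROMPT).text

for name, reply in [("ChatGPT", openai_reply), ("Claude", claude_reply), ("Gemini", gemini_reply)]:
    print(f"=== {name} ===\n{reply}\n")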

My conclusion: The leaderboard and benchmarks don’t lie; the top three models feel about the same (but your mileage will vary by task and prompt).

                                                                                                                                                        As I’ve recently opined on LinkedIn, it seems likely that the top models will keep improving but remain roughly equivalent. In the long term, the real competition is around deriving business value from the models, which comes down to building products and distribution.

                                                                                                                                                        Commercialisation is an area where players like Google have a massive advantage. For example, Google is running a competition for Gemini applications where the rewards include over $1M in cash and an electric DeLorean. Meanwhile, Anthropic is running a competition for Claude applications where the reward is $10k in Claude API credits. The difference is stark.

                                                                                                                                                        Google’s competition made me consider MagicGrantMaker – an AI product idea that could rely on Gemini. In short, it’s about helping niche organisations find and apply for grants. This has been on my mind recently, as I’ve had discussions with Reef Life Survey around collaboration on AI grants to support automation of aspects of their marine biodiversity monitoring. However, I haven’t had much time to find suitable grants.

                                                                                                                                                        My primary chatbot is ChatGPT-4o. Assessing the viability of MagicGrantMaker was a good opportunity to take the latest Claude and Gemini for a spin.

                                                                                                                                                        The rest of this post presents the MagicGrantMaker idea, along with a comparison of highlights from the chat transcripts.

                                                                                                                                                        Setting the scene

                                                                                                                                                        I like starting chats by ensuring that the chatbot and I are on the same page when it comes to background knowledge:

                                                                                                                                                        YS: Hey, have you heard of Reef Life Survey?

                                                                                                                                                        Unsurprisingly, the three bots gave near-identical answers. Here are the first two sentences from each response:

                                                                                                                                                        ChatGPT: Yes, I’m familiar with the Reef Life Survey (RLS). It’s a global marine biodiversity monitoring program that enlists the help of trained volunteer divers to collect data on reef species and habitats. […]

                                                                                                                                                        Claude: Yes, I have heard of Reef Life Survey. It’s an international program that involves trained volunteer divers collecting data on marine life and habitats in coastal areas around the world. […]

                                                                                                                                                        Gemini: Yes, I have heard of Reef Life Survey (RLS). It is a world-renowned marine monitoring program that uses trained recreational SCUBA divers to collect data on reef fish, invertebrates, and habitats. […]

                                                                                                                                                        They all kept going with extra details, and ended their response with a suggestion to provide more information.

                                                                                                                                                        The product idea

                                                                                                                                                        Once the scene was set, I hit the bots with the big idea. I sent the three the following message:

                                                                                                                                                        YS: I’m deeply familiar with the organisation, as I’m an RLS volunteer.

                                                                                                                                                        What I’m after today is this: I have an idea for a web app that’d help organisations like RLS find grant opportunities and apply for them. As it relies on recent improvements in LLMs / AI (specifically text summarisation and generation), I call it MagicGrantMaker (MGM).

                                                                                                                                                        The main value proposition for RLS is that it’d:

                                                                                                                                                        1. Surface new grant opportunities
                                                                                                                                                        2. Draft grant applications

                                                                                                                                                        That is, it’d save them time and help them increase their funding.

                                                                                                                                                        One key strength of MGM is in its focus on a specific niche for the MVP (Australian marine ecology researchers). Beyond the MVP, the idea is to keep finding under-served niches and deliver an exceptional experience to specific grant seekers.

                                                                                                                                                        The second niche would be Australian renewable energy startups that may be able to get funding from the likes of ARENA.

                                                                                                                                                        The third niche may be startups like MGM itself: supporters of climate tech and nature conservation.

                                                                                                                                                        If this proves to be a viable product beyond a prototype, it’d follow a freemium model:

                                                                                                                                                        • Free: Search for grants using keywords and a natural language interface
                                                                                                                                                        • Paid tier 1: Create an organisation profile (much of it can be scaffolded automatically from websites and LinkedIn), and get personalised grant opportunities emailed to you.
                                                                                                                                                        • Paid tier 2: Draft grant applications based on your org profile with AI.

                                                                                                                                                        Viability would depend on data quality, coupled with the quality of the AI implementation. Then it’s all up to effective marketing. The idea is that if the search works well (again, depends on getting a unique dataset), then the free part would be valuable enough to lure people in.

                                                                                                                                                        I haven’t spoken to any potential customers yet. The next step is finding grant sources to assess the viability of the search engine for the initial niche.

                                                                                                                                                        What do you think? Please be brutally honest.

                                                                                                                                                        Despite my request for the bots to be brutally honest, they still felt too gentle. What I found most interesting is that they all followed the same response structure: intro, strengths, weaknesses, next steps, and conclusion. Given the length of my prompt, I expected more divergence.

                                                                                                                                                        For brevity, I’ll summarise their key points rather than provide the full transcripts.

                                                                                                                                                        Strengths:

                                                                                                                                                        • [All three] Niche focus
                                                                                                                                                        • [All three] Freemium model
                                                                                                                                                        • [ChatGPT & Claude] Scalability
                                                                                                                                                        • [ChatGPT] Value proposition
                                                                                                                                                        • [Claude] Time-saving potential
                                                                                                                                                        • [Gemini] Addresses a real pain point
                                                                                                                                                        • [Gemini] Leverages AI effectively

                                                                                                                                                        Challenges:

                                                                                                                                                        • [All three] Data quality and access
                                                                                                                                                        • [All three] Competition
                                                                                                                                                        • [All three] AI limitations
                                                                                                                                                        • [ChatGPT & Gemini] Marketing and user acquisition
                                                                                                                                                        • [ChatGPT] Customer validation
                                                                                                                                                        • [Claude] Customer adoption
                                                                                                                                                        • [Claude] Regulatory compliance
                                                                                                                                                        • [Claude] Pricing strategy
                                                                                                                                                        • [Gemini] Freemium conversion

                                                                                                                                                        Next steps:

                                                                                                                                                        • [ChatGPT] (1) customer interviews, (2) prototype development, (3) data partnerships, (4) AI and UX testing, (5) marketing plan.
                                                                                                                                                        • [Claude] (1) market research, (2) grant source assessment, (3) MVP development, (4) legal considerations.
                                                                                                                                                        • [Gemini] (1) data validation, (2) prototype and testing, (3) competitive analysis, (4) talk to potential customers.

Apart from Claude being a bit too concerned about legal issues, the outputs are essentially the same. I suppose it’s not too surprising: the current generation of general-purpose chatbots excels at surfacing generic advice. Often, generic advice is all you need. And as with coaching, it’s the coachee who needs to do most of the work.

                                                                                                                                                        Grant sources

                                                                                                                                                        My main motivation for the chat was to surface data sources and assess how hard it’d be to collect the required data. I proceeded to ask about:

                                                                                                                                                        • Grant sources for RLS
                                                                                                                                                        • Grant sources for applying AI to RLS work
                                                                                                                                                        • Grant sources for Australian renewable energy startups

                                                                                                                                                        As with the beginning of the chats, all three recommended pretty much the same funding bodies. No surprises there!

                                                                                                                                                        One thing I found interesting was the number of potential grant sources, which was in the dozens. These were all valid – I was either already familiar with them, or verified that they had funded RLS’s work before.

The number and diversity of grant sources have reinforced my suspicion that keeping on top of grants is a non-trivial problem. In fact, I found a potential grant to apply for as part of this work. However, this isn’t enough evidence that MagicGrantMaker is a viable business idea – I’d need to talk with potential customers if I were to pursue it. Still, given the existence of similar companies in the space (e.g., a couple of startups are helping American companies get climate tech grants), I do believe that with the right niches and execution, MagicGrantMaker could be a thing.

                                                                                                                                                        Conclusion

                                                                                                                                                        Paraphrasing the words of an OpenAI engineer: “AI model behaviour is determined by your dataset, nothing else”. Therefore, it is somewhat unsurprising to see ChatGPT, Claude, and Gemini converging to near-identical outputs. The main dataset is the open web, but a lot of additional proprietary data comes from fine-tuning the base models on human feedback (the part that turns auto-complete language models into friendly chatbots).

                                                                                                                                                        You’d expect the proprietary data to make a difference, but companies like Anthropic and OpenAI source at least some of their proprietary data from the same providers (like Scale). The fact that all models compete on beating the same benchmarks also contributes to their sameness.

                                                                                                                                                        While my comparative analysis is anecdotal, it has helped reaffirm my suspicion that when it comes to the top general-purpose models, it’s fine to just choose one with a convenient UI and tooling. I believe that we’ll keep seeing advancements, but that the top players will keep catching up with each other. This is similar to what happens in Kaggle competitions and other sports, where simply knowing that a score is attainable causes all competitors to put in extra effort to attain it.

                                                                                                                                                        It’s a different story when it comes to special-purpose AI tools that are trained on proprietary data. MagicGrantMaker is an example of such a tool: its value would come from being hyper-niche and collecting a unique dataset from specific customers. It’s simple, but not easy. And the hardest part is execution over time rather than coming up with new ideas.

Subscribe

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.


                                                                                                                                                          Learn about Dataland to avoid expensive hiring mistakes

                                                                                                                                                          screenshot of Yanir Seroussi presenting a slide from the webinar

                                                                                                                                                          This week, I will present a webinar on hiring a startup’s first Data-to-AI specialist.

                                                                                                                                                          Why Data-to-AI rather than Data & AI?

                                                                                                                                                          Because AI isn’t distinct from Data – AI is built on data. Further, solid data plumbing underlies both analytics and advanced AI/ML. Despite the hype, descriptive models (aka analytics) are often as critical as predictive and causal models (aka advanced AI/ML & statistics).

                                                                                                                                                          As practice for the webinar, I recorded myself going through the first part of the deck. I’m a bit uncomfortable watching myself, which makes sharing the video an especially valuable exercise:

                                                                                                                                                          The full slides are here (use the left/right and up/down arrows to navigate).

                                                                                                                                                          Key points from the video

                                                                                                                                                          These are the key points covered in the video, along with a couple of pointers to relevant posts:

                                                                                                                                                          • Intro: The intended audience and my relevant background.
• Main goal of the webinar: Avoid expensive mistakes – remembering that the cost of a wrong hire entails compensation + slowdown + opportunity cost.
                                                                                                                                                          • Sub-goals:
                                                                                                                                                            • De-hype Data & AI
                                                                                                                                                            • Clarify needs & opportunities
                                                                                                                                                            • Consider not hiring
                                                                                                                                                            • Hire well
                                                                                                                                                            • Avoid pitfalls
                                                                                                                                                          • Dataland is Venn diagram paradise – but startups need not worry about Dataland Venns. The lines between areas like artificial intelligence and data science are arbitrary and blurry. In any case, startups need generalists early on.
                                                                                                                                                          • Ask de-hyping questions on plumbing, decisions, and automation:
                                                                                                                                                            • Plumbing: What’s the state of your data engineering lifecycles?
                                                                                                                                                            • Decisions: How do you use descriptive, predictive, and causal modelling to support decisions?
                                                                                                                                                            • Automation: How do you use AI to automate processes?
                                                                                                                                                          • Work up from principles and business problems rather than by starting with the many tools from the MAD landscape. But keep in mind that tools do matter. Paraphrasing a quote from David Allen, a great wrench doesn’t make a great plumber, but a great plumber will always want to use a great wrench.
                                                                                                                                                          • Keep these terms in mind:
                                                                                                                                                          • Don’t expect magic: Advanced AI/ML and Analytics are both hard and depend on solid plumbing.

                                                                                                                                                          Going deep into one Venn diagram

                                                                                                                                                          After creating the initial draft of the slides and making fun of Dataland Venns, I realised that going deep into one Venn diagram would be beneficial. I cover the diagram below in the last part of the video.

                                                                                                                                                          In the diagram, I highlight these three roles:

                                                                                                                                                          • Analyst: Focused on understanding and explaining the business (descriptive modelling)
                                                                                                                                                          • Engineer: Focused on building scalable systems (i.e., plumbing)
                                                                                                                                                          • Statistician: Focused on building mathematical models from data (predictive and causal modelling)

                                                                                                                                                          Learn about Dataland to avoid expensive hiring mistakes

                                                                                                                                                          screenshot of Yanir Seroussi presenting a slide from the webinar

                                                                                                                                                          This week, I will present a webinar on hiring a startup’s first Data-to-AI specialist.

                                                                                                                                                          Why Data-to-AI rather than Data & AI?

                                                                                                                                                          Because AI isn’t distinct from Data – AI is built on data. Further, solid data plumbing underlies both analytics and advanced AI/ML. Despite the hype, descriptive models (aka analytics) are often as critical as predictive and causal models (aka advanced AI/ML & statistics).

                                                                                                                                                          As practice for the webinar, I recorded myself going through the first part of the deck. I’m a bit uncomfortable watching myself, which makes sharing the video an especially valuable exercise:

                                                                                                                                                          The full slides are here (use the left/right and up/down arrows to navigate).

                                                                                                                                                          Key points from the video

                                                                                                                                                          These are the key points covered in the video, along with a couple of pointers to relevant posts:

                                                                                                                                                          • Intro: The intended audience and my relevant background.
• Main goal of the webinar: Avoid expensive mistakes – remembering that the cost of a wrong hire entails compensation + slowdown + opportunity cost.
                                                                                                                                                          • Sub-goals:
                                                                                                                                                            • De-hype Data & AI
                                                                                                                                                            • Clarify needs & opportunities
                                                                                                                                                            • Consider not hiring
                                                                                                                                                            • Hire well
                                                                                                                                                            • Avoid pitfalls
                                                                                                                                                          • Dataland is Venn diagram paradise – but startups need not worry about Dataland Venns. The lines between areas like artificial intelligence and data science are arbitrary and blurry. In any case, startups need generalists early on.
                                                                                                                                                          • Ask de-hyping questions on plumbing, decisions, and automation:
                                                                                                                                                            • Plumbing: What’s the state of your data engineering lifecycles?
                                                                                                                                                            • Decisions: How do you use descriptive, predictive, and causal modelling to support decisions?
                                                                                                                                                            • Automation: How do you use AI to automate processes?
• Work up from principles and business problems rather than starting with the many tools of the MAD landscape. But keep in mind that tools do matter. Paraphrasing David Allen: a great wrench doesn’t make a great plumber, but a great plumber will always want to use a great wrench.
• Keep the key terms from the slides in mind.
                                                                                                                                                          • Don’t expect magic: Advanced AI/ML and Analytics are both hard and depend on solid plumbing.

                                                                                                                                                          Going deep into one Venn diagram

                                                                                                                                                          After creating the initial draft of the slides and making fun of Dataland Venns, I realised that going deep into one Venn diagram would be beneficial. I cover the diagram below in the last part of the video.

In the diagram, I highlight these three roles (a small code sketch contrasting their modelling focus follows the list):

                                                                                                                                                          • Analyst: Focused on understanding and explaining the business (descriptive modelling)
                                                                                                                                                          • Engineer: Focused on building scalable systems (i.e., plumbing)
                                                                                                                                                          • Statistician: Focused on building mathematical models from data (predictive and causal modelling)
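
To make the descriptive/predictive/causal distinction concrete, here’s a minimal, hypothetical sketch on a tiny synthetic dataset. It isn’t from the webinar slides – the column names, coefficients, and use of pandas and scikit-learn are all illustrative assumptions:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression

    # Hypothetical product-usage data; every column and coefficient is made up.
    rng = np.random.default_rng(42)
    n = 1_000
    df = pd.DataFrame({
        "plan": rng.choice(["free", "pro"], size=n),
        "emails_sent": rng.poisson(3, size=n),
        "new_onboarding": rng.integers(0, 2, size=n),  # randomised A/B flag
    })
    df["active_minutes"] = (
        20
        + 5 * (df["plan"] == "pro")
        + 2 * df["emails_sent"]
        + 4 * df["new_onboarding"]
        + rng.normal(0, 5, size=n)
    )

    # Descriptive (analyst): summarise what happened.
    print(df.groupby("plan")["active_minutes"].mean())

    # Predictive (advanced AI/ML): estimate the outcome for new users.
    features = pd.get_dummies(df[["plan", "emails_sent", "new_onboarding"]])
    model = LinearRegression().fit(features, df["active_minutes"])
    print(model.predict(features.head(3)))

    # Causal (statistics + experiment design): effect of the randomised change.
    effect = (df.loc[df["new_onboarding"] == 1, "active_minutes"].mean()
              - df.loc[df["new_onboarding"] == 0, "active_minutes"].mean())
    print(f"Estimated onboarding effect: {effect:.1f} minutes")

In practice, the analyst, statistician, and engineer differ in how much of their time goes into each of these steps – and into the plumbing that feeds them.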

                                                                                                                                                            Your first Data-to-AI hire: Run a lovable process

                                                                                                                                                            screenshot of Yanir Seroussi with a slide that says 'above all... keep it efficient and respectful'

                                                                                                                                                            Hiring well is hard. You’re trying to find a match for an under-specified role from a pool of unique humans. That said, running a reasonable hiring process shouldn’t be that hard. This is especially true for startups, which are unconstrained by the legacy requirements of a large bureaucracy.

                                                                                                                                                            Recently, I gave a webinar on hiring a startup’s first Data-to-AI specialist. As practice, I recorded myself going through the slides.

                                                                                                                                                            The first part of the webinar covered what startups can expect from Data / AI / ML roles (posted last week). The second part gets into advice on hiring decisions and running a reasonable process:

                                                                                                                                                            The full slides are here (use the left/right and up/down arrows to navigate). The webinar page contains a recording that also includes audience questions.

                                                                                                                                                            The rest of this post discusses the key points from the video. Much of this applies to any startup role – especially to hiring the first specialist outside your expertise.

                                                                                                                                                            Before hiring

                                                                                                                                                            First, you should clarify the role and consider not hiring. Specifically:

                                                                                                                                                            • Ensure that the role is needed for near-term growth.
                                                                                                                                                            • Start with responsibilities rather than titles, especially given that data titles are often uninformative.
                                                                                                                                                            • Make a 90-day plan for the hire, including successful outcomes.
                                                                                                                                                            • Consider alternatives to hiring a full-timer: doing urgent work in-house, getting a contractor or fractional (i.e., part-time) person, and deferring non-urgent work.

                                                                                                                                                            Deciding to hire

                                                                                                                                                            Assuming you’ve decided to hire, you should further clarify the role. This includes:

                                                                                                                                                            • Clarify the activities and successful outcomes for the first 30 & 90 days, so the person can hit the ground running.
                                                                                                                                                            • Define aspirational goals for the 6-12 month horizon. These are likely to shift in startups, but should help with aligning expectations.
                                                                                                                                                            • Decide on the person’s level, which is likely to be mid-senior: a hands-on person who can lead a team and drive tech decisions. If they’re too junior, they will struggle to ship quality work independently – the opposite of what you want from your first Data-to-AI specialist. If they’re too senior, they’re less likely to be hands-on and more likely to be expensive. That said, keep in mind that seniority is correlated with years of experience, but not perfectly – curiosity and a growth mindset are key.
                                                                                                                                                            • Set a compensation range. This includes salary and equity, but you can trade monetary compensation for perks like 80% or 90% time. Further, some people may be willing to accept lower compensation for learning and growth opportunities, and for contributing to a mission that aligns with their values. This should all be within reason, though – if you try to offer half the prevailing rates for similar startup roles, you will miss out on quality candidates.
                                                                                                                                                            • Give the role a title. As discussed in the previous post, this is likely to be a Data Tech Lead (Analytics Engineer) if the person will be setting up data pipelines and analytics, or an AI/ML Lead (AI/ML Engineer) if they’ll be focused on contributing to the core product of an AI/ML-centric startup.

                                                                                                                                                            After getting clearer on the role, you should consider getting help. This includes:

• External recruiters for sourcing candidates and understanding the market. Many recruiters are terrible, but good ones can expedite the process and improve your outcomes. Even if you don’t retain an external recruiter for their full set of services, some will offer advice on your job ad, compensation range, and process.
• Internal recruiters for initial screening and running the process. While your startup may not be hiring enough people to justify a full-time internal recruiter, consider retaining a temporary recruiter if you’re trying to fill multiple roles (e.g., after a raise). Compared with retaining an external agency, this helps align incentives.
                                                                                                                                                            • Data / AI / ML specialists for technical screening. Keep in mind the different areas of Dataland, though – an analyst won’t be of much help in screening AI/ML engineers.

                                                                                                                                                            The hiring process

                                                                                                                                                            My main message here is to run a process you’d love. Some startup founders were never employees, but you should try to put yourself in the shoes of candidates. Career arcs are long, so the person you’ve put through a poor process may interview you one day – or they may end up working for a potential client. Further, people share experiences on platforms like Glassdoor and Blind, and it’s hard to undo reputational damage there.

                                                                                                                                                            With that in mind, my main suggestions for running a process you’d love are:

• Be clear and honest. This includes honesty about the challenges at the company, which helps with retention far more than selling a false narrative.
                                                                                                                                                            • Be responsive and efficient. As a startup, being quick is a competitive advantage. You also want to get back to building your product as soon as possible. According to a recruitment leader I spoke to, ghosting is the number one complaint they get from candidates. You can stand out by responding quickly. A quick rejection is better than no response.
                                                                                                                                                            • Ask relevant, informative questions. Assuming the role and 90-day plan are well-defined, they should be the source of your questions. Also check industry best practices, but…
• Remember you are not Google. Startups should run faster, more efficient processes than big companies. There’s a lot of silliness in big tech hiring, including hackable assessments that rely on memorising a handful of techniques. Personally, I’ve had identical questions asked by Google, Microsoft, Qualcomm, and Intel. Such questions mostly test candidate motivation and the ability to prepare for interviews, which are only partly correlated with on-the-job performance.

                                                                                                                                                            Potential process flow

                                                                                                                                                            1. Job ad:
                                                                                                                                                              • Make it clear and honest by including key challenges that a successful candidate is likely to face.
                                                                                                                                                              • Include the hiring process, compensation range, perks, 30-90 day plans, and 6-12 month aspirations.
                                                                                                                                                              • Conclude with a simple call to action and custom questions (e.g., how would you deal with the key challenges). Relevant custom questions are better than asking for an open-ended cover letter, especially as many cover letters are written by ChatGPT these days (i.e., they’re useless).
2. Quick rejections: If an applicant isn’t a good fit, reject them quickly with a template note (spammers and bots don’t merit a response).
3. Initial screen: Invite each candidate who is a potential fit to a 30-minute intro call. This is a low-pressure, two-way conversation to understand candidate expectations and see whether there’s high-level alignment. Tips:
• Avoid wasting time reciting details from the job ad or getting candidates to recount what’s already in their resume. Instead, dig into how their past work and current motivations make them a good fit for the role as you’ve defined it. This may include clarifying and elaborating on the questions you asked as part of the application.
• Consider asking simple technical questions to filter out people who look good on paper but may be clueless – e.g., any AI/ML person should be able to explain the difference between classification and regression (see the sketch after this list) or between supervised and unsupervised learning. Further, explaining technical concepts to non-technical people is a required skill for your first Data-to-AI hire.
                                                                                                                                                            4. Technical screen:
• Personally, I’m a fan of take-home tasks that can be completed in a reasonable time. They are superior to tightly-timed assessments because they’re closer to real-life work, but keeping the time commitment reasonable is key. For example, design a task that can be completed in 1-2 hours, then give candidates 3-4 hours to complete it. If it takes too long, some candidates will drop out of the process.
                                                                                                                                                              • To ensure that candidates understand what they’ve done, schedule a call to dig into their task answers. We live in the age of AI assistants, so I believe candidates should be allowed to use AI as part of a realistic screening process. However, blindly copying chatbot answers is foolish – the live call will help you filter out candidates who did that.
                                                                                                                                                              • As alternatives to a long technical screen, consider paid trials and contract-to-hire arrangements. The best indication of work performance is doing actual work, but whether this is feasible depends on your candidate pool.
                                                                                                                                                            5. Final screen and offer: This may include a chat with the founders and reference checks. Keep in mind that reference checks are typically time-consuming and provide little signal – one recruiter I’ve spoken to said they’ve done thousands and only a handful changed the decision. If you’re ready to extend an offer, you may do it orally at the final screen call.
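
To illustrate the kind of basic distinction mentioned in step 3, here’s a minimal, hypothetical Python sketch (toy data from scikit-learn generators; nothing here is from the webinar). A reasonable candidate should be able to explain the contrast in plain language:

    from sklearn.datasets import make_classification, make_regression
    from sklearn.linear_model import LinearRegression, LogisticRegression

    # Classification: predict a discrete label (e.g., will this user churn? yes/no).
    X_cls, y_cls = make_classification(n_samples=200, n_features=5, random_state=0)
    clf = LogisticRegression().fit(X_cls, y_cls)
    print("Predicted labels:", clf.predict(X_cls[:3]))

    # Regression: predict a continuous quantity (e.g., next month's spend).
    X_reg, y_reg = make_regression(n_samples=200, n_features=5, noise=10, random_state=0)
    reg = LinearRegression().fit(X_reg, y_reg)
    print("Predicted values:", reg.predict(X_reg[:3]).round(1))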

                                                                                                                                                            Common pitfalls

                                                                                                                                                            Common pitfalls in hiring the first Data-to-AI specialist include:

• Ignoring experts. The main experts I have in mind are recruiters. The best ones have gained deep expertise from handling thousands of candidates – it’s worth consulting them.
                                                                                                                                                            • Trusting the wrong experts. For example, while I can help with navigating Data / AI / ML roles and technical screening, I’m not a recruitment expert. Take anything I say with a grain of salt.
                                                                                                                                                            • Google worship. One aspect of this is blindly copying Google (or other BigName) processes. Another is getting dazzled by pedigree – big companies hire many people who end up under-performing. Further, those who have only worked at a big company may not transition well to a startup role.
                                                                                                                                                            • Hunting unicorns. As noted in the first part of the talk, unicorns who are good at everything do exist, but they’re hard to hire and retain. Also, there’s a limit to how effective a single person can be when they’re wearing many hats.
                                                                                                                                                            • Expecting magic. Dataland has experienced many hype waves – from Big Data through Data Science to Generative AI. Therefore, data experts are often expected to wave their AI wand and deliver magical results. However, getting long-term business value from Data / AI / ML is a non-trivial matter, and recruiting perfect humans is impossible. Aim to align your expectations with what’s feasible for your startup.
• Hiring narrow specialists. People who are too narrow in their skills are rarely a good fit for a startup, especially since the first specialist will need to wear many hats. Exceptions include specialists whose narrow expertise lies in the startup’s core value proposition.

                                                                                                                                                            Recap: Key takeaways

                                                                                                                                                            Overall, the key takeaways from both parts of the webinar are:

                                                                                                                                                            • Data & AI are all about plumbing, decisions, and automation.
                                                                                                                                                            • Before hiring, get clear on your needs.
                                                                                                                                                            • Outcomes first: Consider alternatives to hiring.
                                                                                                                                                            • If hiring, run a process you’d love.
                                                                                                                                                            • Keep learning, get help, and try to avoid pitfalls.

                                                                                                                                                            Feedback is always welcome!

                                                                                                                                                              AI/ML lifecycle models versus real-world mess

                                                                                                                                                              ChatGPT-generated messy machine learning lifecycle model (inspired by xkcd)

                                                                                                                                                              One of my challenges with the transition to consulting is running effective diagnoses. As a long-term employee, you develop an awareness of problems within your organisation – especially problems you’ve caused yourself (e.g., by taking on tech debt). As an outsider with deep industry knowledge, you can guess what the problems are based on the initial brief. However, there’s often no way around spending time with a client to thoroughly diagnose their problems and propose custom solutions.

                                                                                                                                                              For a recent engagement, the high-level brief was to help the client overcome challenges around the reproducibility and rigour of their machine learning work (AI/ML henceforth). Without getting into confidential details, it was immediately apparent that some of the challenges were due to a lack of engineering experience by the scientists building the models (which is a common problem).

As my path into data science & AI/ML was via software engineering, I’ve always found myself doing engineering work in my data roles. If I were a full-time employee of the client, I’d have ample time to introduce engineering best practices and processes, and help the scientists develop relevant skills (e.g., as I’ve done in my full-time work with Automattic). However, as an external consultant, I didn’t have the luxury of months and years to understand the context and build relationships. While I do offer long-term engagements to help with implementation, I prefer completing the initial discovery phase as a separate engagement, before requiring either party to commit to working together long-term. Therefore, I had to figure out an effective way to diagnose the problems and propose a roadmap.

                                                                                                                                                              As a part of the discovery phase, I could use the questions from the Tech section of my Data-to-AI Health Check for Startups. Specifically, the first three questions should provide a good overview of the current state of processes and tooling:

                                                                                                                                                              • Q1: Provide an architecture diagram for your tech systems (product and data stacks), including first-party and third-party tools and databases. If a diagram doesn’t exist, an ad hoc drawing would work as well.
                                                                                                                                                              • Q2: Zooming in on data stacks, what tools and pipelines do you use for the data engineering lifecycles (generation, storage, ingestion, transformation, and serving), and downstream uses (analytics, AI/ML, and reverse ETL)?
                                                                                                                                                              • Q3: Zooming in further on the downstream uses of analytics and AI/ML, what systems, processes, and tools do you use to manage their lifecycles (discovery, data preparation, model engineering, deployment, monitoring, and maintenance)? Give specific project examples.
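As one illustration, the answers to these three questions could be captured in a simple note-taking structure during discovery. The following minimal Python sketch is purely hypothetical – the class, field names, and example tools are my own illustrations, not part of the health check itself:

from dataclasses import dataclass, field

@dataclass
class TechDiscoveryNotes:
    # Q1: links to architecture diagrams or photos of ad hoc drawings
    architecture_diagrams: list[str] = field(default_factory=list)
    # Q2: data engineering lifecycle tooling, keyed by stage
    data_stack: dict[str, list[str]] = field(default_factory=dict)
    # Q3: analytics & AI/ML lifecycle tooling, keyed by stage
    ml_stack: dict[str, list[str]] = field(default_factory=dict)
    # Q3: specific project examples mentioned by the client
    example_projects: list[str] = field(default_factory=list)

notes = TechDiscoveryNotes(
    architecture_diagrams=["whiteboard-photo.png"],  # hypothetical artefact
    data_stack={"ingestion": ["Airbyte"], "transformation": ["dbt"]},  # hypothetical tools
    ml_stack={"model engineering": ["scikit-learn", "MLflow"]},  # hypothetical tools
    example_projects=["churn prediction v2"],  # hypothetical project
)
print(sorted(notes.data_stack))  # ['ingestion', 'transformation']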

                                                                                                                                                              However, while I was satisfied with the definition of the data engineering lifecycle (as noted in the Q2 link, its ultimate source is the book Fundamentals of Data Engineering), I didn’t love the definition of the AI/ML lifecycle. As I find lifecycle models incredibly useful for uncovering gaps and opportunities, I decided to dig deeper and find an AI/ML lifecycle model that suits my diagnostic needs. The rest of this post discusses the problem in more detail, and presents my findings.

                                                                                                                                                              My problems with the AI/ML lifecycle model

                                                                                                                                                              The AI/ML lifecycle model is messy in at least three ways.

                                                                                                                                                              First, the subject of each stage is different:

                                                                                                                                                              • Discovery deals with the business context, stakeholders, and technical feasibility.
                                                                                                                                                              • Data preparation focuses on transforming and exploring the data.
                                                                                                                                                              • Model engineering is where AI-assisted humans create models via iterative coding and experimentation, with the goal of satisfying offline metrics.
• Deployment, monitoring, and maintenance happen in production – potentially handled by different people, and with results that are far removed from the expectations set by offline metrics.

                                                                                                                                                              Contrast this with the relative simplicity of the data engineering lifecycle: generation, storage, ingestion, transformation, and serving are all things that happen to data. That is, data is the subject of each stage.
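To make the contrast concrete, here’s a minimal Python sketch (my own framing for this post, not taken from any published lifecycle model): the data engineering stages all share data as their subject, while each AI/ML stage gets a subject of its own.

# Data engineering lifecycle: every stage acts on the same subject (data).
DATA_ENGINEERING_STAGES = [
    "generation", "storage", "ingestion", "transformation", "serving",
]

# AI/ML lifecycle: each stage has a different subject, which is part of the mess.
AI_ML_STAGE_SUBJECTS = {
    "discovery": "business context, stakeholders, and feasibility",
    "data preparation": "the data itself",
    "model engineering": "models and offline metrics",
    "deployment": "production systems",
    "monitoring": "production systems",
    "maintenance": "production systems",
}

for stage, subject in AI_ML_STAGE_SUBJECTS.items():
    print(f"{stage}: subject = {subject}")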

                                                                                                                                                              Second, experimentation and the probabilistic nature of AI/ML lead to feedback loops. Authors of AI/ML lifecycle models attempt to capture these loops in diagrams that I find confusing. For example, see two of the better sources I found:


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

                                                                                                                                                                \ No newline at end of file diff --git a/2024/08/05/first-year-lessons-from-a-solo-expertise-biz-in-data-and-ai/index.html b/2024/08/05/first-year-lessons-from-a-solo-expertise-biz-in-data-and-ai/index.html index 6076ff5fa..7df012ef6 100644 --- a/2024/08/05/first-year-lessons-from-a-solo-expertise-biz-in-data-and-ai/index.html +++ b/2024/08/05/first-year-lessons-from-a-solo-expertise-biz-in-data-and-ai/index.html @@ -1,5 +1,5 @@ First year lessons from a solo expertise biz in Data & AI | Yanir Seroussi | Data & AI for Startup Impact -

                                                                                                                                                                First year lessons from a solo expertise biz in Data & AI

My LinkedIn cover tells a part of the story. It used to be much less targeted.

                                                                                                                                                                In a world of AI-generated slop, it’s refreshing to read real human stories.

To contribute my bit to humanity and to help myself write consistently, I decided to start sharing a monthly reflection. This is in line with old-school blogging, and it should help people who find themselves in situations similar to mine.

                                                                                                                                                                The focus will be on my career in Data & AI, but it’s not completely detachable from the rest of my life.

                                                                                                                                                                To set the scene for future reflections, I’ll start with where I’m at now, and what I’m trying to do. I’ll then summarise some highlights and lessons from the past year, and conclude with the past month.

                                                                                                                                                                Now building: Solo expertise business

                                                                                                                                                                It took me an embarrassingly long time to understand this, but broadly speaking, I’m building a solo expertise business.

                                                                                                                                                                Let’s break this up:

                                                                                                                                                                With this in mind, let’s get into an overview of how I got to this point.

                                                                                                                                                                First, ChatGPT’s summary

                                                                                                                                                                ⚠️ The post ended up being way longer than I wanted it to be. While I don’t advocate for blindly publishing AI-generated slop, AI is amazing for summarisation. Therefore, I asked ChatGPT to summarise the rest of the post. It’s not exactly right, and it lacks personality. However, it does capture the key ideas in case you’re busy or uninterested in going deep into the whole story. Now that I got it out there, I will aim to make future reflections shorter and avoid AI-generated summaries.

                                                                                                                                                                1. Leaving Orkestra and Exploring Options (Q2 2023): After leaving Orkestra, the author explored various projects, including revamping a marine biodiversity tool and applying to AI labs. Key lessons included the importance of trust and the realization that foundational AI research was not a preferred path.

                                                                                                                                                                2. Shifting Focus and Embracing Consulting (Q3 2023): The author moved away from the idea of building a solo software product to focusing more on consulting, especially in the climate tech space. This period emphasized the value of finding a niche, clear communication of value propositions, and the blurring lines between consulting and product building.

                                                                                                                                                                3. Navigating Distractions and Positioning Clearly (Q4 2023): Faced with personal and global challenges, the author learned to manage distractions, position themselves clearly in engagements, and recognize that their identity and ethnicity are inescapable aspects of their work. This period also underscored the limited intersection between true detractors and the target client base.

                                                                                                                                                                4. Building Authority and Trust through Outreach (Q1 2024): The start of the new year saw the author becoming more systematic in outreach and lead generation. They committed to consistent publishing, leveraging parasocial relationships to build trust at scale, and reactivating their existing network, which led to increased confidence and new opportunities.

                                                                                                                                                                5. Commitment to a Solo Expertise Business (Q2 2024): In this period, the author fully embraced the concept of a solo expertise business with multiple revenue streams. They focused on refining their LinkedIn profile, joining relevant communities, and understanding the importance of long-term relationship building over immediate gains. This commitment marked the true start of their business journey.

                                                                                                                                                                These points provide a comprehensive overview of the author’s journey from leaving a startup to fully committing to and refining their solo expertise business in Data & AI.

                                                                                                                                                                Q2 2023: Leaving Orkestra and messing around

In April 2023, I left Orkestra – a climate tech startup I had joined in early 2022. It was a classic first Data-to-AI hire story: I worked on an AI/ML project that they ended up pivoting away from (as startups do). While I could have stayed around to help with engineering on their main product, I decided it was time to leave and figure things out on my own. I still like the Orkestra crew and hope they do well, though.

                                                                                                                                                                As financial years in Australia run from July to June, I was in no rush to generate more Q2 revenue due to the high tax rate. Instead, I nominally took time off, but actually ended up doing a few different work-y things in addition to a couple of diving adventures:

                                                                                                                                                                1. Wrapped up a paid project for revamping Reef Life Survey’s Reef Species of the World.
2. After getting swept up in generative AI excitement and a bit of an existential crisis, I decided to try applying for a job with a top AI lab. To my surprise, they sent me their automated code assessment, which led to me spending more time reflecting on why I didn’t want to proceed with the process than I spent preparing for the assessment.
                                                                                                                                                                3. Spent some time exploring software product ideas I could build in the energy space, which led me to a year-long volunteering stint with Work on Climate.
4. Reflected on the shifting landscape of data science in an attempt to come to terms with the fact that I’ve always needed to do plenty of engineering to be effective in data science roles. That reflection turned into a talk abstract, which led to a conference talk in Q3.

                                                                                                                                                                OK, so I’m not great at truly taking time off. But figuring things out was a key goal of Q2 2023.

                                                                                                                                                                What did I learn from Q2 2023?

                                                                                                                                                                1. I wasn’t ready for this lesson yet, but the Reef Life Survey project was an example of the importance of trust and value pricing. I was basically given a budget to improve the web tools because they trust me after almost a decade of volunteering. Scaling up this trust to enough clients will be key to a sustainable consulting practice.
2. I’m not sufficiently interested in foundational AI research – I prefer applying AI to specific problems. I also wasn’t ready to relocate from Australia, even temporarily, for an amazing AI lab job.
                                                                                                                                                                3. I was getting close to realising that I don’t have a software startup idea that I’d commit to, but I wasn’t quite there yet.
                                                                                                                                                                4. I came to terms with data science drifting away from its engineering roots, and realised I needed to reposition in a way that’d emphasise my engineering skills.

                                                                                                                                                                Q3 2023: Shifting away from freelancer and indie hacker mindsets

                                                                                                                                                                Going into July 2023, I was aiming to follow a similar path to my 2014-2015 direction: Do a bit of freelancing to generate runway for building a solo software-as-a-service product business (aka an indie hacker product, partly popularised by Start Small, Stay Small). One thing I learned from 2014-2015 was that it’s hard to sustain the effort needed to build a successful product if I don’t care about the problem it solves, so I focused on exploring ideas in the climate space.

                                                                                                                                                                As no idea felt compelling, I ended up leaning more towards consulting – especially once I realised that the lines between solo consulting and product building are blurry. Putting more effort into consulting in the climate tech space was meant to serve multiple goals: (1) generate revenue; (2) get exposure to problems that may be addressed by an indie solo product; and (3) improve my marketing and sales skills.

                                                                                                                                                                In parallel, I kept an eye on job ads but didn’t hear back about roles I found interesting. Inbound interest in my work was also rather low, partly as a reflection of the job market (not the best timing for me). I was also picky because I didn’t want a full-time job I didn’t care for. However, I was still doing some paid work for Reef Life Survey, and had an inbound lead from a local agency regarding an interesting AI advisory gig. This was more than enough to break even financially, even after going to Bali for a two-week dive trip.

                                                                                                                                                                I also pretty much came to terms with being at the intersection of data science and engineering by giving the talk mentioned above: Lessons from Reluctant Data Engineering. My positioning was still pretty commodified, though – I still labelled myself a “full-stack data scientist”.

                                                                                                                                                                What did I learn from Q3 2023?

                                                                                                                                                                1. Finding a niche that you can serve well is key to building either a solo software product or a solo consultancy.
                                                                                                                                                                2. Likewise, communicating the value proposition clearly is needed for both products and services.
                                                                                                                                                                3. I started moving away from the framing I had of software products as the only “real” sustainable approach, and towards Jonathan Stark’s business strategy: “Help people you like get what they want.”

                                                                                                                                                                Q4 2023: War and hate are powerful distractions

                                                                                                                                                                October 2023 started pleasantly enough, but quickly turned into one of the worst times of my life. I’ve been living in Australia since 2009, but I am a Jew from Israel and still have family and friends there. Therefore, I was deeply affected by October 7th and its aftermath.

                                                                                                                                                                This had two main elements:

                                                                                                                                                                1. I over-consumed news initially. Apart from the immediate worry and uncertainty, this led to secondary trauma (which I only learned about around that time).
                                                                                                                                                                2. I was shocked and appalled by the response of politicians and individuals who claim to support tolerance, peace, and non-violence. Locally, this has included politicians from The Australian Greens whom I had previously voted for. Examples abound, but to me the response is epitomised by the slogan “from the river to the sea…” – a call for ethnic cleansing by the same people who generally bend over backwards to avoid hurting the feelings of non-Jewish minorities.

                                                                                                                                                                Anyway, it goes without saying that being so shocked and distracted wasn’t good for business. But it pales in comparison to what people in the war zone are going through.

The war also revealed that I couldn’t work with some people in the climate space due to the tribalism and hate displayed by certain activists. For example, Greta Thunberg was filmed chanting “crush Zionism” – a call that I interpret as advocating for the death or displacement of the seven million Jews who live in Israel. However, by the end of the year I realised that: (1) most activists aren’t building climate tech businesses; and (2) most people understand they don’t know enough about Middle Eastern geopolitics to have strong opinions either way. Therefore, the intersection between true haters and my potential client base is minimal.

                                                                                                                                                                Concurrently, the AI advisory work with the agency didn’t turn out well. We had different ideas on the direction of the product, I didn’t want to increase my commitment and help them build it, and so we parted ways.

With whatever energy I could summon, I managed to start positioning myself as a fractional Chief Data & AI Officer, and began reaching out to prospects more systematically. However, things were slowing down with the end of the year, so nothing came of it at that point.

Looking at my notes and messages, at that point I still hadn’t fully let go of the software product path. This wasn’t particularly fruitful, as time was in short supply, and I was also exploring a startup idea with a former colleague. You really can’t do it all.

                                                                                                                                                                What did I learn from Q4 2023?

                                                                                                                                                                1. I can’t escape core elements of my identity and ethnicity. This applies in any work scenario, but is especially salient in a solo business.
                                                                                                                                                                2. Advancements in technology and education still haven’t countered human tribalism, including among the well-educated.
                                                                                                                                                                3. I need to manage my exposure to news and other distractions. Indeed, by the end of 2023, I was down to about one news check a week – enough to remain informed without getting too sucked in. I also strictly reduced my exposure to algorithmic feeds (especially LinkedIn – I’m not on Twitter, TikTok, or Instagram, and Facebook is boring).
                                                                                                                                                                4. When negotiating engagements, I need to position myself clearly. With the agency, they hoped I’d be “a pair of hands” rather than an advisor (despite how it was pitched to me). This led to disappointments on both sides.
                                                                                                                                                                5. Most people in Australia don’t care about Middle Eastern geopolitics. While some local politicians and news publishers use it to fire up their base, it has little to do with what’s happening here. This is hard to fathom if you’re over-exposed to the news and algorithmic outrage machines.

                                                                                                                                                                Q1 2024: Time off, some work, and the business of authority

                                                                                                                                                                Coming into the new year, it was time to move on from the mess that was Q4 – at least to the extent that the political reality would allow me. Indeed, the war has led to one added time constraint, as I went on a family trip to the US for a couple of weeks in January. I had also committed to a Reef Life Survey dive trip in March, which meant I took four weeks off in Q1. This wasn’t ideal given the stage I was at with the business, but not terrible either, given that one of my goals as a soloist is time flexibility.

Early in the year, I still wasn’t fully committed to the business. I had even considered going full-time for the right role and applied to a few jobs. However, I didn’t proceed when I heard back, which meant it was mostly wasted time.

On the non-wasted-time front, I started becoming more systematic with my outreach and lead generation. Back in Q4, I started with cold rather than warm outreach, with lacklustre results. In Q1, I got more into reactivating my existing network, and experimented with building an outreach system inspired by Atomic Habits. This worked much better. Among other things, it led to project work, fresh LinkedIn recommendations, increased confidence, and various leads. It was also kinda fun, though I’m still figuring out how to do it most efficiently.

                                                                                                                                                                When it comes to efficiency, one thing I resisted up until Q1 was publishing more. However, following Jonathan Stark and equipped with lessons learned from The Business of Authority podcast (co-hosted with Rochelle Moulton), I realised that publishing is the path to parasocial relationships. In short, it’s about building trust at scale with people who know much more about me than I know about them. To this end, I committed to posting here weekly when I’m not on holiday – what you’re reading now is a direct result of that! I also started getting more active on LinkedIn, albeit somewhat sporadically.

                                                                                                                                                                What did I learn from Q1 2024?

                                                                                                                                                                1. Letting fear get in the way of commitment to one path is silly. The job applications mostly served to allay my concern that I’m becoming unemployable, but they were a time sink. They also meant I wasn’t positioning well – to get a job you need to fit into a commodity box, which is the opposite of the positioning needed for a high-value expert.
                                                                                                                                                                2. Building trust and relationships is invaluable. One of the projects I did was with Orkestra – my former employer. It took 15 minutes to seal the deal with a fixed price. This is another instance of the Q2 2023 lesson from working with Reef Life Survey – scaling up to multiple clients with whom I have mutual trust is the key to high-value engagements.
3. Nurturing all your relationships can be helpful to your business. Everyone is busy, and nurturing should go beyond transactional activities, but Consulting Success mentions network reactivation as a key strategy for a reason. It actually works, as the people who care about you want to see you succeed – but they can’t help you if you don’t keep in touch and share what you’re up to. Shocking, I know.
                                                                                                                                                                4. Publishing consistently is key to trust at scale. In the words of a recent post by Stark, his marketing strategy is: “help people you like get what they want for free at scale”. Yes, it’s kinda cringe, and I don’t absolutely love everything I post, but I see it working on me and for me. And again, this post is a part of it – it may help people I like get to their solo biz or startup goals faster than me.
                                                                                                                                                                5. Prioritising time off is good for the soul. In my view, there’s little point in going down this business path if it doesn’t eventually lead to work that is more flexible, impactful, profitable, and satisfying than what I could achieve as an employee. This is also why I chose this path over cofounding a high-growth startup – I don’t want to overwork for at least 5-10 years in the hope of a future reward.

                                                                                                                                                                Q2 2024: The magic of commitment and focus

On return from my Reef Life Survey trip in April 2024, I had a choice to make: I could interview for a well-paying role with a big tech company (following interest from a recruiter), or I could keep going with the biz. Big tech interviews are a bit of a pain – I know I can pass them, but I also know they require a significant time investment. The problem was – once again – the potential for distraction.

Following some solid leads in early April, I decided to stick with the biz. Unfortunately, many of those leads ended up fizzling or going super-slowly through Q2. I’m targeting startups with my consulting so I can work closely with quick decision makers, but sometimes the best decision for them is to focus elsewhere rather than on using my expertise. Oh well.

                                                                                                                                                                Another major shift in my thinking happened in Q2, after taking Justin Welsh’s LinkedIn Operating System course. I initially dismissed Welsh as a LinkedIn influencer bro, but he grew on me – especially as his content just makes sense. The shift was fully “getting” the idea of a solo expertise business with multiple revenue streams. As with other shifts, it was partly experiential. I paid something like US$100-150 for his course on a whim, which is how he designed it. Given that he’s sold the course to over 20,000 people, you can figure out yourself how valuable such “whims” are to a soloist.

Now, I’m not going to start selling “how to LinkedIn” courses, but coming from the world of building complicated software and data solutions, I have a silly bias against finding “low tech” ways of providing value. And that is the thing that took me embarrassingly long to internalise: “helping people you like get what they want” can take many forms. It can be a high-touch fractional Chief Data & AI Officer engagement with a climate tech startup, or it can simply be pointing people who ask me about solo consulting to this post. And it can be anything in between – even if it doesn’t perfectly align with my high-touch mission of helping climate tech startups.

                                                                                                                                                                For example, given my new mindset of low-touch monetisation, I decided to look for an appropriate affiliate for my established-remote company list. This list gets 100-200 unique visitors on a normal day, with some mad spikes (e.g., almost 10,000 visitors two weeks ago). A quick search led me to Remote Rocketship, so I signed up and added the affiliate link to the list and to my long and popular rant post on automated coding assessments. To date, those links have generated a bit over US$100, which is far from life-changing. But given the low level of effort required to add the links, the effective hourly rate on that revenue stream is going to keep rising.
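To make the “effective hourly rate” point concrete, here’s a minimal sketch of the arithmetic. The revenue figure comes from above, while the roughly one hour of effort and the future revenue milestones are assumptions for illustration only:

```python
# Effective hourly rate of a set-and-forget revenue stream: the effort is a one-off,
# while revenue keeps accruing, so the rate keeps rising over time.
# The one hour of effort and the revenue milestones are illustrative assumptions.
hours_of_effort = 1.0  # assumed time spent adding the affiliate links

for revenue_to_date in (100, 200, 500):  # cumulative revenue in US$
    rate = revenue_to_date / hours_of_effort
    print(f"US${revenue_to_date} earned -> US${rate:.0f} per hour of effort")
```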

                                                                                                                                                                Another key change I implemented in Q2 was committing to the solo expertise business more wholeheartedly. In a sense, I consider April 2024 to be the true start of the business, after a year of messing around. A key part of this commitment was revamping my LinkedIn profile and sharpening my positioning. In the spirit of Stark’s laser-focused positioning statement, this is more or less where I stand:

                                                                                                                                                                I’m a Data & AI expert who helps climate & nature tech startups with shipping data-intensive solutions (advanced analytics / AI/ML). Unlike my competitors, I have over a decade of experience working with startups & scaleups across data science, engineering, and tech leadership roles – including close collaborations with product & marketing teams, and a track record of respectfully communicating inconvenient truths to CEOs and other authority figures.

                                                                                                                                                                Following Welsh’s advice, I see my LinkedIn profile as my top-of-funnel landing page, so it looks like this now:

[Screenshot of my revamped LinkedIn profile]

Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

diff --git a/2024/08/12/keep-learning-your-career-is-never-truly-done/index.html b/2024/08/12/keep-learning-your-career-is-never-truly-done/index.html

                                                                                                                                                                  Keep learning: Your career is never truly done

                                                                                                                                                                  illustration of the career journey described in the podcast

                                                                                                                                                                  If there’s one thing that wasn’t that clear to me when I was younger, it’s that your career is never truly done (while you’re still alive and well). For example, as a PhD student, I felt like I’d be “done” once I graduated. But that wasn’t the case, and it still isn’t the case 12 years later. I just find new things to do.

                                                                                                                                                                  In the spirit of never done, I recently joined Eli Gündüz of Careersy Coaching on his podcast to talk about my career journey. While I’m more comfortable with writing than with unscripted public speaking, it was good to practise the latter, and I didn’t waffle too much. You can listen to the chat on Spotify or below:

                                                                                                                                                                  In the spirit of the times, here’s an AI-generated summary of the top ten takeaways from the chat, with some minor edits for clarity:

                                                                                                                                                                  1. Career Journey: Yanir, originally from Israel, moved to Australia and transitioned from software engineering to data science, eventually becoming an independent consultant focused on helping startups ship data-intensive solutions.
                                                                                                                                                                  2. Data Science Evolution: Yanir noted how the field of data science has evolved, especially since 2012, when it gained popularity. He also discussed the changing terminologies and how roles like AI and machine learning engineering have become more mainstream.
                                                                                                                                                                  3. Educational Background: Yanir noted how his PhD helped him move into data science, as an unintended result of wanting to work on interesting things.
                                                                                                                                                                  4. Internship Experience: They discussed how Yanir secured an internship at Google Sydney during his PhD, which helped his career by providing industry experience alongside academic research.
                                                                                                                                                                  5. Consulting Approach: Yanir shared his approach to consulting, focusing on providing advisory services and helping startups navigate the complexities of data and AI, especially in the climate tech and energy transition sectors.
                                                                                                                                                                  6. Tools vs. Concepts: Yanir stressed the importance of focusing on underlying concepts rather than specific tools, as tools in the data science industry constantly change, while core concepts are evergreen.
                                                                                                                                                                  7. Challenges in Data Science Recruitment: The discussion highlighted the difficulties in recruiting and defining roles in data science due to the broad and overlapping responsibilities of various titles like data scientist, data engineer, and machine learning engineer.
                                                                                                                                                                  8. AI in Recruitment: The conversation touched on the impact of AI in recruitment, discussing how AI can be used to streamline candidate selection but also the potential biases and limitations of relying solely on AI-driven processes.
9. Independent Consulting: Yanir reflected on his decision to leave a stable job to pursue independent consulting, driven by a desire to focus on climate-related projects and have more control over his work.
                                                                                                                                                                  10. Personal Branding and LinkedIn: Yanir touched on the importance of personal branding, particularly on LinkedIn, where he’s been actively posting and refining his profile to attract clients and opportunities in the consulting space.

                                                                                                                                                                  As noted, I’m more comfortable with writing, so here are some pointers to posts from the past decade that cover similar ground to the conversation:

                                                                                                                                                                  If you found the chat useful or have any questions, please feel free to get in touch.


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

diff --git a/2024/08/19/giving-up-on-the-minimum-viable-data-stack/index.html b/2024/08/19/giving-up-on-the-minimum-viable-data-stack/index.html

                                                                                                                                                                    Giving up on the minimum viable data stack

MAD landscape 2024. 2000+ logos and expanding. Source: https://mattturck.com/mad2024/

                                                                                                                                                                    Back in February, I had an idea for a series of posts on a startup’s minimum viable data stack. The motivation was to provide advice that’d help startups navigate the ever-expanding MAD (Machine Learning, AI, and Data) landscape, which as of 2024 includes over 2,000 logos. I was also aiming to deepen my knowledge through research and writing for the series.

                                                                                                                                                                    Since then, I’ve decided to drop the idea because:

                                                                                                                                                                    1. Specific constraints are key. A startup’s decisions on hiring, infrastructure, and requirements constrain what’s minimally viable in their case.
                                                                                                                                                                    2. MAD is a moving target. The MAD landscape is a moving target even when restricted to “minimum viable” data stack components.
                                                                                                                                                                    3. Deep research requires real-world experimentation. I was planning to use every tool as part of my research, but this is both time-consuming and shallow. I simply don’t have the capacity to use every tool to the level it’d be used by a full-time team (no single person does).

                                                                                                                                                                    I’ll illustrate these issues through a question I advised on recently:

                                                                                                                                                                    Should a startup use Snowflake, Databricks, or a collection of AWS products at the core of their Data & AI stack?

                                                                                                                                                                    (1) Specific constraints are key

The question is constrained to three top-level options. This was informed by the company’s choice of AWS as their cloud platform. While Snowflake and Databricks can run on any of the three major cloud providers, throwing Google Cloud or Azure into the mix would be foolish.

                                                                                                                                                                    Indeed, when the time comes to get serious about Data & AI, it’s often the case that many infrastructure decisions have already been made. These decisions introduce constraints – which is a Good Thing. Such constraints narrow down the search through the MAD landscape.

                                                                                                                                                                    The question also implies that the company would benefit from a cloud lakehouse solution. This isn’t always the case, especially in a minimum viable setting! Sometimes, spreadsheets are good enough. In other cases, the increasingly-popular DuckDB can serve analytical needs. Determining what’s minimally viable requires understanding the specific context.
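To illustrate the DuckDB option, here’s a minimal sketch of answering an analytical question straight from a CSV file, without standing up a warehouse. The file name and columns are made up for the example, and pandas is assumed to be installed for the dataframe output:

```python
# A minimal sketch of serving analytical needs with DuckDB instead of a warehouse.
# The CSV path and column names are hypothetical; pandas is needed for fetchdf().
import duckdb

con = duckdb.connect()  # in-memory database; pass a file path to persist it
monthly_signups = con.execute(
    """
    SELECT date_trunc('month', signup_date) AS month,
           count(*) AS signups
    FROM read_csv_auto('signups.csv')
    GROUP BY 1
    ORDER BY 1
    """
).fetchdf()
print(monthly_signups)
```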

                                                                                                                                                                    Finally, the context also includes current staff capabilities and plans for hiring. Continuing with our example, setting up Databricks or cobbling together AWS products requires more data engineering expertise than getting started with Snowflake (at least in 2024). The preferred choice has to be informed by team expertise, which limits the usefulness of general advice posts.

                                                                                                                                                                    (2) MAD is a moving target

                                                                                                                                                                    One Redditor distilled the question of Snowflake versus Databricks in February 2023 to a choice between: "(1) a product made to be a cloud RDBMS-style SQL engine focused on ELT and data collaboration, which is now adding data engineering workloads as a bolt-on; or (2) a product made to be a datalake-based Spark engine for distributed computations and engineering/data science workloads, which is adding a SQL database as a bolt-on."

                                                                                                                                                                    The collection of AWS offerings is also ever-evolving, but anyone who’s used AWS products deeply knows that they’re not all equal in quality. For example, I was wondering whether AWS Redshift has caught up with Snowflake and Databricks, given that it now includes a serverless option. However, Redshift still isn’t getting much love – both from personal discussions with experts and from Reddit comments like: “We use Redshift extensively, and I would take my chances and pick an unknown solution over Redshift at this point”.

                                                                                                                                                                    Check again next year and things might change. General recommendations based on current capabilities will get old pretty quickly.

                                                                                                                                                                    (3) Deep research requires real-world experimentation

                                                                                                                                                                    Vendors tend to paint a rosy picture of the everyday reality of setting up, managing, and using the products they’re selling. For complex tools, going through a basic setup on a sample project isn’t enough to gain the deep understanding that comes from real-world deployments. For an illustration of the practical knowledge gained by making many such decisions, check out the post (almost) every infrastructure decision I endorse or regret after 4 years running infrastructure at a startup.

                                                                                                                                                                    Further, when it comes to Data & AI platforms, there are multiple users with different views. For example, how an analyst interacts with the platform is different from how a data engineer interacts with it. My original thought of trying to cover all bases in a post series was just too ambitious.

As a shortcut to gaining deep experience with all the tools (an impossible endeavour), I opt for a combination of desk research and consulting experts. Returning to the running example, this sort of research means that I can now speak more confidently about the trade-offs between choosing Snowflake, Databricks, and AWS tools in 2024. The short answer to the question of which Data & AI platform to choose is a big IT DEPENDS (and it will keep changing).

                                                                                                                                                                    Beyond the tools: People, Platform, and Processes

                                                                                                                                                                    Tools cover only one aspect of making technology decisions – the Platform. People and Processes are no less important. Keeping the Platform constant, some companies will succeed, and some will fail.

                                                                                                                                                                    In general, the best tools are those that your People already know, or are committed to using well. Given the richness of the MAD landscape and software tooling in general, making reversible decisions and choosing boring, well-supported components where possible is the way to go.

                                                                                                                                                                    While I’ve abandoned the idea of posting about each component in the minimum viable data stack, I still advocate for the core concepts: Work back from business needs, set up the minimal Platform that makes sense in your case, and ensure that your People and Processes are making the most out of the chosen Platform. Then keep iterating.

                                                                                                                                                                    Good luck out there!


Public comments are closed, but I love hearing from readers. Feel free to contact me with your thoughts.

diff --git a/2024/08/26/ai-hype-ai-bullshit-and-the-real-deal/index.html b/2024/08/26/ai-hype-ai-bullshit-and-the-real-deal/index.html

                                                                                                                                                                      AI hype, AI bullshit, and the real deal

                                                                                                                                                                      Alice in the Sparkly AI Wonderland (generated by Meta AI)

                                                                                                                                                                      Are you tired of ✨ AI buttons everywhere?

                                                                                                                                                                      Have you had enough of false AI 💫 promises by vendors and grifters?

                                                                                                                                                                      Well, it might be tempting to dismiss AI as a fad and become a grumpy AI cynic.

                                                                                                                                                                      Don’t do that!

                                                                                                                                                                      As with any hype wave, you need to discern between marketing bullshit and the real deal.

                                                                                                                                                                      Here’s how I think about it.

                                                                                                                                                                      AI hype

                                                                                                                                                                      hype (noun): extravagant or intensive publicity or promotion.

                                                                                                                                                                      I doubt anyone would challenge the claim that we’ve been witnessing AI hype over the past two years. The above definition perfectly describes the situation: The collection of technologies named AI is receiving intensive publicity and promotion.

                                                                                                                                                                      While publicity doesn’t always imply utility, hype doesn’t diminish the actual value of AI. The value of the tech is real – but it needs to be wielded wisely.

                                                                                                                                                                      Regardless of value, hype tends to follow what Andrew Chen called The Law of Shitty Clickthroughs: “Over time, all marketing strategies result in shitty clickthrough rates”.

With ubiquitous ✨ AI buttons and seemingly every tech company talking about its AI solutions, the novelty of AI promises is quickly wearing off.

                                                                                                                                                                        Juggling delivery, admin, and leads: Monthly biz recap

                                                                                                                                                                        winter day on a boat off the east of Moreton Island

                                                                                                                                                                        Most of my month wasn’t this calm.

Last month, I posted an overview of my first-year lessons from building a solo expertise business. I plan to write on this topic every month, leading to shorter and more timely posts.

                                                                                                                                                                        As a reminder, solo expertise business means: (1) solo – no employees other than myself; (2) expertise – the value comes from packaging my skills and experience; and (3) business – value capture isn’t limited to selling my labour. I believe that some of what I’m learning would be useful to other business owners, as the lines are blurry between my type of business and early-stage product startups.

                                                                                                                                                                        This post covers five highlights from August 2024:

                                                                                                                                                                        1. Discovery package delivery
                                                                                                                                                                        2. Fractional cash flow and potential product direction
                                                                                                                                                                        3. Company admin rabbit holes
                                                                                                                                                                        4. Political distractions
                                                                                                                                                                        5. Opportunity flow and time constraints

                                                                                                                                                                        Discovery package delivery

One path towards high profitability is through value pricing, i.e., setting prices for offerings based on the value to the client rather than based on costs. When it comes to selling services, the main cost is my time, so the idea is to move away from hourly billing.
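
As a purely hypothetical illustration (these aren't figures from an actual engagement), here's the arithmetic behind moving from hourly billing to value pricing:

```python
# Purely illustrative numbers, not from any real engagement.
hours, hourly_rate = 30, 300                   # time-based billing
value_to_client, value_share = 200_000, 0.10   # value-based pricing

print(f"Hourly billing: ${hours * hourly_rate:,}")               # $9,000
print(f"Value pricing:  ${value_to_client * value_share:,.0f}")  # $20,000
```

The point isn't the specific numbers; it's that the price ceases to be a function of the hours spent.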

                                                                                                                                                                          Data, AI, humans, and climate: Carving a consulting niche

                                                                                                                                                                          cover image of the Data Dojo podcast episode with Yanir Seroussi

                                                                                                                                                                          I recently joined Adam Massey on the Data Dojo podcast to chat about my career journey and consulting focus. You can check it out on YouTube:

                                                                                                                                                                          Episode summary

                                                                                                                                                                          AI-generated summaries are getting better, as is the ease of generating them. For the following summary, I: (1) expanded the transcript on YouTube; (2) copied the entire text from the YouTube page (not just the transcript); and (3) threw it into Google AI Studio (with Gemini 1.5 Pro). I didn’t even bother prompting the model – it figured out I wanted a summary by itself.
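
If you'd rather script this step than paste text into the AI Studio UI, a rough programmatic equivalent is sketched below. This is an illustration rather than what I actually did: it assumes the google-generativeai Python package, a GOOGLE_API_KEY environment variable, and that the copied page text has been saved to a local file.

```python
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-pro")

# Hypothetical file holding the text copied from the YouTube page.
with open("youtube_page_text.txt") as f:
    page_text = f.read()

# No explicit instruction, mirroring the AI Studio experiment above;
# prepending "Summarise the following:" would be a cheap safeguard.
response = model.generate_content(page_text)
print(response.text)
```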

                                                                                                                                                                          Key Takeaways:

                                                                                                                                                                          • Bridging the Gap: Startups often struggle to bridge the gap between data science theory and practical implementation.
                                                                                                                                                                          • Managing Expectations: Over-excitement about AI’s potential often leads to disappointment. Realistic goals and iterative project scoping are crucial.
                                                                                                                                                                          • Data Quality: As always, data quality remains paramount. Startups must ensure their ambitions align with available data.
                                                                                                                                                                          • Communication is Key: Success hinges on effective communication with stakeholders, both internally and externally.
                                                                                                                                                                          • Niche Expertise: Focusing on a niche, like climate tech, helps build a strong brand and attract relevant clients.
                                                                                                                                                                          • AI’s Evolving Landscape: AI tools are rapidly evolving, demanding adaptability and a willingness to learn new technologies.
                                                                                                                                                                          • Human Skills Remain Essential: Technical expertise alone isn’t enough. Strong communication, business acumen, and stakeholder management remain crucial.
                                                                                                                                                                          • Future of AI: While predicting the future is difficult, AI will likely continue to automate tasks and transform industries in unforeseen ways.

                                                                                                                                                                          Actionable Insights:

                                                                                                                                                                          • For data professionals:
                                                                                                                                                                            • Stay updated on new tools and technologies.
                                                                                                                                                                            • Focus on communication and business skills.
                                                                                                                                                                            • Learn to effectively manage stakeholder expectations.
                                                                                                                                                                          • For business owners:
                                                                                                                                                                            • Carefully assess your data and AI readiness.
                                                                                                                                                                            • Start with small, achievable projects.
                                                                                                                                                                            • Partner with experts to bridge the skills gap.

                                                                                                                                                                          Overall:

                                                                                                                                                                          The episode offers valuable insights into the challenges and opportunities presented by AI, particularly for startups in the climate and nature tech space. It emphasises the importance of a balanced approach, combining technical expertise with human skills and a focus on delivering tangible business value.


Note to self: I should speak more slowly and in complete sentences. I definitely write better than I speak (especially off the cuff), but that's a good reason to keep practising. 😉

                                                                                                                                                                            In praise of inconsistency: Ditching weekly posts

                                                                                                                                                                            a little hobgoblin at a little laptop

                                                                                                                                                                            A foolish consistency is the hobgoblin of little minds.

                                                                                                                                                                            Earlier this year, I decided to start posting more consistently to my website and mailing list. I’ve pretty much stuck to a weekly posting schedule, partly because I enjoy writing and partly as a way of exploring and sharing ideas. However, the problem with feeling like I have to post weekly is that it doesn’t always do justice to the topics I want to explore.

Meanwhile, the past few weeks have been pretty busy, so I've zoomed out to reconsider my publishing activities.

                                                                                                                                                                              Don't build AI, build with AI

                                                                                                                                                                              A robotic intern points towards a glowing AI God surrounded by cogs

Drew Breunig divides AI use cases into Gods, Interns, and Cogs:

                                                                                                                                                                              1. Gods: Super-intelligent, artificial entities that do things autonomously.
                                                                                                                                                                              2. Interns: Supervised copilots that collaborate with experts, focusing on grunt work.
                                                                                                                                                                              3. Cogs: Functions optimized to perform a single task extremely well, usually as part of a pipeline or interface.

Each category comes with different costs.

                                                                                                                                                                              This categorisation is useful when considering startup and product ideas, and your use of AI:

                                                                                                                                                                              1. Don’t build AI Gods, unless you have access to ungodly amounts of capital. This may be obvious to most people, but I still see pitches for model-centric startups, and God-building model companies are the focus of much AI hype.
                                                                                                                                                                              2. Build with AI Interns, as they can significantly increase your productivity. Interns include writing assistants, meeting transcribers, and programming copilots. Ignoring the wealth of AI interns is foolish for individuals and companies alike.
                                                                                                                                                                              3. Don’t build complex AI Interns, unless you have a use case that justifies the costs and risks. For example, Bloomberg’s multi-million dollar model was outperformed by GPT-4 a few months after its release, and it’s unclear whether it ended up powering any features.
4. Build with and build AI Cogs, but ensure you can manage non-deterministic components in production (see the sketch right after this list). This includes anything from traditional machine learning to the plethora of AI tasks that have become commodified in recent years, like object recognition and text summarisation.
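
As a concrete, entirely hypothetical illustration of the last point (mine, not from Breunig's article), one common pattern is to validate a cog's output against what downstream steps expect, retry a bounded number of times, and fall back to a safe default:

```python
import random

ALLOWED_LABELS = {"positive", "negative", "neutral"}

def classify_sentiment(text: str) -> str:
    """Stand-in for a non-deterministic cog (e.g., a prompted LLM call).
    It occasionally returns a label outside the allowed set."""
    return random.choice(sorted(ALLOWED_LABELS) + ["I think it's positive!"])

def robust_cog(text: str, retries: int = 3, fallback: str = "neutral") -> str:
    """Validate the cog's output, retrying a bounded number of times and
    falling back to a safe default so downstream steps never break."""
    for _ in range(retries):
        label = classify_sentiment(text).strip().lower()
        if label in ALLOWED_LABELS:
            return label
    return fallback

print(robust_cog("Loved the new release!"))
```

The same wrapping pattern applies whether the cog is a prompted LLM, a third-party API, or a traditional model that occasionally produces low-confidence output.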

                                                                                                                                                                              Above all, start by defining the problem and assessing the impact rather than making AI use your goal. Focusing on problems and solution impact is robust to hype cycles. I’ve considered this focus to be the hardest problem in data science since at least 2015. Amazon data scientist Dzidas Martinaitis has recently captured a similar sentiment in his flowchart for data science projects. Similarly, Douglas Gray and Evan Shellshear have found that data science and AI projects typically fail due to issues with strategy and process, rather than tech and people shortfalls.

                                                                                                                                                                              Ignore at your own risk.


                                                                                                                                                                              Acknowledgement: This post was produced with the help of AI interns. One version of Gemini produced the cover image, while another made helpful suggestions on earlier drafts.

                                                                                                                                                                                Data moats, stealthy AI, and more: AI Con 2024 notes

                                                                                                                                                                                image showcasing some of the speakers from AI Con 2024

                                                                                                                                                                                Earlier this month, I was delighted to participate in a panel discussion as part of Sydney’s AI Con 2024. I felt like a bit of an impostor as I found myself on the cover banner alongside truly accomplished speakers like Cassie Kozyrkov and Mike Bewley, but it all went smoothly.

                                                                                                                                                                                This post summarises five themes from the conference:

                                                                                                                                                                                • Data is the moat (but data scientist is a somewhat toxic term)
                                                                                                                                                                                • Stealthy AI use may be the wisest approach
                                                                                                                                                                                • Chatty is a UX revolution – model building is dead
                                                                                                                                                                                • Testing is key
                                                                                                                                                                                • Business and product fundamentals remain unchanged

                                                                                                                                                                                The talks weren’t recorded, so I’m basing this post on rereading my notes a few weeks later. Also, some of my takeaways draw on conversations from the best part of any conference: The Hallway Track. Therefore, I apologise if I misquoted any speaker, and I’d be happy to update the post if needed.

                                                                                                                                                                                Data is the moat (but data scientist is a somewhat toxic term)

As I’ve been banging on about how data is still key to success in the GenAI world, it was great to feed my confirmation bias.
