Historical decline in WWW subdomain use?

Hi @gwern.

At the end of 2018, the HTTP Archive changed the source of the URLs from Alexa to CrUX. Alexa URLs (before Dec 2018) include the www by default (www.example.com), while the CrUX URLs do not (example.com and www.example.com are both valid origins). Since the initial URL has changed, we don’t have a reliable comparison before 2018. Some websites would redirect from www.example.com to example.com, while others would serve the same content on both domains or redirect to another subdomain altogether (en.example.com or m.example.com).

Looking at the data from Dec 2018 onwards:

client year total www pct
desktop 2022* 1,246,032 511,012 41.01%
desktop 2021 5,824,858 2,508,805 43.07%
desktop 2020 6,018,707 2,716,807 45.14%
desktop 2019 4,291,086 2,086,970 48.64%
desktop 2018 3,840,067 1,815,773 47.28%
mobile 2022* 1,613,142 651,794 40.41%
mobile 2021 7,957,652 3,350,268 42.10%
mobile 2020 7,157,942 3,203,779 44.76%
mobile 2019 5,181,871 2,490,108 48.05%
mobile 2018 1,247,333 286,662 22.98%

* 2022 is sampled at 10% of corpus

Hope that helps.


Query:

#standardSQL
# Count of domains which use 'www' subdomain

WITH data_2022 AS (
  SELECT
    _TABLE_SUFFIX AS client,
    "2022" AS year,
    COUNT(DISTINCT url) AS total,
    COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
  FROM
    `httparchive.pages.2022_12_01_*` TABLESAMPLE SYSTEM(10 PERCENT)
  GROUP BY
    client
),

data_2021 AS (
  SELECT
    _TABLE_SUFFIX AS client,
    "2021" AS year,
    COUNT(DISTINCT url) AS total,
    COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
  FROM
    `httparchive.pages.2021_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
  GROUP BY
    client
),

data_2020 AS (
  SELECT
    _TABLE_SUFFIX AS client,
    "2020" AS year,
    COUNT(DISTINCT url) AS total,
    COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
  FROM
    `httparchive.pages.2020_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
  GROUP BY
    client
),

data_2019 AS (
  SELECT
    _TABLE_SUFFIX AS client,
    "2019" AS year,
    COUNT(DISTINCT url) AS total,
    COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
  FROM
    `httparchive.pages.2019_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
  GROUP BY
    client
),

data_2018 AS (
  SELECT
    _TABLE_SUFFIX AS client,
    "2018" AS year,
    COUNT(DISTINCT url) AS total,
    COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
  FROM
    `httparchive.pages.2018_12_15_*` TABLESAMPLE SYSTEM(100 PERCENT)
  GROUP BY
    client
),

combined_data AS (
  SELECT
  client,
  year,
  www,
  total,
  ROUND(100 * www / total, 2) AS pct
  FROM
    data_2022
  UNION ALL
  SELECT
    client,
    year,
    www,
    total,
    ROUND(100 * www / total, 2) AS pct
  FROM
    data_2021
  UNION ALL
  SELECT
    client,
    year,
    www,
    total,
    ROUND(100 * www / total, 2) AS pct
  FROM
    data_2020
  UNION ALL
  SELECT
    client,
    year,
    www,
    total,
    ROUND(100 * www / total, 2) AS pct
  FROM
    data_2019
  UNION ALL
  SELECT
    client,
    year,
    www,
    total,
    ROUND(100 * www / total, 2) AS pct
  FROM
    data_2018
)

SELECT
  client,
  year,
  total,
  www,
  pct
FROM
  combined_data
ORDER BY
  client,
  year DESC
1 Like