Hi @gwern.
At the end of 2018, the HTTP Archive changed the source of the URLs from Alexa to CrUX. Alexa URLs (before Dec 2018) include the www
by default (www.example.com
), while the CrUX URLs do not (example.com
and www.example.com
are both valid origins). Since the initial URL has changed, we don’t have a reliable comparison before 2018. Some websites would redirect from www.example.com
to example.com
, while others would serve the same content on both domains or redirect to another subdomain altogether (en.example.com
or m.example.com
).
Looking at the data from Dec 2018 onwards:
client | year | total | www | pct |
---|---|---|---|---|
desktop | 2022* | 1,246,032 | 511,012 | 41.01% |
desktop | 2021 | 5,824,858 | 2,508,805 | 43.07% |
desktop | 2020 | 6,018,707 | 2,716,807 | 45.14% |
desktop | 2019 | 4,291,086 | 2,086,970 | 48.64% |
desktop | 2018 | 3,840,067 | 1,815,773 | 47.28% |
mobile | 2022* | 1,613,142 | 651,794 | 40.41% |
mobile | 2021 | 7,957,652 | 3,350,268 | 42.10% |
mobile | 2020 | 7,157,942 | 3,203,779 | 44.76% |
mobile | 2019 | 5,181,871 | 2,490,108 | 48.05% |
mobile | 2018 | 1,247,333 | 286,662 | 22.98% |
* 2022 is sampled at 10% of corpus
Hope that helps.
Query:
#standardSQL
# Count of domains which use 'www' subdomain
WITH data_2022 AS (
SELECT
_TABLE_SUFFIX AS client,
"2022" AS year,
COUNT(DISTINCT url) AS total,
COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
FROM
`httparchive.pages.2022_12_01_*` TABLESAMPLE SYSTEM(10 PERCENT)
GROUP BY
client
),
data_2021 AS (
SELECT
_TABLE_SUFFIX AS client,
"2021" AS year,
COUNT(DISTINCT url) AS total,
COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
FROM
`httparchive.pages.2021_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
GROUP BY
client
),
data_2020 AS (
SELECT
_TABLE_SUFFIX AS client,
"2020" AS year,
COUNT(DISTINCT url) AS total,
COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
FROM
`httparchive.pages.2020_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
GROUP BY
client
),
data_2019 AS (
SELECT
_TABLE_SUFFIX AS client,
"2019" AS year,
COUNT(DISTINCT url) AS total,
COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
FROM
`httparchive.pages.2019_12_01_*` TABLESAMPLE SYSTEM(100 PERCENT)
GROUP BY
client
),
data_2018 AS (
SELECT
_TABLE_SUFFIX AS client,
"2018" AS year,
COUNT(DISTINCT url) AS total,
COUNT(DISTINCT IF(REGEXP_CONTAINS(url, r"^https?://www\."), url, NULL)) AS www
FROM
`httparchive.pages.2018_12_15_*` TABLESAMPLE SYSTEM(100 PERCENT)
GROUP BY
client
),
combined_data AS (
SELECT
client,
year,
www,
total,
ROUND(100 * www / total, 2) AS pct
FROM
data_2022
UNION ALL
SELECT
client,
year,
www,
total,
ROUND(100 * www / total, 2) AS pct
FROM
data_2021
UNION ALL
SELECT
client,
year,
www,
total,
ROUND(100 * www / total, 2) AS pct
FROM
data_2020
UNION ALL
SELECT
client,
year,
www,
total,
ROUND(100 * www / total, 2) AS pct
FROM
data_2019
UNION ALL
SELECT
client,
year,
www,
total,
ROUND(100 * www / total, 2) AS pct
FROM
data_2018
)
SELECT
client,
year,
total,
www,
pct
FROM
combined_data
ORDER BY
client,
year DESC