For the Markup chapter of the Web Almanac I revisited this topic. Here are the latest results:
#standardSQL
# 03_01a: % of pages with deprecated elements
# containsDeprecatedElement(payload) -> BOOL
#   payload: JSON text of a page record. Its `_element_count` property is itself
#            a JSON-encoded string holding an object keyed by element name
#            (values are presumably per-page occurrence counts; they are not read).
#   Returns TRUE when at least one key is a deprecated element name, FALSE
#   otherwise. Any parsing problem (bad JSON, missing property, NULL payload)
#   also yields FALSE rather than an error.
CREATE TEMPORARY FUNCTION containsDeprecatedElement(payload STRING)
RETURNS BOOLEAN LANGUAGE js AS '''
  // Element names treated as deprecated; matching is exact and case-sensitive.
  var DEPRECATED = new Set([
    "applet", "acronym", "bgsound", "dir", "frame", "frameset", "noframes",
    "isindex", "keygen", "listing", "menuitem", "nextid", "noembed",
    "plaintext", "rb", "rtc", "strike", "xmp", "basefont", "big", "blink",
    "center", "font", "marquee", "multicol", "nobr", "spacer", "tt"
  ]);

  try {
    // Two-step decode: the outer payload first, then the nested JSON string.
    var page = JSON.parse(payload);
    var counts = JSON.parse(page._element_count);

    // Stop at the first element name that appears in the deprecated set.
    return Object.keys(counts).some(function (tag) {
      return DEPRECATED.has(tag);
    });
  } catch (err) {
    // Unparseable or incomplete payloads count as "no deprecated elements".
    return false;
  }
''';
# Per client (the wildcard table suffix), count the pages that contain at least
# one deprecated element and express that count as a percentage of all pages.
WITH page_flags AS (
  # Evaluate the JavaScript UDF exactly once per page; the aggregates below
  # reuse the resulting flag instead of repeating the UDF call.
  SELECT
    _TABLE_SUFFIX AS client,
    containsDeprecatedElement(payload) AS has_deprecated
  FROM
    `httparchive.pages.2019_07_01_*`
)
SELECT
  client,
  COUNTIF(has_deprecated) AS pages,
  # COUNT(0) counts every page row of the client, flagged or not.
  ROUND(COUNTIF(has_deprecated) * 100 / COUNT(0), 2) AS pct_pages
FROM
  page_flags
GROUP BY
  client
# Explicit ordering keeps the output stable from run to run.
ORDER BY
  pages DESC,
  client
client | pages | % of pages with deprecated elements |
---|---|---|
mobile | 804387 | 15.18% |
desktop | 705134 | 16.13% |
Top deprecated elements:
#standardSQL
# 03_01b: Top deprecated elements
# getElements(payload) -> ARRAY<STRING>
#   payload: JSON text of a page record. Its `_element_count` property is itself
#            a JSON-encoded string holding an object keyed by element name.
#   Returns the keys of that object, i.e. one entry per distinct element name
#   recorded for the page. Any parsing problem (bad JSON, missing property,
#   NULL payload) yields an empty array rather than an error.
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
  var tags;
  try {
    // Decode the outer payload, then the nested JSON string, and keep the keys.
    tags = Object.keys(JSON.parse(JSON.parse(payload)._element_count));
  } catch (err) {
    // Unparseable or incomplete payloads contribute no elements.
    tags = [];
  }
  return tags;
''';
# isDeprecated(element) -> BOOL
#   TRUE when `element` exactly matches (case-sensitively) one of the deprecated
#   element names listed below, FALSE for any other non-NULL name.
CREATE TEMPORARY FUNCTION isDeprecated(element STRING) AS (
  element IN UNNEST([
    "applet", "acronym", "bgsound", "dir", "frame", "frameset", "noframes",
    "isindex", "keygen", "listing", "menuitem", "nextid", "noembed",
    "plaintext", "rb", "rtc", "strike", "xmp", "basefont", "big", "blink",
    "center", "font", "marquee", "multicol", "nobr", "spacer", "tt"
  ])
);
# Rank deprecated elements by the number of pages they appear on, per client.
#   freq  = pages of that client containing the element
#   total = sum of freq over all deprecated elements of that client
#   pct   = freq as a percentage of total (a share of deprecated-element
#           occurrences, not a share of all pages)
WITH deprecated_counts AS (
  # One output row per (client, deprecated element) with its page count.
  # getElements() yields each element name at most once per page, so COUNT(0)
  # here counts pages.
  SELECT
    _TABLE_SUFFIX AS client,
    element AS deprecated,
    COUNT(0) AS freq
  FROM
    `httparchive.pages.2019_07_01_*`
  CROSS JOIN
    UNNEST(getElements(payload)) AS element
  WHERE
    isDeprecated(element)
  GROUP BY
    client,
    deprecated
)
SELECT
  client,
  deprecated,
  freq,
  SUM(freq) OVER (PARTITION BY client) AS total,
  ROUND(freq * 100 / SUM(freq) OVER (PARTITION BY client), 2) AS pct
FROM
  deprecated_counts
ORDER BY
  freq DESC,
  client
client | deprecated | freq | total | pct |
---|---|---|---|---|
mobile | center | 421571 | 1010479 | 41.72 |
mobile | font | 390929 | 1010479 | 38.69 |
desktop | center | 363308 | 887408 | 40.94 |
desktop | font | 350413 | 887408 | 39.49 |
mobile | marquee | 63378 | 1010479 | 6.27 |
desktop | marquee | 46620 | 887408 | 5.25 |
desktop | nobr | 31002 | 887408 | 3.49 |
mobile | nobr | 29247 | 1010479 | 2.89 |
mobile | big | 24996 | 1010479 | 2.47 |
desktop | big | 23030 | 887408 | 2.6 |
mobile | frame | 18649 | 1010479 | 1.85 |
mobile | frameset | 18387 | 1010479 | 1.82 |
desktop | frame | 17147 | 887408 | 1.93 |
desktop | frameset | 16902 | 887408 | 1.9 |
mobile | noframes | 14531 | 1010479 | 1.44 |
mobile | strike | 14346 | 1010479 | 1.42 |
desktop | strike | 14270 | 887408 | 1.61 |
desktop | noframes | 11060 | 887408 | 1.25 |
I’m kind of surprised to see such a big change since my last analysis: namely, `center` taking over the top spot from `font`, and `marquee` jumping up the list. This could be due to the corpus changes that happened between these times, with our dataset reaching more of the tail of the web. The methodologies are also a bit different: before, I was querying over the HTML itself with a regular expression, and now I’m using a custom metric that extracts each tag at runtime.
You can explore the full results in this sheet.