Top 50 lang values over all pages from the 2019_04_01_desktop dataset:
#standardSQL
# WARNING! This query consumes 6.2 TB!
#
# Produces a single `lang` column holding the approximate 50 most frequent
# two-letter codes (lower-cased) captured after `lang=` inside an `<html ...>`
# tag of each response body, together with their counts.
WITH html_lang AS (
  SELECT
    # Case-insensitive match; the capture group is the first two ASCII letters
    # following `lang=` and an optional quote. Bodies without a match yield NULL.
    LOWER(REGEXP_EXTRACT(body, '(?i)<html[^>]*lang=[\'"]?([a-z]{2})')) AS lang_code
  FROM
    `httparchive.response_bodies.2019_04_01_desktop`
  WHERE
    # Keep only rows whose url equals page -- presumably the page's own
    # document rather than its subresources; confirm against the table schema.
    page = url
)
SELECT
  APPROX_TOP_COUNT(lang_code, 50) AS lang
FROM
  html_lang
Language | Lang | Count | Percent |
---|---|---|---|
English | en | 1571474 | 39.82% |
(no match) | | 1095890 | 27.77% |
Japanese | ja | 188711 | 4.78% |
Spanish | es | 160215 | 4.06% |
Russian | ru | 159318 | 4.04% |
French | fr | 112883 | 2.86% |
Portuguese | pt | 107759 | 2.73% |
German | de | 89616 | 2.27% |
Dutch | nl | 58122 | 1.47% |
Italian | it | 57720 | 1.46% |
Polish | pl | 51615 | 1.31% |
Korean | ko | 41660 | 1.06% |
Chinese | zh | 41632 | 1.05% |
Turkish | tr | 35342 | 0.90% |
Czech | cs | 26374 | 0.67% |
Hungarian | hu | 20690 | 0.52% |
Swedish | sv | 19802 | 0.50% |
Vietnamese | vi | 16591 | 0.42% |
Danish | da | 15176 | 0.38% |
Romanian | ro | 14313 | 0.36% |
Greek | el | 12478 | 0.32% |
Hebrew | he | 11886 | 0.30% |
Thai | th | 10843 | 0.27% |
Slovak | sk | 9838 | 0.25% |
Arabic | ar | 9682 | 0.25% |
Finnish | fi | 9658 | 0.24% |
Ukrainian | uk | 7696 | 0.20% |
Bulgarian | bg | 7676 | 0.19% |
Persian | fa | 6726 | 0.17% |
Indonesian | id | 6281 | 0.16% |
Norwegian Bokmål | nb | 5874 | 0.15% |
Lithuanian | lt | 5232 | 0.13% |
Croatian | hr | 4260 | 0.11% |
Norwegian | no | 4045 | 0.10% |
Serbian | sr | 3921 | 0.10% |
Slovenian | sl | 3546 | 0.09% |
Catalan | ca | 3421 | 0.09% |
Estonian | et | 3269 | 0.08% |
Latvian | lv | 2326 | 0.06% |
Icelandic | is | 1122 | 0.03% |
| | jp | 1018 | 0.03% |
| | us | 937 | 0.02% |
| | ua | 923 | 0.02% |
| | zx | 868 | 0.02% |
Bosnian | bs | 797 | 0.02% |
| | cz | 755 | 0.02% |
Georgian | ka | 710 | 0.02% |
Breton | br | 668 | 0.02% |
Malay | ms | 574 | 0.01% |
| | eu | 553 | 0.01% |