Top 50 lang values over all pages from the 2019_04_01_desktop dataset:
#standardSQL
# WARNING! This query consumes 6.2 TB!
# Approximate top 50 values of the first two letters of the lang attribute
# found on the <html> tag of each selected response body.
WITH html_lang AS (
  SELECT
    # (?i) makes the pattern case-insensitive; LOWER() then folds the captured
    # two letters so that e.g. "EN" and "en" are counted as the same value.
    LOWER(REGEXP_EXTRACT(body, '(?i)<html[^>]*lang=[\'"]?([a-z]{2})')) AS lang_code
  FROM
    `httparchive.response_bodies.2019_04_01_desktop`
  WHERE
    # Keep only responses whose URL is the page URL itself
    # (presumably the page's main document -- confirm against the dataset schema).
    url = page
)
SELECT
  APPROX_TOP_COUNT(lang_code, 50) AS lang
FROM
  html_lang
| Language | Lang | Count | Percent |
|---|---|---|---|
| English | en | 1571474 | 39.82% |
| | *(none)* | 1095890 | 27.77% |
| Japanese | ja | 188711 | 4.78% |
| Spanish | es | 160215 | 4.06% |
| Russian | ru | 159318 | 4.04% |
| French | fr | 112883 | 2.86% |
| Portuguese | pt | 107759 | 2.73% |
| German | de | 89616 | 2.27% |
| Dutch | nl | 58122 | 1.47% |
| Italian | it | 57720 | 1.46% |
| Polish | pl | 51615 | 1.31% |
| Korean | ko | 41660 | 1.06% |
| Chinese | zh | 41632 | 1.05% |
| Turkish | tr | 35342 | 0.90% |
| Czech | cs | 26374 | 0.67% |
| Hungarian | hu | 20690 | 0.52% |
| Swedish | sv | 19802 | 0.50% |
| Vietnamese | vi | 16591 | 0.42% |
| Danish | da | 15176 | 0.38% |
| Romanian | ro | 14313 | 0.36% |
| Greek | el | 12478 | 0.32% |
| Hebrew | he | 11886 | 0.30% |
| Thai | th | 10843 | 0.27% |
| Slovak | sk | 9838 | 0.25% |
| Arabic | ar | 9682 | 0.25% |
| Finnish | fi | 9658 | 0.24% |
| Ukrainian | uk | 7696 | 0.20% |
| Bulgarian | bg | 7676 | 0.19% |
| Persian | fa | 6726 | 0.17% |
| Indonesian | id | 6281 | 0.16% |
| Norwegian Bokmål | nb | 5874 | 0.15% |
| Lithuanian | lt | 5232 | 0.13% |
| Croatian | hr | 4260 | 0.11% |
| Norwegian | no | 4045 | 0.10% |
| Serbian | sr | 3921 | 0.10% |
| Slovenian | sl | 3546 | 0.09% |
| Catalan | ca | 3421 | 0.09% |
| Estonian | et | 3269 | 0.08% |
| Latvian | lv | 2326 | 0.06% |
| Icelandic | is | 1122 | 0.03% |
| | jp | 1018 | 0.03% |
| | us | 937 | 0.02% |
| | ua | 923 | 0.02% |
| | zx | 868 | 0.02% |
| Bosnian | bs | 797 | 0.02% |
| | cz | 755 | 0.02% |
| Georgian | ka | 710 | 0.02% |
| Breton | br | 668 | 0.02% |
| Malay | ms | 574 | 0.01% |
| | eu | 553 | 0.01% |