HuggingFaceFW/fineweb-2
A sparkling update with 1000s of languages
mlforge datasets pull HuggingFaceFW/fineweb-2
Dataset details
About HuggingFaceFW/fineweb-2
--- license: odc-by task_categories: - text-generation dataset_info: features: - name: text dtype: string - name: id dtype: string - name: dump dtype: string - name: url dtype: string - name: date dtype: string - name: file_path dtype: string - name: language dtype: string - name: language_score dtype: float64 - name: language_script dtype: string - name: minhash_cluster_size dtype: int64 - name: top_langs dtype: string - name: wordlist_ratio dtype: float64 language: - aai - aak - aau - aaz - aba - abi - abk - abn - abq - abs - abt - abx - aby - abz - aca - acd - ace - acf - ach - acm - acn - acr - acu - ada - ade - adh - adi - adj - adl - ady - adz - aeb - aer - aeu - aey - afr - agd - agg - agm - agn - agr - agt - agu - agw - agx - aha - ahk - aia - aii - aim - ain - ajg - aji - ajz - akb - ake - akh - akp - alj - aln - alp - alq - als - alt - aly - alz - ame - amf - amh - ami - amk - amm - amn - amp - amr - amu - amx - ang - anm - ann - anp - anv - any - aoi - aoj - aom - aoz - apb - apc - ape - apn - apr - apt - apu - apw - apy - apz - arb - are - arg - arl - arn - arp - arq - ars - ary - arz - asg - asm - aso - ast - ata - atb - atd - atg - ati - atj - atq - att - auc - aui - auy - ava - avk - avn - avt - avu - awa - awb - awx - ayo - ayp - ayr - azb - azg - azj - azz - bak - bam - ban - bao - bar - bas - bav - bba