package kaun
Flax-inspired neural network library for OCaml
Module Kaun_datasets
Ready-to-use datasets for machine learning
This module provides popular datasets with built-in support for:
- Streaming and lazy loading (no OOM on large datasets)
- Automatic caching and prefetching
- Configurable preprocessing pipelines
- Efficient batching and shuffling
Vision Datasets
val mnist :
  ?train:bool ->
  ?flatten:bool ->
  ?normalize:bool ->
  ?data_format:[ `NCHW | `NHWC ] ->
  ?cache_dir:string ->
  unit ->
  (Bigarray.float32_elt Kaun.tensor * Bigarray.float32_elt Kaun.tensor)
    Kaun.Dataset.t
MNIST handwritten digits dataset. Returns a dataset of (images, labels) pairs.
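A minimal usage sketch, assuming hypothetical user-supplied eval_step and model functions; the optional flags shown are the ones documented above:

(* Sketch: flattened, normalized MNIST test split, batched for
   evaluation. [eval_step] and [model] are hypothetical user code. *)
let test_set =
  Kaun_datasets.mnist ~train:false ~flatten:true ~normalize:true ()
  |> Kaun.Dataset.batch 128
in
Kaun.Dataset.iter
  (fun (images, labels) -> eval_step model images labels)
  test_set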
val cifar10 :
  ?train:bool ->
  ?normalize:bool ->
  ?data_format:[ `NCHW | `NHWC ] ->
  ?augmentation:bool ->
  ?cache_dir:string ->
  unit ->
  (Bigarray.float32_elt Kaun.tensor * Bigarray.float32_elt Kaun.tensor)
    Kaun.Dataset.t
CIFAR-10 image classification dataset.
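A sketch of a training pipeline with augmentation enabled and channels-last layout; train_step and model are hypothetical user code:

(* Sketch: augmented CIFAR-10 training split in NHWC layout.
   [train_step] and [model] are hypothetical user code. *)
let train_set =
  Kaun_datasets.cifar10 ~train:true ~augmentation:true ~data_format:`NHWC ()
  |> Kaun.Dataset.shuffle ~buffer_size:50000
  |> Kaun.Dataset.batch 64
in
Kaun.Dataset.iter (fun (x, y) -> ignore (train_step model x y)) train_set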
val fashion_mnist :
  ?train:bool ->
  ?flatten:bool ->
  ?normalize:bool ->
  ?data_format:[ `NCHW | `NHWC ] ->
  ?cache_dir:string ->
  unit ->
  (Bigarray.float32_elt Kaun.tensor * Bigarray.float32_elt Kaun.tensor)
    Kaun.Dataset.t
Fashion-MNIST clothing classification dataset.
Text Datasets
val imdb :
  ?train:bool ->
  ?tokenizer:Kaun.Dataset.tokenizer ->
  ?max_length:int ->
  ?cache_dir:string ->
  unit ->
  (int array * Bigarray.float32_elt Kaun.tensor) Kaun.Dataset.t
IMDB movie review sentiment dataset. Returns (token_ids, labels) where labels are 0 (negative) or 1 (positive).
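A sketch that walks the raw elements; reading the scalar label with Kaun.Ops.to_float mirrors the Examples section below and is otherwise an assumption about the tensor API:

(* Sketch: print the token count and sentiment label of each review. *)
let reviews = Kaun_datasets.imdb ~train:false ~max_length:256 () in
Kaun.Dataset.iter
  (fun (token_ids, label) ->
    Printf.printf "tokens = %d, label = %f\n"
      (Array.length token_ids)
      (Kaun.Ops.to_float label))
  reviews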
val wikitext :
  ?dataset_name:[ `Wikitext2 | `Wikitext103 ] ->
  ?tokenizer:Kaun.Dataset.tokenizer ->
  ?sequence_length:int ->
  ?cache_dir:string ->
  unit ->
  (int array * int array) Kaun.Dataset.t
WikiText language modeling dataset. Returns (input_ids, target_ids) for next-token prediction.
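A sketch of consuming the pairs; lm_step and model are hypothetical user code, and the reading of target_ids as input_ids shifted by one position is an assumption based on the next-token-prediction framing:

(* Sketch: stream fixed-length WikiText-2 sequences for language
   modeling; [lm_step] and [model] are hypothetical user code. *)
let lm_data =
  Kaun_datasets.wikitext ~dataset_name:`Wikitext2 ~sequence_length:128 ()
in
Kaun.Dataset.iter
  (fun (input_ids, target_ids) -> lm_step model input_ids target_ids)
  lm_data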
Structured Data
val iris :
  ?normalize:bool ->
  ?train_split:float ->
  ?shuffle_seed:int ->
  unit ->
  (Bigarray.float32_elt Kaun.tensor * Bigarray.float32_elt Kaun.tensor)
    Kaun.Dataset.t
Iris flower classification dataset.
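A small sketch using a fixed seed for a reproducible shuffle; fit is a hypothetical user function:

(* Sketch: normalized Iris features with a deterministic shuffle. *)
let iris = Kaun_datasets.iris ~normalize:true ~shuffle_seed:42 () in
Kaun.Dataset.iter (fun (features, label) -> fit features label) iris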
val boston_housing :
  ?normalize:bool ->
  ?train_split:float ->
  unit ->
  (Bigarray.float32_elt Kaun.tensor * Bigarray.float32_elt Kaun.tensor)
    Kaun.Dataset.t
Boston housing price regression dataset.
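A regression-flavored sketch; regression_step and model are hypothetical user code:

(* Sketch: normalized housing features batched for a regression loop. *)
let housing =
  Kaun_datasets.boston_housing ~normalize:true ()
  |> Kaun.Dataset.batch 32
in
Kaun.Dataset.iter
  (fun (features, prices) -> regression_step model features prices)
  housing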
Dataset Utilities
Download a dataset file and optionally extract it. Returns the path to the downloaded/extracted data.
val train_test_split :
  ?test_size:float ->
  ?shuffle:bool ->
  ?seed:int ->
  'a Kaun.Dataset.t ->
  'a Kaun.Dataset.t * 'a Kaun.Dataset.t
Split a dataset into training and test sets.
Examples
(* Simple MNIST training loop *)
let dataset =
  Kaun_datasets.mnist ~train:true ()
  |> Kaun.Dataset.shuffle ~buffer_size:60000
  |> Kaun.Dataset.batch 32
  |> Kaun.Dataset.prefetch ~buffer_size:2
in
Kaun.Dataset.iter
  (fun (x_batch, y_batch) ->
    let loss = train_step model x_batch y_batch in
    Printf.printf "Loss: %f\n" (Kaun.Ops.to_float loss))
  dataset

(* Text classification with IMDB *)
let dataset =
  Kaun_datasets.imdb ~train:true ~max_length:256 ()
  |> Kaun_datasets.text_pipeline ~batch_size:16
       ~bucket_boundaries:[ 50; 100; 200 ]
in

(* Using train/test split *)
let all_data = load_custom_dataset () in
let train_data, test_data =
  Kaun_datasets.train_test_split ~test_size:0.2 all_data
in