From 796bb37fb6ba593f4e7d891b7bef2154840e0128 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 12:07:11 +0100 Subject: [PATCH 1/8] Initial LZW implementation --- lib/dune | 5 ++ lib/lzw.ml | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/lzw.mli | 37 +++++++++++ 3 files changed, 228 insertions(+) create mode 100644 lib/lzw.ml create mode 100644 lib/lzw.mli diff --git a/lib/dune b/lib/dune index 3a66b7f..9d4b9d6 100644 --- a/lib/dune +++ b/lib/dune @@ -41,6 +41,11 @@ (public_name decompress.lzo) (modules lzo)) +(library + (name lzw) + (public_name decompress.lzw) + (modules lzw)) + (alias (name default) (package decompress)) diff --git a/lib/lzw.ml b/lib/lzw.ml new file mode 100644 index 0000000..99e0555 --- /dev/null +++ b/lib/lzw.ml @@ -0,0 +1,186 @@ +let io_buffer_size = 65535 + +module Io = struct + type src = unit -> bytes * int * int + type dst = (bytes * int * int) option -> unit +end + +module Dictionary : sig + type t + + val v : unit -> t + val lookup : t -> string -> int option + val add : t -> string -> unit +end = struct + type t = { tbl : (string, int) Hashtbl.t; mutable next_id : int } + + let v () = + let tbl = Hashtbl.create 128 in + for i = 0 to 255 do + Hashtbl.add tbl (String.make 1 (Char.chr i)) i + done; + { tbl; next_id = 257 } + + let lookup t = Hashtbl.find_opt t.tbl + + let add t v = + Hashtbl.add t.tbl v t.next_id; + t.next_id <- t.next_id + 1 +end + +module Reverse_dictionary : sig + type t + + val v : unit -> t + val lookup : t -> int -> bytes option + val add : t -> bytes -> unit +end = struct + type t = { tbl : (int, bytes) Hashtbl.t; mutable next_id : int } + + let v () = + let tbl = Hashtbl.create 128 in + for i = 0 to 255 do + Hashtbl.add tbl i (Bytes.make 1 (Char.chr i)) + done; + { tbl; next_id = 257 } + + let lookup t = Hashtbl.find_opt t.tbl + + let add t v = + Hashtbl.add t.tbl t.next_id v; + t.next_id <- t.next_id + 1 +end + +let ux_eoi = max_int (* End of input, outside unicode range. *) +let ux_soi = max_int - 1 (* Start of input, outside unicode range. *) + +type src = { + src : Io.src; + d : Dictionary.t; + mutable i : bytes; (* Current input chunk. *) + mutable i_pos : int; (* Next input position to read. *) + mutable i_max : int; (* Maximal input position to read. *) + mutable c : int; + mutable rbuf : Buffer.t; + mutable buf : Buffer.t; +} + +let badd d = Buffer.add_char d.rbuf (Char.chr d.c) + +let src src = + { + src; + d = Dictionary.v (); + i = Bytes.empty; + i_pos = max_int; + i_max = 0; + buf = Buffer.create 256; + rbuf = Buffer.create 8; + c = ux_soi; + } + +let refill d = + match d.src () with + | s, pos, len -> + d.i <- s; + d.i_pos <- pos; + d.i_max <- pos + len - 1 + +let rec readc d = + if d.i_pos > d.i_max then + if d.c = ux_eoi then () + else ( + refill d; + readc d) + else ( + d.c <- Char.code (Bytes.unsafe_get d.i d.i_pos); + d.i_pos <- d.i_pos + 1) + +type dst = { + dst : Io.dst; (* Output destination. *) + buff : Buffer.t; (* Scratch buffer. *) + mutable o : bytes; (* Current output chunk. *) + mutable o_pos : int; (* Next output position to write. *) + mutable o_max : int; (* Maximal output position to write. *) +} + +let dst ?(buf = Bytes.create io_buffer_size) dst = + let o_max = Bytes.length buf - 1 in + if o_max = 0 then invalid_arg "buf's length is empty" + else { dst; o = buf; buff = Buffer.create 128; o_pos = 0; o_max } + +let flush e ~stop = + if stop then ( + if e.o_pos <> 0 then e.dst (Some (e.o, 0, e.o_pos)); + e.dst None) + else e.dst (Some (e.o, 0, e.o_pos)); + e.o_pos <- 0 + +let rec writec e c = + if e.o_pos > e.o_max then ( + flush e ~stop:false; + writec e c) + else ( + Bytes.set_uint8 e.o e.o_pos c; + e.o_pos <- e.o_pos + 1) + +let compress src dst = + let scratch = Bytes.create 2 in + try + while true do + readc src; + let chr = src.c in + let s_old = Buffer.contents src.buf in + Buffer.add_uint8 src.buf chr; + let s = Buffer.contents src.buf in + match Dictionary.lookup src.d s with + | Some _ -> () + | None -> ( + Dictionary.add src.d s; + match Dictionary.lookup src.d s_old with + | None -> assert false + | Some c -> + Bytes.set_uint16_ne scratch 0 c; + writec dst (Bytes.get_uint8 scratch 0); + writec dst (Bytes.get_uint8 scratch 1); + Buffer.reset src.buf; + Buffer.add_uint8 src.buf chr) + done + with End_of_file -> ( + let s = Buffer.contents src.buf in + if String.length s = 0 then () + else + match Dictionary.lookup src.d s with + | None -> assert false + | Some c -> + Bytes.set_uint16_ne scratch 0 c; + writec dst (Bytes.get_uint8 scratch 0); + writec dst (Bytes.get_uint8 scratch 1); + flush dst ~stop:true) + +let r_uint16_be s = + readc s; + badd s; + readc s; + badd s; + Buffer.to_bytes s.rbuf |> fun b -> + Buffer.clear s.rbuf; + Bytes.get_uint16_ne b 0 + +let decompress src dst = + let d = Reverse_dictionary.v () in + let previous_string = ref Bytes.empty in + try + while true do + let code = r_uint16_be src in + match Reverse_dictionary.lookup d code with + | None -> failwith ("No code found for " ^ string_of_int code) + | Some v -> + Bytes.iter (fun c -> writec dst (Char.code c)) v; + if not (Int.equal (Bytes.length !previous_string) 0) then + Reverse_dictionary.add d + (Bytes.concat Bytes.empty + [ !previous_string; Bytes.make 1 (Bytes.unsafe_get v 0) ]); + previous_string := v + done + with End_of_file -> flush dst ~stop:true diff --git a/lib/lzw.mli b/lib/lzw.mli new file mode 100644 index 0000000..7d8a5e7 --- /dev/null +++ b/lib/lzw.mli @@ -0,0 +1,37 @@ +(** {1 Lempel–Ziv–Welch Compression} + + Lempel–Ziv–Welch (LZW) compression is a lossless data compression algorithm. It is + commonly used in the TIFF file format and is also the algorithm used in the Unix + file compression utility [compress]. + + This library provides a way to compress and decompress byte streams using LZW compression. +*) + +module Io : sig + type src = unit -> bytes * int * int + (** An IO source. This should raise [End_of_file] whenever the last bytes have been read. *) + + type dst = (bytes * int * int) option -> unit + (** An IO destination. *) +end + +type src +(** A source to compress or decompress. *) + +type dst +(** A destination to write a compressed or decompressed stream to. *) + +val src : Io.src -> src +(** [src io] creates a new source from an {! Io.src}. *) + +val dst : ?buf:bytes -> Io.dst -> dst +(** [dst io] creates a new destination from an {! Io.dst}. You can optionally + provide a pre-allocated internal buffer. *) + +val compress : src -> dst -> unit +(** [compress src dst] compresses the bytes coming from [src] + into [dst]. *) + +val decompress : src -> dst -> unit +(** [decompress src dst] decompresses the bytes coming from [src] into [dst]. *) + From 2c1c460c23673aef735bb078447136ca1528541d Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 12:40:27 +0100 Subject: [PATCH 2/8] Add small decoder fix --- lib/lzw.ml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/lzw.ml b/lib/lzw.ml index 99e0555..75e8c2b 100644 --- a/lib/lzw.ml +++ b/lib/lzw.ml @@ -34,6 +34,7 @@ module Reverse_dictionary : sig val v : unit -> t val lookup : t -> int -> bytes option val add : t -> bytes -> unit + val raw_add : t -> int -> bytes -> unit end = struct type t = { tbl : (int, bytes) Hashtbl.t; mutable next_id : int } @@ -49,6 +50,8 @@ end = struct let add t v = Hashtbl.add t.tbl t.next_id v; t.next_id <- t.next_id + 1 + + let raw_add t i v = Hashtbl.add t.tbl i v end let ux_eoi = max_int (* End of input, outside unicode range. *) @@ -173,14 +176,17 @@ let decompress src dst = try while true do let code = r_uint16_be src in + if Option.is_none (Reverse_dictionary.lookup d code) then + Reverse_dictionary.raw_add d code + (Bytes.cat !previous_string + (Bytes.make 1 (Bytes.unsafe_get !previous_string 0))); match Reverse_dictionary.lookup d code with | None -> failwith ("No code found for " ^ string_of_int code) | Some v -> Bytes.iter (fun c -> writec dst (Char.code c)) v; if not (Int.equal (Bytes.length !previous_string) 0) then Reverse_dictionary.add d - (Bytes.concat Bytes.empty - [ !previous_string; Bytes.make 1 (Bytes.unsafe_get v 0) ]); + (Bytes.cat !previous_string (Bytes.make 1 (Bytes.unsafe_get v 0))); previous_string := v done with End_of_file -> flush dst ~stop:true From 2c3add115b9cdb98a177712fde0762fac4f02de6 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 12:56:21 +0100 Subject: [PATCH 3/8] Prepare for levels --- lib/lzw.ml | 24 ++++++++++++++---------- lib/lzw.mli | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/lzw.ml b/lib/lzw.ml index 75e8c2b..9164d68 100644 --- a/lib/lzw.ml +++ b/lib/lzw.ml @@ -60,6 +60,7 @@ let ux_soi = max_int - 1 (* Start of input, outside unicode range. *) type src = { src : Io.src; d : Dictionary.t; + level : int; mutable i : bytes; (* Current input chunk. *) mutable i_pos : int; (* Next input position to read. *) mutable i_max : int; (* Maximal input position to read. *) @@ -70,9 +71,10 @@ type src = { let badd d = Buffer.add_char d.rbuf (Char.chr d.c) -let src src = +let src ?(level=16) src = { src; + level; d = Dictionary.v (); i = Bytes.empty; i_pos = max_int; @@ -102,15 +104,17 @@ let rec readc d = type dst = { dst : Io.dst; (* Output destination. *) buff : Buffer.t; (* Scratch buffer. *) + scratch : bytes; + level : int; mutable o : bytes; (* Current output chunk. *) mutable o_pos : int; (* Next output position to write. *) mutable o_max : int; (* Maximal output position to write. *) } -let dst ?(buf = Bytes.create io_buffer_size) dst = +let dst ?(level=16) ?(buf = Bytes.create io_buffer_size) dst = let o_max = Bytes.length buf - 1 in if o_max = 0 then invalid_arg "buf's length is empty" - else { dst; o = buf; buff = Buffer.create 128; o_pos = 0; o_max } + else { dst; o = buf; buff = Buffer.create 128; o_pos = 0; o_max; level; scratch = Bytes.create 2 } let flush e ~stop = if stop then ( @@ -127,8 +131,12 @@ let rec writec e c = Bytes.set_uint8 e.o e.o_pos c; e.o_pos <- e.o_pos + 1) +let w_uint16_be dst c = + Bytes.set_uint16_ne dst.scratch 0 c; + writec dst (Bytes.get_uint8 dst.scratch 0); + writec dst (Bytes.get_uint8 dst.scratch 1) + let compress src dst = - let scratch = Bytes.create 2 in try while true do readc src; @@ -143,9 +151,7 @@ let compress src dst = match Dictionary.lookup src.d s_old with | None -> assert false | Some c -> - Bytes.set_uint16_ne scratch 0 c; - writec dst (Bytes.get_uint8 scratch 0); - writec dst (Bytes.get_uint8 scratch 1); + w_uint16_be dst c; Buffer.reset src.buf; Buffer.add_uint8 src.buf chr) done @@ -156,9 +162,7 @@ let compress src dst = match Dictionary.lookup src.d s with | None -> assert false | Some c -> - Bytes.set_uint16_ne scratch 0 c; - writec dst (Bytes.get_uint8 scratch 0); - writec dst (Bytes.get_uint8 scratch 1); + w_uint16_be dst c; flush dst ~stop:true) let r_uint16_be s = diff --git a/lib/lzw.mli b/lib/lzw.mli index 7d8a5e7..7282285 100644 --- a/lib/lzw.mli +++ b/lib/lzw.mli @@ -21,10 +21,10 @@ type src type dst (** A destination to write a compressed or decompressed stream to. *) -val src : Io.src -> src +val src : ?level:int -> Io.src -> src (** [src io] creates a new source from an {! Io.src}. *) -val dst : ?buf:bytes -> Io.dst -> dst +val dst : ?level:int -> ?buf:bytes -> Io.dst -> dst (** [dst io] creates a new destination from an {! Io.dst}. You can optionally provide a pre-allocated internal buffer. *) From 3d15ca974318adabf17aecd3d2b0ecd777520931 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 14:45:25 +0100 Subject: [PATCH 4/8] Allow different levels --- lib/lzw.ml | 72 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/lib/lzw.ml b/lib/lzw.ml index 9164d68..144ea6f 100644 --- a/lib/lzw.ml +++ b/lib/lzw.ml @@ -5,6 +5,8 @@ module Io = struct type dst = (bytes * int * int) option -> unit end +let eof = 256 + module Dictionary : sig type t @@ -60,18 +62,17 @@ let ux_soi = max_int - 1 (* Start of input, outside unicode range. *) type src = { src : Io.src; d : Dictionary.t; - level : int; + level : int; (* Max bit size *) mutable i : bytes; (* Current input chunk. *) mutable i_pos : int; (* Next input position to read. *) mutable i_max : int; (* Maximal input position to read. *) mutable c : int; + mutable extra_bits : int * int; (* The bits left-aligned and how many *) mutable rbuf : Buffer.t; mutable buf : Buffer.t; } -let badd d = Buffer.add_char d.rbuf (Char.chr d.c) - -let src ?(level=16) src = +let src ?(level = 12) src = { src; level; @@ -82,6 +83,7 @@ let src ?(level=16) src = buf = Buffer.create 256; rbuf = Buffer.create 8; c = ux_soi; + extra_bits = (0, 0); } let refill d = @@ -109,12 +111,23 @@ type dst = { mutable o : bytes; (* Current output chunk. *) mutable o_pos : int; (* Next output position to write. *) mutable o_max : int; (* Maximal output position to write. *) + mutable pending : int * int; } -let dst ?(level=16) ?(buf = Bytes.create io_buffer_size) dst = +let dst ?(level = 12) ?(buf = Bytes.create io_buffer_size) dst = let o_max = Bytes.length buf - 1 in if o_max = 0 then invalid_arg "buf's length is empty" - else { dst; o = buf; buff = Buffer.create 128; o_pos = 0; o_max; level; scratch = Bytes.create 2 } + else + { + dst; + o = buf; + buff = Buffer.create 128; + o_pos = 0; + o_max; + level; + scratch = Bytes.create 2; + pending = (0, 0); + } let flush e ~stop = if stop then ( @@ -131,10 +144,19 @@ let rec writec e c = Bytes.set_uint8 e.o e.o_pos c; e.o_pos <- e.o_pos + 1) -let w_uint16_be dst c = - Bytes.set_uint16_ne dst.scratch 0 c; - writec dst (Bytes.get_uint8 dst.scratch 0); - writec dst (Bytes.get_uint8 dst.scratch 1) +let w_flush dst v = + while snd dst.pending >= v do + let output = fst dst.pending in + writec dst (output land 0xff); + let i = output lsr 8 in + dst.pending <- (i, snd dst.pending - 8) + done + +let w_level dst c = + let out, bits = dst.pending in + let output = out lor (c lsl bits) in + dst.pending <- (output, bits + dst.level); + w_flush dst 8 let compress src dst = try @@ -151,7 +173,7 @@ let compress src dst = match Dictionary.lookup src.d s_old with | None -> assert false | Some c -> - w_uint16_be dst c; + w_level dst c; Buffer.reset src.buf; Buffer.add_uint8 src.buf chr) done @@ -162,24 +184,32 @@ let compress src dst = match Dictionary.lookup src.d s with | None -> assert false | Some c -> - w_uint16_be dst c; + w_level dst c; + writec dst eof; flush dst ~stop:true) -let r_uint16_be s = - readc s; - badd s; - readc s; - badd s; - Buffer.to_bytes s.rbuf |> fun b -> - Buffer.clear s.rbuf; - Bytes.get_uint16_ne b 0 +let r_level s = + (* Make sure we have enough bytes to grab the data *) + while snd s.extra_bits < s.level do + readc s; + let pending_input = fst s.extra_bits in + let new_input = pending_input lor ((s.c land 0xFF) lsl snd s.extra_bits) in + s.extra_bits <- (new_input, snd s.extra_bits + 8) + done; + let pending_input = fst s.extra_bits in + let available = snd s.extra_bits in + let i = pending_input land lnot (lnot 0 lsl s.level) in + let r = pending_input lsr s.level in + let m = available - s.level in + s.extra_bits <- (r, m); + if i = eof then raise End_of_file else i let decompress src dst = let d = Reverse_dictionary.v () in let previous_string = ref Bytes.empty in try while true do - let code = r_uint16_be src in + let code = r_level src in if Option.is_none (Reverse_dictionary.lookup d code) then Reverse_dictionary.raw_add d code (Bytes.cat !previous_string From b5acccaf5ace8cf10c7b9f19b2b5850c34b95e55 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 15:47:41 +0100 Subject: [PATCH 5/8] Make codes variable length --- lib/lzw.ml | 43 ++++++++++++++++++++++++++++++++++++------- lib/lzw.mli | 12 ++++++------ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/lib/lzw.ml b/lib/lzw.ml index 144ea6f..38fd9cc 100644 --- a/lib/lzw.ml +++ b/lib/lzw.ml @@ -1,3 +1,6 @@ +(* This implementation relied on this excellent blog post + https://marknelson.us/posts/2011/11/08/lzw-revisited.html *) + let io_buffer_size = 65535 module Io = struct @@ -62,7 +65,10 @@ let ux_soi = max_int - 1 (* Start of input, outside unicode range. *) type src = { src : Io.src; d : Dictionary.t; - level : int; (* Max bit size *) + max_code : int; + mutable level : int; + mutable current_code : int; + mutable next_bump : int; mutable i : bytes; (* Current input chunk. *) mutable i_pos : int; (* Next input position to read. *) mutable i_max : int; (* Maximal input position to read. *) @@ -72,10 +78,10 @@ type src = { mutable buf : Buffer.t; } -let src ?(level = 12) src = +let src ?(max_code=65536) src = { src; - level; + level = 9; d = Dictionary.v (); i = Bytes.empty; i_pos = max_int; @@ -84,6 +90,9 @@ let src ?(level = 12) src = rbuf = Buffer.create 8; c = ux_soi; extra_bits = (0, 0); + next_bump = 512; + current_code = 256; + max_code; } let refill d = @@ -107,14 +116,17 @@ type dst = { dst : Io.dst; (* Output destination. *) buff : Buffer.t; (* Scratch buffer. *) scratch : bytes; - level : int; + max_code : int; + mutable level : int; (* Current code size *) + mutable current_code : int; + mutable next_bump : int; mutable o : bytes; (* Current output chunk. *) mutable o_pos : int; (* Next output position to write. *) mutable o_max : int; (* Maximal output position to write. *) mutable pending : int * int; } -let dst ?(level = 12) ?(buf = Bytes.create io_buffer_size) dst = +let dst ?(max_code=65536) ?(buf = Bytes.create io_buffer_size) dst = let o_max = Bytes.length buf - 1 in if o_max = 0 then invalid_arg "buf's length is empty" else @@ -124,7 +136,10 @@ let dst ?(level = 12) ?(buf = Bytes.create io_buffer_size) dst = buff = Buffer.create 128; o_pos = 0; o_max; - level; + level = 9; + current_code = 256; + next_bump = 512; + max_code; scratch = Bytes.create 2; pending = (0, 0); } @@ -156,7 +171,14 @@ let w_level dst c = let out, bits = dst.pending in let output = out lor (c lsl bits) in dst.pending <- (output, bits + dst.level); - w_flush dst 8 + w_flush dst 8; + if dst.current_code < dst.max_code then begin + dst.current_code <- dst.current_code + 1; + if dst.current_code == dst.next_bump then begin + dst.next_bump <- dst.next_bump * 2; + dst.level <- dst.level + 1 + end + end let compress src dst = try @@ -202,6 +224,13 @@ let r_level s = let r = pending_input lsr s.level in let m = available - s.level in s.extra_bits <- (r, m); + if s.current_code < s.max_code then begin + s.current_code <- s.current_code + 1; + if s.current_code = s.next_bump then begin + s.next_bump <- s.next_bump * 2; + s.level <- s.level + 1 + end + end; if i = eof then raise End_of_file else i let decompress src dst = diff --git a/lib/lzw.mli b/lib/lzw.mli index 7282285..e2e2d3e 100644 --- a/lib/lzw.mli +++ b/lib/lzw.mli @@ -21,12 +21,13 @@ type src type dst (** A destination to write a compressed or decompressed stream to. *) -val src : ?level:int -> Io.src -> src -(** [src io] creates a new source from an {! Io.src}. *) +val src : ?max_code:int -> Io.src -> src +(** [src io] creates a new source from an {! Io.src}. Max code size controls + the upper code that can be used. *) -val dst : ?level:int -> ?buf:bytes -> Io.dst -> dst -(** [dst io] creates a new destination from an {! Io.dst}. You can optionally - provide a pre-allocated internal buffer. *) +val dst : ?max_code:int -> ?buf:bytes -> Io.dst -> dst +(** [dst io] creates a new destination from an {! Io.dst}. Max code size controls + the upper code that can be used. *) val compress : src -> dst -> unit (** [compress src dst] compresses the bytes coming from [src] @@ -34,4 +35,3 @@ val compress : src -> dst -> unit val decompress : src -> dst -> unit (** [decompress src dst] decompresses the bytes coming from [src] into [dst]. *) - From ff00cd0c7557f99185f384db843c5dbf55d01113 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 16:34:05 +0100 Subject: [PATCH 6/8] Support bigstrings for lzw --- lib/lzw.ml | 83 ++++++++++++++++++++++++++++++++++++++--------------- lib/lzw.mli | 50 +++++++++++++++++++++++++------- 2 files changed, 100 insertions(+), 33 deletions(-) diff --git a/lib/lzw.ml b/lib/lzw.ml index 38fd9cc..81e2771 100644 --- a/lib/lzw.ml +++ b/lib/lzw.ml @@ -4,8 +4,39 @@ let io_buffer_size = 65535 module Io = struct - type src = unit -> bytes * int * int - type dst = (bytes * int * int) option -> unit + type bigstring = (char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t + + type _ src = + | Bytes : (unit -> bytes * int * int) -> bytes src + | Bigstring : (unit -> bigstring * int * int) -> bigstring src + + type _ dst = + | Bytes : ((bytes * int * int) option -> unit) -> bytes dst + | Bigstring : ((bigstring * int * int) option -> unit) -> bigstring dst + + let set_uint8 (type b) (v : b dst) (b : b) off value = match v with + | Bytes _ -> Bytes.set_uint8 b off value + | Bigstring _ -> Bigarray.Array1.unsafe_set b off (Char.unsafe_chr value) + + let unsafe_get (type b) (v : b src) (b : b) off = match v with + | Bytes _ -> Bytes.unsafe_get b off + | Bigstring _ -> Bigarray.Array1.unsafe_get b off + + let empty (type b) (v : b src) : b = match v with + | Bytes _ -> Bytes.empty + | Bigstring _ -> Bigarray.Array1.create Char C_layout 0 + + let create (type b) (v : b dst) i : b = match v with + | Bytes _ -> Bytes.create i + | Bigstring _ -> Bigarray.Array1.create Char C_layout i + + let length (type b) (v : b dst) (i : b) : int = match v with + | Bytes _ -> Bytes.length i + | Bigstring _ -> Bigarray.Array1.dim i + + let write (type b) (d : b dst) (v : (b * int * int) option) = match d with + | Bytes fn -> fn v + | Bigstring fn -> fn v end let eof = 256 @@ -62,14 +93,14 @@ end let ux_eoi = max_int (* End of input, outside unicode range. *) let ux_soi = max_int - 1 (* Start of input, outside unicode range. *) -type src = { - src : Io.src; +type 'a src = { + src : 'a Io.src; d : Dictionary.t; max_code : int; mutable level : int; mutable current_code : int; mutable next_bump : int; - mutable i : bytes; (* Current input chunk. *) + mutable i : 'a; (* Current input chunk. *) mutable i_pos : int; (* Next input position to read. *) mutable i_max : int; (* Maximal input position to read. *) mutable c : int; @@ -78,12 +109,12 @@ type src = { mutable buf : Buffer.t; } -let src ?(max_code=65536) src = +let src (type b) ?(max_code=65536) (src : b Io.src) = { src; level = 9; d = Dictionary.v (); - i = Bytes.empty; + i = Io.empty src; i_pos = max_int; i_max = 0; buf = Buffer.create 256; @@ -95,12 +126,18 @@ let src ?(max_code=65536) src = max_code; } -let refill d = - match d.src () with - | s, pos, len -> - d.i <- s; - d.i_pos <- pos; - d.i_max <- pos + len - 1 +let refill (type b) (d : b src) = + match d.src with + | Bytes fn -> + let s, pos, len = fn () in + d.i <- s; + d.i_pos <- pos; + d.i_max <- pos + len - 1 + | Bigstring fn -> + let s, pos, len = fn () in + d.i <- s; + d.i_pos <- pos; + d.i_max <- pos + len - 1 let rec readc d = if d.i_pos > d.i_max then @@ -109,25 +146,25 @@ let rec readc d = refill d; readc d) else ( - d.c <- Char.code (Bytes.unsafe_get d.i d.i_pos); + d.c <- Char.code (Io.unsafe_get d.src d.i d.i_pos); d.i_pos <- d.i_pos + 1) -type dst = { - dst : Io.dst; (* Output destination. *) +type 'a dst = { + dst : 'a Io.dst; (* Output destination. *) buff : Buffer.t; (* Scratch buffer. *) scratch : bytes; max_code : int; mutable level : int; (* Current code size *) mutable current_code : int; mutable next_bump : int; - mutable o : bytes; (* Current output chunk. *) + mutable o : 'a; (* Current output chunk. *) mutable o_pos : int; (* Next output position to write. *) mutable o_max : int; (* Maximal output position to write. *) mutable pending : int * int; } -let dst ?(max_code=65536) ?(buf = Bytes.create io_buffer_size) dst = - let o_max = Bytes.length buf - 1 in +let dst (type b) ?(max_code=65536) (dst : b Io.dst) ?(buf = Io.create dst io_buffer_size) () = + let o_max = Io.length dst buf - 1 in if o_max = 0 then invalid_arg "buf's length is empty" else { @@ -146,9 +183,9 @@ let dst ?(max_code=65536) ?(buf = Bytes.create io_buffer_size) dst = let flush e ~stop = if stop then ( - if e.o_pos <> 0 then e.dst (Some (e.o, 0, e.o_pos)); - e.dst None) - else e.dst (Some (e.o, 0, e.o_pos)); + if e.o_pos <> 0 then Io.write e.dst (Some (e.o, 0, e.o_pos)); + Io.write e.dst None) + else Io.write e.dst (Some (e.o, 0, e.o_pos)); e.o_pos <- 0 let rec writec e c = @@ -156,7 +193,7 @@ let rec writec e c = flush e ~stop:false; writec e c) else ( - Bytes.set_uint8 e.o e.o_pos c; + Io.set_uint8 e.dst e.o e.o_pos c; e.o_pos <- e.o_pos + 1) let w_flush dst v = diff --git a/lib/lzw.mli b/lib/lzw.mli index e2e2d3e..f724dd2 100644 --- a/lib/lzw.mli +++ b/lib/lzw.mli @@ -8,30 +8,60 @@ *) module Io : sig - type src = unit -> bytes * int * int + type bigstring = (char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t + + type _ src = + | Bytes : (unit -> bytes * int * int) -> bytes src + | Bigstring : (unit -> bigstring * int * int) -> bigstring src (** An IO source. This should raise [End_of_file] whenever the last bytes have been read. *) - type dst = (bytes * int * int) option -> unit + type _ dst = + | Bytes : ((bytes * int * int) option -> unit) -> bytes dst + | Bigstring : ((bigstring * int * int) option -> unit) -> bigstring dst (** An IO destination. *) end -type src +type _ src (** A source to compress or decompress. *) -type dst +type _ dst (** A destination to write a compressed or decompressed stream to. *) -val src : ?max_code:int -> Io.src -> src +val src : ?max_code:int -> 'a Io.src -> 'a src (** [src io] creates a new source from an {! Io.src}. Max code size controls - the upper code that can be used. *) + the highest code that can be used. *) -val dst : ?max_code:int -> ?buf:bytes -> Io.dst -> dst +val dst : ?max_code:int -> 'a Io.dst -> ?buf:'a -> unit -> 'a dst (** [dst io] creates a new destination from an {! Io.dst}. Max code size controls - the upper code that can be used. *) + the highest code that can be used. *) -val compress : src -> dst -> unit +val compress : _ src -> _ dst -> unit (** [compress src dst] compresses the bytes coming from [src] into [dst]. *) -val decompress : src -> dst -> unit +val decompress : _ src -> _ dst -> unit (** [decompress src dst] decompresses the bytes coming from [src] into [dst]. *) + +(** {2 Example} + + Here is a small example of compressing data from [stdin] and outputting to [stdout]. + + {[ + let stdin = + let bytes = Bytes.create 4096 in + fun () -> + match In_channel.input stdin bytes 0 4096 with + | 0 -> raise End_of_file + | r -> bytes, 0, r + + let stdout = function + | None -> () + | Some (s, o, l) -> + Out_channel.output_bytes stdout (Bytes.sub s o l) + + let () = + let src = Lzw.src (Bytes stdin) in + let dst = Lzw.dst (Bytes stdout) () in + Lzw.compress src dst + ]} +*) \ No newline at end of file From 4bfc8fae6f00f9550dc6c766a2a372082643c7f5 Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 16:49:15 +0100 Subject: [PATCH 7/8] Add lzw to decompress --- bin/decompress.ml | 31 +++++++++++++++++++++++++++++-- bin/dune | 1 + test/bin/simple.t | 4 ++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/bin/decompress.ml b/bin/decompress.ml index b309cfb..7e1866d 100644 --- a/bin/decompress.ml +++ b/bin/decompress.ml @@ -235,6 +235,28 @@ let run_lzo_inflate ic oc = | Ok str -> output_string oc str ; `Ok 0 | Error err -> `Error (false, str "%a." Lzo.pp_error err) +let run_lzw ic oc mode = + let stdin = + let bytes = Bytes.create 4096 in + fun () -> + match In_channel.input ic bytes 0 4096 with + | 0 -> raise End_of_file + | r -> bytes, 0, r + in + let stdout = function + | None -> () + | Some (s, o, l) -> + Out_channel.output_bytes oc (Bytes.sub s o l) + in + let src = Lzw.src (Bytes stdin) in + let dst = Lzw.dst (Bytes stdout) () in + let () = + match mode with + | `Compress -> Lzw.compress src dst + | `Decompress -> Lzw.decompress src dst + in + `Ok 0 + let run deflate format level filename_ic filename_oc = let ic, close_ic = match filename_ic with @@ -257,7 +279,10 @@ let run deflate format level filename_ic filename_oc = | true, `Gzip -> run_gzip_deflate ~level ic oc | false, `Gzip -> run_gzip_inflate ic oc | true, `Lzo -> run_lzo_deflate ic oc - | false, `Lzo -> run_lzo_inflate ic oc in + | false, `Lzo -> run_lzo_inflate ic oc + | true, `Lzw -> run_lzw ic oc `Compress + | false, `Lzw -> run_lzw ic oc `Decompress + in close_ic () ; close_oc () ; res open Cmdliner @@ -273,12 +298,14 @@ let format = | "gzip" -> Ok `Gzip | "deflate" -> Ok `Deflate | "lzo" -> Ok `Lzo + | "lzw" -> Ok `Lzw | x -> error_msgf "Invalid format: %S" x in let pp ppf = function | `Zlib -> Format.pp_print_string ppf "zlib" | `Gzip -> Format.pp_print_string ppf "gzip" | `Deflate -> Format.pp_print_string ppf "deflate" - | `Lzo -> Format.pp_print_string ppf "lzo" in + | `Lzo -> Format.pp_print_string ppf "lzo" + | `Lzw -> Format.pp_print_string ppf "lzw" in let format = Arg.conv (parser, pp) in Arg.(value & opt format `Deflate & info ["f"; "format"] ~docv:"") diff --git a/bin/dune b/bin/dune index 92043e9..38a7266 100644 --- a/bin/dune +++ b/bin/dune @@ -10,4 +10,5 @@ decompress.zl decompress.gz decompress.lzo + decompress.lzw cmdliner)) diff --git a/test/bin/simple.t b/test/bin/simple.t index c787ff2..479a3a3 100644 --- a/test/bin/simple.t +++ b/test/bin/simple.t @@ -26,3 +26,7 @@ Simple tests $ decompress -fgzip -d --level 0 ../corpus/news news.gz $ decompress -fgzip news.gz news $ diff news ../corpus/news + $ decompress -flzw -d ../corpus/news news.lzw + $ decompress -flzw news.lzw news.lzw.decompressed + $ diff news.lzw.decompressed ../corpus/news + From d33a47001274246d7a703d0541d35857ad9ac9cd Mon Sep 17 00:00:00 2001 From: Patrick Ferris Date: Sat, 16 Sep 2023 18:19:30 +0100 Subject: [PATCH 8/8] Remove In_channel/Out_channel --- bin/decompress.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/decompress.ml b/bin/decompress.ml index 7e1866d..ce3a6d9 100644 --- a/bin/decompress.ml +++ b/bin/decompress.ml @@ -239,14 +239,14 @@ let run_lzw ic oc mode = let stdin = let bytes = Bytes.create 4096 in fun () -> - match In_channel.input ic bytes 0 4096 with + match input ic bytes 0 4096 with | 0 -> raise End_of_file | r -> bytes, 0, r in let stdout = function | None -> () | Some (s, o, l) -> - Out_channel.output_bytes oc (Bytes.sub s o l) + output_bytes oc (Bytes.sub s o l) in let src = Lzw.src (Bytes stdin) in let dst = Lzw.dst (Bytes stdout) () in