KGBCREST/book_cipher.py at master · akingry/KGBCREST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
"""
Book Cipher Steganography with Error Correction

Encodes messages as relative character positions in a shared source text (book).
Both sender and receiver must have the same source text to communicate.

The cipher finds each character's position in the book and records relative
jumps (forward or backward, whichever is shorter). These positions are then
compressed and embedded in the image using DCT watermarking.

Example: "The" with T at position 325, h at 225, e at 228
         Records: [325, -100, +3] (relative positions)
"""

import re
import os
import zlib
import struct
import hashlib
import secrets
import numpy as np
from PIL import Image
from pathlib import Path
from reedsolo import RSCodec, ReedSolomonError
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend


class AESCipher:
    """Password-based AES-256 encryption (CBC mode, PKCS7 padding).

    The encrypted blob is laid out as ``salt || iv || ciphertext``; the key
    is derived from the password and the per-message salt with
    PBKDF2-HMAC-SHA256.
    """

    SALT_SIZE = 16      # bytes of random salt stored at the start of the blob
    IV_SIZE = 16        # bytes of CBC initialisation vector stored after the salt
    KEY_SIZE = 32  # 256 bits
    ITERATIONS = 100000  # PBKDF2 iteration count

    @staticmethod
    def _derive_key(password: str, salt: bytes) -> bytes:
        """Stretch *password* into a KEY_SIZE-byte key with PBKDF2-HMAC-SHA256."""
        from cryptography.hazmat.primitives import hashes
        from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

        return PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=AESCipher.KEY_SIZE,
            salt=salt,
            iterations=AESCipher.ITERATIONS,
            backend=default_backend(),
        ).derive(password.encode('utf-8'))

    @staticmethod
    def encrypt(plaintext: bytes, password: str) -> bytes:
        """
        Encrypt *plaintext* under *password* with AES-256-CBC.

        A fresh random salt and IV are generated on every call.

        Returns:
            salt (16 bytes) + iv (16 bytes) + ciphertext
        """
        salt = secrets.token_bytes(AESCipher.SALT_SIZE)
        iv = secrets.token_bytes(AESCipher.IV_SIZE)

        # CBC operates on whole 128-bit blocks, so pad first.
        pkcs7 = padding.PKCS7(128).padder()
        padded = pkcs7.update(plaintext) + pkcs7.finalize()

        aes = Cipher(
            algorithms.AES(AESCipher._derive_key(password, salt)),
            modes.CBC(iv),
            backend=default_backend(),
        ).encryptor()

        return salt + iv + aes.update(padded) + aes.finalize()

    @staticmethod
    def decrypt(data: bytes, password: str) -> bytes:
        """
        Reverse :meth:`encrypt`.

        Args:
            data: salt (16 bytes) + iv (16 bytes) + ciphertext
            password: The password used for encryption.

        Returns:
            The original plaintext bytes.

        Raises:
            ValueError: If *data* is too short to hold the salt, the IV and
                at least one cipher block.
        """
        header_size = AESCipher.SALT_SIZE + AESCipher.IV_SIZE
        if len(data) < header_size + 16:
            raise ValueError("Invalid encrypted data (too short)")

        salt, iv = data[:AESCipher.SALT_SIZE], data[AESCipher.SALT_SIZE:header_size]
        body = data[header_size:]

        aes = Cipher(
            algorithms.AES(AESCipher._derive_key(password, salt)),
            modes.CBC(iv),
            backend=default_backend(),
        ).decryptor()
        padded = aes.update(body) + aes.finalize()

        # Strip the PKCS7 padding added by encrypt().
        pkcs7 = padding.PKCS7(128).unpadder()
        return pkcs7.update(padded) + pkcs7.finalize()

# Default source text location: "source_text.txt" in the same directory as
# this module. BookCipher falls back to this path when no source_path is given.
DEFAULT_SOURCE_FILE = Path(__file__).parent / "source_text.txt"
# Where the default book is downloaded from when DEFAULT_SOURCE_FILE is missing.
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/766/pg766.txt"  # David Copperfield


class BookCipher:
    """
    Book cipher using character positions in a shared source text.

    Both sender and receiver must have the same source text (book) to
    encode and decode messages. Each message character is replaced by the
    signed jump from the previous position to the nearest occurrence of
    that character in the book; the jumps are zigzag/varint packed and then
    zlib-compressed.
    """

    # The serialized form stores the number of jumps in an unsigned 16-bit
    # header, so a single message can hold at most this many characters.
    _MAX_POSITIONS = 0xFFFF

    def __init__(self, source_path=None):
        """
        Initialize with a source text file.

        Args:
            source_path: Path to the source text file. If None, uses default
                        (downloads David Copperfield from Gutenberg if needed).

        Raises:
            FileNotFoundError: If an explicit source_path does not exist.
        """
        self.source_path = Path(source_path) if source_path else DEFAULT_SOURCE_FILE
        self.text = ""
        self.char_positions = {}  # char -> ascending list of positions
        self._load_source()

    def _load_source(self):
        """Load the source text and build the character -> positions index.

        Only the default source file is downloaded automatically; a missing
        user-supplied file raises FileNotFoundError.
        """
        if not self.source_path.exists():
            if self.source_path != DEFAULT_SOURCE_FILE:
                raise FileNotFoundError(f"Source text not found: {self.source_path}")

            print("Downloading default source text (David Copperfield)...")
            import urllib.request
            # Close the HTTP response deterministically instead of leaking it.
            with urllib.request.urlopen(GUTENBERG_URL) as response:
                data = response.read().decode('utf-8')
            # NOTE(review): write_text/read_text use text mode, so newline
            # handling may differ between platforms - confirm both parties end
            # up with an identical self.text.
            self.source_path.write_text(data, encoding='utf-8')

        self.text = self.source_path.read_text(encoding='utf-8')

        # Build character index: char -> list of positions. Positions are
        # appended in increasing order, so every list is sorted; the
        # nearest-occurrence search in encode_message relies on that.
        index = {}
        for i, char in enumerate(self.text):
            index.setdefault(char, []).append(i)
        self.char_positions = index

        unique_chars = len(self.char_positions)
        print(f"Loaded source text: {len(self.text):,} characters, {unique_chars} unique chars")

    def encode_message(self, message):
        """
        Encode message as sequence of relative character positions.

        For each character in the message, finds the nearest occurrence
        in the source text (forward or backward) and records the relative
        jump distance. When a backward and a forward occurrence are equally
        close, the backward (lower-position) one is used.

        Args:
            message: The secret message to encode

        Returns:
            Compressed bytes containing the position data

        Raises:
            ValueError: If the message is empty, contains a character that
                does not occur in the source text, or is longer than
                65,535 characters.
        """
        from bisect import bisect_left

        if not message:
            raise ValueError("Empty message")

        positions = []
        current_pos = 0

        for i, char in enumerate(message):
            if char not in self.char_positions:
                raise ValueError(
                    f"Character '{char}' (position {i}) not found in source text. "
                    f"The source text may not contain this character."
                )

            # Sorted occurrence list -> binary search for the two neighbours
            # of current_pos instead of scanning every occurrence.
            occurrences = self.char_positions[char]
            idx = bisect_left(occurrences, current_pos)
            if idx == 0:
                best_pos = occurrences[0]
            elif idx == len(occurrences):
                best_pos = occurrences[-1]
            else:
                before, after = occurrences[idx - 1], occurrences[idx]
                # "<=" keeps the earlier occurrence on a tie.
                best_pos = before if current_pos - before <= after - current_pos else after

            # Relative jump (can be negative)
            positions.append(best_pos - current_pos)
            current_pos = best_pos

        # Encode positions as variable-length signed integers
        encoded = self._encode_positions(positions)

        # Compress
        compressed = zlib.compress(encoded, level=9)

        print(f"Encoded {len(message)} chars: {len(encoded)} bytes -> {len(compressed)} bytes compressed")
        return compressed

    def decode_message(self, compressed_data):
        """
        Decode message from compressed position data.

        Args:
            compressed_data: Compressed bytes from encode_message

        Returns:
            The decoded message string. A jump that lands outside the source
            text is rendered as the placeholder "[?<position>]" rather than
            aborting the decode.

        Raises:
            ValueError: If the data cannot be decompressed or the position
                stream is truncated.
        """
        try:
            encoded = zlib.decompress(compressed_data)
        except zlib.error as e:
            raise ValueError(f"Decompression failed: {e}") from e

        # Reconstruct message by following the jumps from position 0
        chars = []
        current_pos = 0
        text_length = len(self.text)

        for jump in self._decode_positions(encoded):
            current_pos += jump
            if 0 <= current_pos < text_length:
                chars.append(self.text[current_pos])
            else:
                chars.append(f"[?{current_pos}]")

        return ''.join(chars)

    def _encode_positions(self, positions):
        """
        Encode positions as variable-length signed integers.

        Layout: a 2-byte big-endian count, followed by one varint per
        position. Signed values are first mapped to unsigned with zigzag
        encoding so that small negative jumps stay short.

        Raises:
            ValueError: If there are more positions than the 2-byte count
                header can represent.
        """
        if len(positions) > self._MAX_POSITIONS:
            raise ValueError(
                f"Message too long: {len(positions)} characters "
                f"(maximum {self._MAX_POSITIONS})"
            )

        result = bytearray(struct.pack('>H', len(positions)))

        for pos in positions:
            # Zigzag encoding: map signed to unsigned
            # 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
            zigzag = pos * 2 if pos >= 0 else (-pos) * 2 - 1

            # Varint encoding: 7 payload bits per byte, high bit = "more follows"
            while zigzag >= 0x80:
                result.append((zigzag & 0x7F) | 0x80)
                zigzag >>= 7
            result.append(zigzag)

        return bytes(result)

    def _decode_positions(self, data):
        """
        Decode the byte stream produced by _encode_positions.

        Raises:
            ValueError: If the count header is missing or the stream ends
                before all announced positions were read.
        """
        if len(data) < 2:
            raise ValueError("Unexpected end of data")

        # Read count
        count = struct.unpack_from('>H', data, 0)[0]
        idx = 2
        end = len(data)
        positions = []

        for _ in range(count):
            # Varint decoding
            zigzag = 0
            shift = 0

            while True:
                if idx >= end:
                    raise ValueError("Unexpected end of data")
                byte = data[idx]
                idx += 1
                zigzag |= (byte & 0x7F) << shift
                if byte < 0x80:
                    break
                shift += 7

            # Zigzag decoding: unsigned to signed
            if zigzag & 1:
                positions.append(-((zigzag + 1) >> 1))
            else:
                positions.append(zigzag >> 1)

        return positions


# Alias for backward compatibility: MessageEncoder is the very same class
# object as BookCipher, kept so code using the old name keeps working.
MessageEncoder = BookCipher


class ErrorCorrection:
    """
    Reed-Solomon error correction + repetition coding for robustness.

    Encoding first appends Reed-Solomon parity bytes, then repeats every
    bit ``repetition`` times; decoding takes a majority vote per group of
    repeated bits and then runs Reed-Solomon decoding.
    """

    def __init__(self, rs_symbols=32, repetition=3):
        """
        Args:
            rs_symbols: Number of Reed-Solomon parity symbols (more = more robust)
            repetition: How many times to repeat each bit (odd number for majority vote)

        Raises:
            ValueError: If repetition is smaller than 1 (encode would
                otherwise silently produce no bits at all).
        """
        if repetition < 1:
            raise ValueError(f"repetition must be at least 1, got {repetition}")
        self.rs = RSCodec(rs_symbols)
        self.rs_symbols = rs_symbols
        self.repetition = repetition

    def encode(self, data):
        """
        Apply Reed-Solomon encoding, then repetition coding.

        Args:
            data: Bytes to protect.

        Returns:
            List of 0/1 ints, each original bit repeated ``repetition`` times.
        """
        # 1. Reed-Solomon encoding (adds parity bytes)
        rs_encoded = bytes(self.rs.encode(data))
        print(f"  RS encoded: {len(data)} -> {len(rs_encoded)} bytes (+{self.rs_symbols} parity)")

        # 2. Repetition coding (repeat each bit)
        bits = self._bytes_to_bits(rs_encoded)
        repeated_bits = [bit for bit in bits for _ in range(self.repetition)]

        print(f"  Repetition: {len(bits)} -> {len(repeated_bits)} bits (x{self.repetition})")

        return repeated_bits

    def decode(self, repeated_bits):
        """
        Majority vote on repetition coding, then Reed-Solomon decoding.

        Args:
            repeated_bits: Sequence of 0/1 ints as produced by encode
                (possibly with bit errors).

        Returns:
            The recovered data bytes.

        Raises:
            ValueError: If Reed-Solomon decoding cannot correct the errors.
        """
        # 1. Majority vote to recover original bits (a tie resolves to 0)
        bits = []
        for start in range(0, len(repeated_bits), self.repetition):
            chunk = repeated_bits[start:start + self.repetition]
            ones = sum(chunk)
            bits.append(1 if ones > len(chunk) - ones else 0)

        # 2. Convert bits to bytes
        rs_encoded = self._bits_to_bytes(bits)

        # 3. Reed-Solomon decoding (corrects errors)
        try:
            return bytes(self.rs.decode(rs_encoded)[0])
        except ReedSolomonError as e:
            raise ValueError(f"Reed-Solomon decoding failed: {e}") from e

    def _bytes_to_bits(self, data):
        """Convert bytes to a list of bits, most significant bit first."""
        return [(byte >> shift) & 1 for byte in data for shift in range(7, -1, -1)]

    def _bits_to_bytes(self, bits):
        """Convert a bit list (MSB first) to bytes.

        The input is zero-padded to a multiple of 8 bits on a private copy,
        so the caller's list is left untouched.
        """
        padded = list(bits)
        padded.extend([0] * (-len(padded) % 8))

        result = bytearray()
        for start in range(0, len(padded), 8):
            byte = 0
            for bit in padded[start:start + 8]:
                byte = (byte << 1) | bit
            result.append(byte)
        return bytes(result)


class RobustWatermark:
    """
    DCT-domain watermarking that survives JPEG compression.

    One bit is stored per 8x8 luminance block by quantizing a single
    mid-frequency DCT coefficient (row 4, column 3). The payload is
    preceded by a 24-bit big-endian length header.
    """

    # Number of header bits that carry the payload length.
    _HEADER_BITS = 24

    def __init__(self, strength=50):
        """
        Args:
            strength: Quantization step used for the embedded coefficient;
                extract must use the same value as embed.
        """
        self.strength = strength
        self.block_size = 8

    def embed(self, image_path, bits, output_path):
        """Embed bits in image using DCT watermarking.

        Args:
            image_path: Path of the cover image.
            bits: Sequence of 0/1 ints to embed.
            output_path: Destination path (str or path-like). The extension
                selects the format: lossless WEBP, JPEG (quality 95), or PNG
                for anything else.

        Returns:
            The watermarked PIL image (also written to output_path).

        Raises:
            ValueError: If header + payload need more blocks than the image has.
        """
        # Close the source file as soon as the pixel data has been converted.
        with Image.open(image_path) as source:
            img = source.convert('RGB')
        pixels = np.array(img, dtype=np.float64)

        # Work with luminance (Y channel)
        y_channel = 0.299 * pixels[:,:,0] + 0.587 * pixels[:,:,1] + 0.114 * pixels[:,:,2]

        height, width = y_channel.shape

        # Add 24-bit length header (supports up to 16M bits)
        payload = list(bits)
        total_bits = len(payload)
        header = self._HEADER_BITS
        length_bits = [(total_bits >> (header - 1 - i)) & 1 for i in range(header)]
        all_bits = length_bits + payload

        # Calculate blocks
        size = self.block_size
        blocks_h = height // size
        blocks_w = width // size
        total_blocks = blocks_h * blocks_w

        if len(all_bits) > total_blocks:
            raise ValueError(f"Data too large: {len(all_bits)} bits, only {total_blocks} blocks available")

        print(f"  Embedding {len(all_bits)} bits in {total_blocks} available blocks")

        # Embed one bit per block, walking the blocks in row-major order
        q = self.strength
        for bit_idx, bit in enumerate(all_bits):
            by, bx = divmod(bit_idx, blocks_w)
            y1 = by * size
            x1 = bx * size
            block = y_channel[y1:y1+size, x1:x1+size].copy()

            # Apply DCT
            dct_block = self._dct2(block)

            # Embed bit using quantization index modulation: snap the
            # coefficient to the q-grid, then push it above (1) or below (0).
            quantized = round(dct_block[4, 3] / q) * q
            if bit == 1:
                dct_block[4, 3] = quantized + q * 0.3
            else:
                dct_block[4, 3] = quantized - q * 0.3

            # Apply inverse DCT and add the luminance change to all channels
            adjustment = self._idct2(dct_block) - block
            for c in range(3):
                pixels[y1:y1+size, x1:x1+size, c] += adjustment

        # Clip and save
        pixels = np.clip(pixels, 0, 255).astype(np.uint8)
        result = Image.fromarray(pixels)

        # Determine format from extension; str() also accepts pathlib.Path
        ext = str(output_path).lower().split('.')[-1]
        if ext == 'webp':
            result.save(output_path, 'WEBP', lossless=True)
        elif ext in ('jpg', 'jpeg'):
            result.save(output_path, 'JPEG', quality=95)
        else:
            result.save(output_path, 'PNG')

        return result

    def extract(self, image_path):
        """Extract bits from watermarked image.

        Args:
            image_path: Path of the (possibly recompressed) watermarked image.

        Returns:
            List of payload bits (0/1 ints) without the length header.

        Raises:
            ValueError: If the image is too small to hold a header or the
                decoded length header is not plausible.
        """
        with Image.open(image_path) as source:
            img = source.convert('RGB')
        pixels = np.array(img, dtype=np.float64)

        y_channel = 0.299 * pixels[:,:,0] + 0.587 * pixels[:,:,1] + 0.114 * pixels[:,:,2]

        height, width = y_channel.shape
        size = self.block_size
        blocks_h = height // size
        blocks_w = width // size
        q = self.strength

        # Extract all bits we might need
        bits = []
        for by in range(blocks_h):
            for bx in range(blocks_w):
                y1 = by * size
                x1 = bx * size
                block = y_channel[y1:y1+size, x1:x1+size]

                coef = self._dct2(block)[4, 3]

                # A coefficient at or above its nearest grid point reads as 1
                quantized = round(coef / q) * q
                bits.append(1 if coef >= quantized else 0)

        return self._read_payload(bits)

    def _read_payload(self, bits):
        """Validate the 24-bit length header in *bits* and return the payload bits."""
        header = self._HEADER_BITS
        if len(bits) < header:
            # Too few blocks to even hold the header (very small image).
            raise ValueError(
                f"No hidden message found in this image.\n\n"
                f"(Technical: only {len(bits)} blocks available, "
                f"{header} needed for the length header)"
            )

        # Extract length (24-bit header)
        length = 0
        for bit in bits[:header]:
            length = (length << 1) | bit

        if length <= 0 or length > len(bits) - header:
            raise ValueError(
                f"No hidden message found in this image.\n\n"
                f"This could mean:\n"
                f"• The image doesn't contain an encoded message\n"
                f"• JPEG quality was too low (below 60)\n"
                f"• The image was cropped or resized\n\n"
                f"(Technical: invalid length header {length})"
            )

        # Return data bits
        return bits[header:header + length]

    def _dct2(self, block):
        """2D DCT (orthonormal), applied along each axis in turn."""
        from scipy.fftpack import dct
        return dct(dct(block.T, norm='ortho').T, norm='ortho')

    def _idct2(self, block):
        """2D inverse DCT (orthonormal), the inverse of _dct2."""
        from scipy.fftpack import idct
        return idct(idct(block.T, norm='ortho').T, norm='ortho')


def encode_image(image_path, message, output_path, strength=50, rs_symbols=32, repetition=3, source_path=None, password=None):
    """
    Hide *message* in an image: book cipher -> optional AES -> error correction -> DCT watermark.

    The data handed to error correction is prefixed with one marker byte:
    0x01 when the payload is AES-encrypted, 0x00 when it is not.

    Args:
        image_path: Path to source image
        message: Secret message to hide
        output_path: Path for output image
        strength: DCT embedding strength (default 50, use 150 for JPEG q60 survival)
        rs_symbols: Reed-Solomon parity symbols (default 32, use 64 for max robustness)
        repetition: Bit repetition count (default 3, use 7 for max robustness)
        source_path: Path to source text file (book). If None, uses default.
        password: Optional password for AES-256 encryption. If provided, message is encrypted.

    Returns:
        True once the output image has been written.
    """
    ellipsis = '...' if len(message) > 50 else ''
    print(f"Encoding: '{message[:50]}{ellipsis}'")

    # Step 1 - book cipher: message -> compressed relative positions
    compressed = BookCipher(source_path).encode_message(message)

    # Step 2 - marker byte plus (optionally encrypted) payload
    if not password:
        data_for_ec = b'\x00' + compressed
    else:
        print("Encrypting with AES-256...")
        data_for_ec = b'\x01' + AESCipher.encrypt(compressed, password)

    # Step 3 - Reed-Solomon parity and bit repetition
    print("Adding error correction...")
    corrector = ErrorCorrection(rs_symbols=rs_symbols, repetition=repetition)
    protected_bits = corrector.encode(data_for_ec)

    # Step 4 - write the bits into the image
    print("Embedding in image...")
    RobustWatermark(strength=strength).embed(image_path, protected_bits, output_path)

    encrypted_label = 'Yes (AES-256)' if password else 'No'
    summary = [
        f"\n✓ Saved to: {output_path}",
        f"  Message: {len(message)} chars",
        f"  Encrypted: {encrypted_label}",
        f"  Compressed: {len(compressed)} bytes",
        f"  Final data: {len(data_for_ec)} bytes",
        f"  With EC: {len(protected_bits)} bits",
    ]
    print("\n".join(summary))

    return True


def decode_image(image_path, strength=50, rs_symbols=32, repetition=3, source_path=None, password=None):
    """
    Decode a message from a watermarked image.

    The error-corrected data starts with a marker byte: 0x01 means the rest
    is AES-encrypted, 0x00 means it is the plain compressed payload. Any
    other first byte is treated as the legacy format without a marker.

    Args:
        image_path: Path to encoded image
        strength: DCT embedding strength (must match encoding)
        rs_symbols: Reed-Solomon parity symbols (must match encoding)
        repetition: Bit repetition count (must match encoding)
        source_path: Path to source text file (book). Must be same as encoding.
        password: Optional password for AES-256 decryption. Required if message was encrypted.

    Returns:
        The decoded message string.

    Raises:
        ValueError: If no data survives error correction, the message is
            encrypted but no password was given, or decryption fails.
    """
    print(f"Decoding: {image_path}")

    # 1. Extract watermark bits
    watermark = RobustWatermark(strength=strength)
    protected_bits = watermark.extract(image_path)
    print(f"  Extracted {len(protected_bits)} bits")

    # 2. Remove error correction
    ec = ErrorCorrection(rs_symbols=rs_symbols, repetition=repetition)
    data_from_ec = ec.decode(protected_bits)
    print(f"  After EC: {len(data_from_ec)} bytes")

    # 3. Check encryption marker and decrypt if needed
    if not data_from_ec:
        raise ValueError("Invalid message data")

    marker = data_from_ec[0]
    payload = data_from_ec[1:]

    if marker == 0x01:
        # Encrypted message
        if not password:
            raise ValueError("This message is encrypted. Password required.")
        print("Decrypting AES-256...")
        try:
            compressed = AESCipher.decrypt(payload, password)
        except Exception as e:
            # Keep one uniform user-facing message, but chain the original
            # error so the real cause stays visible in tracebacks.
            raise ValueError("Decryption failed. Wrong password or corrupted data.") from e
    elif marker == 0x00:
        # Plaintext message
        compressed = payload
    else:
        # Legacy format (no marker) - treat as plaintext compressed data
        compressed = data_from_ec

    # 4. Book cipher: decode relative positions to message
    cipher = BookCipher(source_path)
    return cipher.decode_message(compressed)


if __name__ == '__main__':
    # Command-line entry point: encode / decode / JPEG survival test.
    import sys

    args = sys.argv

    # No sub-command given: show the full help text and exit with an error.
    if len(args) < 2:
        print("\n".join([
            "Book Cipher Steganography with Error Correction",
            "=" * 50,
            "",
            "Usage:",
            "  Encode: python book_cipher.py encode <image> <message> [output]",
            "  Decode: python book_cipher.py decode <image>",
            "  Test:   python book_cipher.py test <image> <message>",
            "",
            "Example:",
            "  python book_cipher.py encode photo.png 'Meet me at noon' secret.png",
            "  python book_cipher.py decode secret.png",
        ]))
        sys.exit(1)

    mode = args[1]

    if mode == 'encode':
        # encode <image> <message> [output]; output defaults to encoded.png
        if len(args) < 4:
            print("Usage: python book_cipher.py encode <image> <message> [output]")
            sys.exit(1)

        target = 'encoded.png' if len(args) <= 4 else args[4]
        encode_image(args[2], args[3], target)

    elif mode == 'decode':
        # decode <image>
        if len(args) < 3:
            print("Usage: python book_cipher.py decode <image>")
            sys.exit(1)

        recovered = decode_image(args[2])
        print(f"\n✓ Decoded message: {recovered}")

    elif mode == 'test':
        # test <image> <message>: encode once, then re-save as JPEG at
        # decreasing quality and report whether the message still decodes.
        if len(args) < 4:
            print("Usage: python book_cipher.py test <image> <message>")
            sys.exit(1)

        cover, secret = args[2], args[3]

        rule = "=" * 50
        print(rule)
        print("JPEG SURVIVAL TEST")
        print(rule)

        encode_image(cover, secret, 'test_encoded.png')

        encoded_img = Image.open('test_encoded.png')

        for level in (90, 85, 80, 75, 70, 65, 60):
            candidate = f'test_q{level}.jpg'
            encoded_img.save(candidate, 'JPEG', quality=level)

            try:
                result = decode_image(candidate)
            except Exception as err:
                outcome = f"✗ {str(err)[:30]}"
            else:
                # The comparison ignores case, as in the original check.
                if result.lower() == secret.lower():
                    outcome = "✓"
                else:
                    outcome = f"≈ ({result})"

            print(f"  Quality {level}: {outcome}")

    else:
        print(f"Unknown mode: {mode}")
        sys.exit(1)