Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,14 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.apache.yoko.orb.codecs;
package org.apache.yoko.codecs;

import org.apache.yoko.io.ReadBuffer;
import org.apache.yoko.io.WriteBuffer;
import org.apache.yoko.orb.OB.CodeSetInfo;
import org.omg.CORBA.DATA_CONVERSION;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

import static org.apache.yoko.orb.codecs.LatinCodec.getLatinCodec;
import static org.apache.yoko.orb.codecs.Util.getUnicodeCodec;
import static org.apache.yoko.codecs.LatinCodec.getLatinCodec;
import static org.apache.yoko.util.MinorCodes.MinorUTF8Encoding;
import static org.apache.yoko.util.MinorCodes.MinorUTF8Overflow;
import static org.omg.CORBA.CompletionStatus.COMPLETED_MAYBE;
Expand Down Expand Up @@ -80,40 +75,31 @@
* </p>
*/
public interface CharCodec {
@FunctionalInterface interface CharReader { char readChar(ReadBuffer in); }

String name();

CodeSetInfo getCodeSetInfo();

/**
* Get a char codec instance for the named Java charset.
*
* @param name the name of the Java charset for which a codec is required
* @return an instance of the appropriate char codec
* @throws IllegalCharsetNameException if the provided name is not a valid charset name
* @throws IllegalArgumentException if the provided name is null
* @throws UnsupportedCharsetException if the named charset is not supported
* Returns true iff the encoding always uses the same number of octets per char
*/
static CharCodec forName(String name) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
// fastest result: directly named unicode codec
CharCodec result = getUnicodeCodec(name);
if (null != result) return result;
// next see if it is an alias for a unicode codec
Charset charset = Charset.forName(name);
result = getUnicodeCodec(charset.name());
if (null != result) return result;
// the only other codecs currently supported are the Latin ones
return getLatinCodec(charset);
}
default boolean isFixedWidth() { return true; }

static CharCodec forRegistryId(int id) throws UnsupportedCharsetException {
CodeSetInfo csi = CodeSetInfo.forRegistryId(id);
switch (csi) {
case UTF_16: return SimpleWcharCodec.UTF_16;
case UTF_8: return new Utf8Codec();

}
throw new UnsupportedCharsetException("Charset registry id = " + id);
}
/**
* Returns the number of octets per char iff {@link #isFixedWidth()} returns <code>true</code>
* @throws UnsupportedOperationException for non-fixed-width encodings
*/
default int charSize() { return 1; }
/**
* Read the next char.
* @throws IndexOutOfBoundsException if the buffer does not contain enough bytes to read a single char
*/
char readChar(ReadBuffer in);

String name();
/**
* Gives the number of octets needed to encode the specified char.
*/
default int octetCount(char c) { return 1; }

/**
* Encodes a character to a buffer.
Expand All @@ -133,10 +119,6 @@ static CharCodec forRegistryId(int id) throws UnsupportedCharsetException {
*/
void writeChar(char c, WriteBuffer out);

/** Read the next char */
char readChar(ReadBuffer in);


/**
* Check there is no unfinished character data.
* This is only relevant for encodings that encode
Expand All @@ -155,6 +137,8 @@ default void assertNoBufferedCharData() throws DATA_CONVERSION {
/** Check whether the last character was not a high surrogate. */
default boolean writeFinished() { return true; }

/** Provides an identical object that can be used concurrently with this one */
default CharCodec getInstanceOrCopy() { return this; }
/**
* Provides an identical object that can be used concurrently with this one
*/
default CharCodec duplicate() { return this; }
}
89 changes: 89 additions & 0 deletions yoko-core/src/main/java/org/apache/yoko/codecs/Codex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright 2026 IBM Corporation and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/
package org.apache.yoko.codecs;

import org.apache.yoko.orb.OB.CodeSetInfo;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Optional;

import static org.apache.yoko.codecs.LatinCodec.getLatinCodec;
import static org.apache.yoko.codecs.SimpleWcharCodec.UTF_16;
import static org.apache.yoko.codecs.Util.getUnicodeCharCodec;

/**
 * Static factory methods for obtaining the char and wchar codecs of this package.
 * <p>
 * This is an enum with no constants, so it cannot be instantiated;
 * it serves purely as a holder for the static methods below.
 * </p>
 */
public enum Codex {
    ;

    /** Returns {@code SimpleWcharCodec.COLLOCATED}, typed as a {@link CharCodec}. */
    public static CharCodec getCollocatedCharCodec() { return SimpleWcharCodec.COLLOCATED; }

    /** Returns the default char codec, {@code SimpleCharCodec.ISO_LATIN_1}. */
    public static CharCodec getDefaultCharCodec() { return SimpleCharCodec.ISO_LATIN_1; }

    /**
     * Get a char codec instance for the named Java charset.
     *
     * @param charsetName the name of the Java charset for which a codec is required
     * @return an instance of the appropriate char codec
     * @throws IllegalCharsetNameException if the provided name is not a valid charset name
     * @throws IllegalArgumentException if the provided name is null
     * @throws UnsupportedCharsetException if the named charset is not supported
     */
    public static CharCodec getCharCodec(String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
        // fastest result: directly named unicode codec
        CharCodec result = getUnicodeCharCodec(charsetName);
        if (null != result) return result;
        // next see if it is an alias for a unicode codec
        Charset charset = Charset.forName(charsetName);
        result = getUnicodeCharCodec(charset.name());
        if (null != result) return result;
        // the only other codecs currently supported are the Latin ones
        return getLatinCodec(charset);
    }

    /**
     * Get a char codec instance for a code set registry id.
     *
     * @param id the registry id, as understood by {@link CodeSetInfo#forRegistryId(int)}
     * @return an instance of the appropriate char codec; for UTF-8 a new {@code Utf8Codec} is created on every call
     * @throws UnsupportedCharsetException if the id is not a known registry id,
     *                                     or if there is no char codec for the identified code set
     */
    public static CharCodec getCharCodec(int id) throws UnsupportedCharsetException {
        CodeSetInfo csi = CodeSetInfo.forRegistryId(id);
        if (null == csi) throw new UnsupportedCharsetException(String.format("Unknown registry id: 0x%08x", id));
        switch (csi) {
            case UTF_8: return new Utf8Codec();
            case ISO_LATIN_1: return SimpleCharCodec.ISO_LATIN_1;
            default: return getLatinCodec(csi); // throws if unknown
        }
    }

    /** Returns {@code SimpleWcharCodec.COLLOCATED}. */
    public static WcharCodec getCollocatedWcharCodec() { return SimpleWcharCodec.COLLOCATED; }

    /** Returns the default wchar codec, {@code SimpleWcharCodec.UTF_16}. */
    public static WcharCodec getDefaultWcharCodec() { return UTF_16; }

    /** Returns {@code SimpleWcharCodec.UNSPECIFIED}. */
    public static WcharCodec getUnspecifiedWcharCodec() { return SimpleWcharCodec.UNSPECIFIED; }

    /**
     * Get a wchar codec instance for the named Java charset.
     * Only UTF-16 (by canonical name or by alias) is supported.
     *
     * @param charsetName the name of the Java charset for which a wchar codec is required
     * @return the UTF-16 wchar codec
     * @throws NullPointerException if the provided name is null
     * @throws IllegalCharsetNameException if the provided name is not a valid charset name
     * @throws UnsupportedCharsetException if the named charset is unknown to Java or is not UTF-16
     */
    public static WcharCodec getWcharCodec(String charsetName) {
        if (charsetName == null) throw new NullPointerException("charsetName");
        // fastest result: directly named
        if ("UTF-16".equalsIgnoreCase(charsetName)) return UTF_16;
        // next see if it is an alias: resolve to the canonical charset name
        if ("UTF-16".equalsIgnoreCase(Charset.forName(charsetName).name())) return UTF_16;
        throw new UnsupportedCharsetException(charsetName + " not supported for wchar");
    }

    /**
     * Get a wchar codec instance for a code set registry id.
     * Only the registry id of {@link CodeSetInfo#UTF_16} is supported.
     *
     * @param twcsId the registry id of the code set for which a wchar codec is required
     * @return the UTF-16 wchar codec
     * @throws UnsupportedCharsetException if the id is anything other than the UTF-16 registry id;
     *                                     the message names the code set when the id is a known one
     */
    public static WcharCodec getWcharCodec(int twcsId) {
        if (CodeSetInfo.UTF_16.id == twcsId) return UTF_16;
        // Build the fallback message lazily: it is only needed when the id is unknown.
        String message = Optional.ofNullable(CodeSetInfo.forRegistryId(twcsId))
                .map(info -> String.format("Charset %s unsupported for wchar", info.name()))
                .orElseGet(() -> String.format("Unknown registry id 0x%08x unsupported for wchar", twcsId));
        throw new UnsupportedCharsetException(message);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*
* SPDX-License-Identifier: Apache-2.0
*/
package org.apache.yoko.orb.codecs;
package org.apache.yoko.codecs;

import org.apache.yoko.io.ReadBuffer;
import org.apache.yoko.io.WriteBuffer;
Expand All @@ -32,8 +32,16 @@
import static java.nio.ByteBuffer.allocate;
import static java.util.Collections.unmodifiableMap;
import static java.util.stream.IntStream.range;
import static org.apache.yoko.orb.codecs.Util.ASCII_REPLACEMENT_BYTE;
import static org.apache.yoko.orb.codecs.Util.UNICODE_REPLACEMENT_CHAR;
import static org.apache.yoko.codecs.Util.ASCII_REPLACEMENT_BYTE;
import static org.apache.yoko.codecs.Util.UNICODE_REPLACEMENT_CHAR;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_8859_5;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_8859_6;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_8859_7;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_8859_8;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_8859_9;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_LATIN_2;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_LATIN_3;
import static org.apache.yoko.orb.OB.CodeSetInfo.ISO_LATIN_4;
import static org.apache.yoko.util.Collectors.neverCombine;

/**
Expand All @@ -43,28 +51,28 @@ class LatinCodec implements CharCodec {
static LatinCodec getLatinCodec(Charset charset) {
if (!charset.canEncode()) throw new UnsupportedCharsetException(charset.name());
switch (charset.name()) {
case ISO_8859_2.NAME: return ISO_8859_2.INSTANCE;
case ISO_8859_3.NAME: return ISO_8859_3.INSTANCE;
case ISO_8859_4.NAME: return ISO_8859_4.INSTANCE;
case ISO_8859_5.NAME: return ISO_8859_5.INSTANCE;
case ISO_8859_6.NAME: return ISO_8859_6.INSTANCE;
case ISO_8859_7.NAME: return ISO_8859_7.INSTANCE;
case ISO_8859_8.NAME: return ISO_8859_8.INSTANCE;
case ISO_8859_9.NAME: return ISO_8859_9.INSTANCE;
case "ISO-8859-2": return Iso8859_2.INSTANCE;
case "ISO-8859-3": return Iso8859_3.INSTANCE;
case "ISO-8859-4": return Iso8859_4.INSTANCE;
case "ISO-8859-5": return Iso8859_5.INSTANCE;
case "ISO-8859-6": return Iso8859_6.INSTANCE;
case "ISO-8859-7": return Iso8859_7.INSTANCE;
case "ISO-8859-8": return Iso8859_8.INSTANCE;
case "ISO-8859-9": return Iso8859_9.INSTANCE;
default: throw new UnsupportedCharsetException(charset.name());
}
}

static LatinCodec getLatinCodec(CodeSetInfo csi) {
switch (csi) {
case ISO_LATIN_2: return ISO_8859_2.INSTANCE;
case ISO_LATIN_3: return ISO_8859_3.INSTANCE;
case ISO_LATIN_4: return ISO_8859_4.INSTANCE;
case ISO_8859_5: return ISO_8859_5.INSTANCE;
case ISO_8859_6: return ISO_8859_6.INSTANCE;
case ISO_8859_7: return ISO_8859_7.INSTANCE;
case ISO_8859_8: return ISO_8859_8.INSTANCE;
case ISO_8859_9: return ISO_8859_9.INSTANCE;
case ISO_LATIN_2: return Iso8859_2.INSTANCE;
case ISO_LATIN_3: return Iso8859_3.INSTANCE;
case ISO_LATIN_4: return Iso8859_4.INSTANCE;
case ISO_8859_5: return Iso8859_5.INSTANCE;
case ISO_8859_6: return Iso8859_6.INSTANCE;
case ISO_8859_7: return Iso8859_7.INSTANCE;
case ISO_8859_8: return Iso8859_8.INSTANCE;
case ISO_8859_9: return Iso8859_9.INSTANCE;
}
throw new UnsupportedCharsetException(csi.name());
}
Expand All @@ -73,22 +81,24 @@ static LatinCodec getLatinCodec(CodeSetInfo csi) {
// (e.g. if only Latin-2 is used, the others are never created.)
// N.B. NAME is a compile-time constant and gets inlined so using it does not drive class initialization
// whereas dereferencing INSTANCE forces initialization. (See JLS 12.4)
private interface ISO_8859_2 { String NAME = "ISO-8859-2"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_3 { String NAME = "ISO-8859-3"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_4 { String NAME = "ISO-8859-4"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_5 { String NAME = "ISO-8859-5"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_6 { String NAME = "ISO-8859-6"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_7 { String NAME = "ISO-8859-7"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_8 { String NAME = "ISO-8859-8"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface ISO_8859_9 { String NAME = "ISO-8859-9"; LatinCodec INSTANCE = new LatinCodec(NAME); }
private interface Iso8859_2 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-2", ISO_LATIN_2); }
private interface Iso8859_3 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-3", ISO_LATIN_3); }
private interface Iso8859_4 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-4", ISO_LATIN_4); }
private interface Iso8859_5 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-5", ISO_8859_5); }
private interface Iso8859_6 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-6", ISO_8859_6); }
private interface Iso8859_7 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-7", ISO_8859_7); }
private interface Iso8859_8 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-8", ISO_8859_8); }
private interface Iso8859_9 { LatinCodec INSTANCE = new LatinCodec("ISO-8859-9", ISO_8859_9); }

final String name;
final char[] decoderArray;
final Map<Character, Byte> encoderMap;
private final String name;
private final CodeSetInfo codeSetInfo;
private final char[] decoderArray;
private final Map<Character, Byte> encoderMap;

private LatinCodec(String name) {
private LatinCodec(String name, CodeSetInfo csi) {
Charset cs = Charset.forName(name);
this.name = cs.name();
this.codeSetInfo = csi;
ByteBuffer bytes = range(0, 256)
.collect(() -> allocate(256), (bb, b) -> bb.put(b, (byte) b), neverCombine());
CharBuffer chars = cs.decode(bytes);
Expand All @@ -109,6 +119,9 @@ public char readChar(ReadBuffer in) {
@Override
public String name() { return name; }

@Override
public CodeSetInfo getCodeSetInfo() { return codeSetInfo; }

@Override
public boolean equals(Object o) {
if (!(o instanceof LatinCodec)) return false;
Expand All @@ -120,4 +133,7 @@ public boolean equals(Object o) {
public int hashCode() {
return Objects.hashCode(name);
}

@Override
public String toString() { return name; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,24 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.apache.yoko.orb.codecs;
package org.apache.yoko.codecs;

import org.apache.yoko.io.ReadBuffer;
import org.apache.yoko.io.WriteBuffer;
import org.apache.yoko.orb.OB.CodeSetInfo;

import static org.apache.yoko.orb.codecs.Util.expect7bit;
import static org.apache.yoko.orb.codecs.Util.require7bit;
import static org.apache.yoko.orb.codecs.Util.require8bit;
import static org.apache.yoko.codecs.Util.expect7bit;
import static org.apache.yoko.codecs.Util.require7bit;
import static org.apache.yoko.codecs.Util.require8bit;

enum SimpleCharCodec implements CharCodec {
US_ASCII {
public CodeSetInfo getCodeSetInfo() { return CodeSetInfo.ISO_646_IRV; }
public char readChar(ReadBuffer in) { return expect7bit(in.readByteAsChar()); }
public void writeChar(char c, WriteBuffer out) { out.writeByte(require7bit(c)); }
},
ISO_LATIN_1 {
public CodeSetInfo getCodeSetInfo() { return CodeSetInfo.ISO_LATIN_1; }
public char readChar(ReadBuffer in) { return in.readByteAsChar(); } // no checking - a single-byte character can't be > 0xFF
public void writeChar(char c, WriteBuffer out) { out.writeByte(require8bit(c)); }
}
Expand Down
Loading