From 8c30cefcd786796ca19faad9c18724725f1ee603 Mon Sep 17 00:00:00 2001 From: Alvari Date: Tue, 28 Jan 2025 12:05:47 +1300 Subject: [PATCH] refactored the Bigram class to use a stringbuilder when concatenating a string in a loop to boost performance. ALso added override annotations to the AtomicFloat class --- .../java/org/apdplat/word/corpus/Bigram.java | 365 +++++++++--------- .../org/apdplat/word/util/AtomicFloat.java | 208 +++++----- 2 files changed, 286 insertions(+), 287 deletions(-) diff --git a/src/main/java/org/apdplat/word/corpus/Bigram.java b/src/main/java/org/apdplat/word/corpus/Bigram.java index 82610ce2..2adb84cd 100644 --- a/src/main/java/org/apdplat/word/corpus/Bigram.java +++ b/src/main/java/org/apdplat/word/corpus/Bigram.java @@ -1,25 +1,23 @@ /** - * - * APDPlat - Application Product Development Platform - * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, + * yang-shangchuan@qq.com + * + *

This program is free software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + *

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * + * + *

You should have received a copy of the GNU General Public License along with this program. If + * not, see . */ - package org.apdplat.word.corpus; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.apdplat.word.segmentation.Word; import org.apdplat.word.util.AutoDetector; import org.apdplat.word.util.DoubleArrayGenericTrie; @@ -28,195 +26,196 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - /** * 二元语法模型 + * * @author 杨尚川 */ public class Bigram { - private static final Logger LOGGER = LoggerFactory.getLogger(Bigram.class); - private static final DoubleArrayGenericTrie DOUBLE_ARRAY_GENERIC_TRIE = new DoubleArrayGenericTrie(WordConfTools.getInt("bigram.double.array.trie.size", 5300000)); - private static int maxFrequency = 0; - static{ - reload(); + private static final Logger LOGGER = LoggerFactory.getLogger(Bigram.class); + private static final DoubleArrayGenericTrie DOUBLE_ARRAY_GENERIC_TRIE = + new DoubleArrayGenericTrie(WordConfTools.getInt("bigram.double.array.trie.size", 5300000)); + private static int maxFrequency = 0; + + static { + reload(); + } + + public static void reload() { + if (!"bigram".equals(WordConfTools.get("ngram", "bigram"))) { + LOGGER.info("未启用bigram"); + return; } - public static void reload(){ - if(!"bigram".equals(WordConfTools.get("ngram", "bigram"))){ - LOGGER.info("未启用bigram"); - return; - } - AutoDetector.loadAndWatch(new ResourceLoader(){ + AutoDetector.loadAndWatch( + new ResourceLoader() { - @Override - public void clear() { - DOUBLE_ARRAY_GENERIC_TRIE.clear(); - } + @Override + public void clear() { + DOUBLE_ARRAY_GENERIC_TRIE.clear(); + } - @Override - public void load(List lines) { - LOGGER.info("初始化bigram"); - Map map = new HashMap<>(); - for(String line : lines){ - try{ - addLine(line, map); - }catch(Exception e){ - LOGGER.error("错误的bigram数据:"+line); - } - } - int size = map.size(); - DOUBLE_ARRAY_GENERIC_TRIE.putAll(map); - LOGGER.info("bigram初始化完毕,bigram数据条数:" + size); + @Override + public void load(List lines) { + LOGGER.info("初始化bigram"); + Map map = new HashMap<>(); + for (String line : lines) { + try { + addLine(line, map); + } catch (Exception e) { + LOGGER.error("错误的bigram数据:" + line); + } } + int size = map.size(); + DOUBLE_ARRAY_GENERIC_TRIE.putAll(map); + LOGGER.info("bigram初始化完毕,bigram数据条数:" + size); + } - @Override - public void add(String line) { - throw new RuntimeException("not yet support menthod!"); - } + @Override + public void add(String line) { + throw new RuntimeException("not yet support menthod!"); + } - private void addLine(String line, Map map){ - String[] attr = line.split("\\s+"); - int frequency = Integer.parseInt(attr[1]); - if(frequency > maxFrequency){ - maxFrequency = frequency; - } - map.put(attr[0], frequency); + private void addLine(String line, Map map) { + String[] attr = line.split("\\s+"); + int frequency = Integer.parseInt(attr[1]); + if (frequency > maxFrequency) { + maxFrequency = frequency; } + map.put(attr[0], frequency); + } - @Override - public void remove(String line) { - throw new RuntimeException("not yet support menthod!"); - } - - }, WordConfTools.get("bigram.path", "classpath:bigram.txt")); - } + @Override + public void remove(String line) { + throw new RuntimeException("not yet support menthod!"); + } + }, + WordConfTools.get("bigram.path", "classpath:bigram.txt")); + } - public static int getMaxFrequency() { - return maxFrequency; - } + public static int getMaxFrequency() { + return maxFrequency; + } - /** - * 含有语境的二元模型分值算法 - * 计算多种分词结果的分值 - * 利用获得的二元模型分值重新计算分词结果的分值 - * 补偿细粒度切分获得分值而粗粒度切分未获得分值的情况 - * @param sentences 多种分词结果 - * @return 分词结果及其对应的分值 - */ - public static Map, Float> bigram(List... sentences){ - Map, Float> map = new HashMap<>(); - Map bigramScores = new HashMap<>(); - //两个连续的bigram补偿粗粒度分值 - //如:美国, 加州, 大学,如果美国, 加州和加州, 大学有分值 - //则美国加州大学也会获得分值 - Map twoBigramScores = new HashMap<>(); - //1、计算多种分词结果的分值 - for(List sentence : sentences){ - if(map.get(sentence) != null){ - continue; + /** + * 含有语境的二元模型分值算法 计算多种分词结果的分值 利用获得的二元模型分值重新计算分词结果的分值 补偿细粒度切分获得分值而粗粒度切分未获得分值的情况 + * + * @param sentences 多种分词结果 + * @return 分词结果及其对应的分值 + */ + public static Map, Float> bigram(List... sentences) { + Map, Float> map = new HashMap<>(); + Map bigramScores = new HashMap<>(); + // 两个连续的bigram补偿粗粒度分值 + // 如:美国, 加州, 大学,如果美国, 加州和加州, 大学有分值 + // 则美国加州大学也会获得分值 + Map twoBigramScores = new HashMap<>(); + // 1、计算多种分词结果的分值 + for (List sentence : sentences) { + if (map.get(sentence) != null) { + continue; + } + float score = 0; + // 计算其中一种分词结果的分值 + if (sentence.size() > 1) { + StringBuilder last = new StringBuilder(""); + for (int i = 0; i < sentence.size() - 1; i++) { + String first = sentence.get(i).getText(); + String second = sentence.get(i + 1).getText(); + float bigramScore = getScore(first, second); + if (bigramScore > 0) { + if (last.toString().endsWith(first)) { + twoBigramScores.put(last + second, bigramScores.get(last) + bigramScore); + last.setLength(0); } - float score=0; - //计算其中一种分词结果的分值 - if(sentence.size() > 1){ - String last=""; - for(int i=0; i 0){ - if(last.endsWith(first)){ - twoBigramScores.put(last+second, bigramScores.get(last)+bigramScore); - last=""; - } - last = first+second; - bigramScores.put(last, bigramScore); - score += bigramScore; - } - } - } - map.put(sentence, score); + last.append(first + second); + bigramScores.put(last.toString(), bigramScore); + score += bigramScore; + } } - //2、利用获得的二元模型分值重新计算分词结果的分值 - //补偿细粒度切分获得分值而粗粒度切分未获得分值的情况 - //计算多种分词结果的分值 - if(bigramScores.size() > 0 || twoBigramScores.size() > 0){ - for(List sentence : map.keySet()){ - //计算其中一种分词结果的分值 - for(Word word : sentence){ - Float bigramScore = bigramScores.get(word.getText()); - Float twoBigramScore = twoBigramScores.get(word.getText()); - Float[] array = {bigramScore, twoBigramScore}; - for(Float score : array){ - if(score !=null && score > 0){ - if(LOGGER.isDebugEnabled()) { - LOGGER.debug(word.getText() + " 获得分值:" + score); - } - float value = map.get(sentence); - value += score; - map.put(sentence, value); - } - } - } - } - } - - return map; + } + map.put(sentence, score); } - public static float sentenceScore(List words){ - if(words.size() > 1){ - float total = words.size() - 1; - float match = 0; - for(int i=0; i 0){ - match++; - } + // 2、利用获得的二元模型分值重新计算分词结果的分值 + // 补偿细粒度切分获得分值而粗粒度切分未获得分值的情况 + // 计算多种分词结果的分值 + if (bigramScores.size() > 0 || twoBigramScores.size() > 0) { + for (List sentence : map.keySet()) { + // 计算其中一种分词结果的分值 + for (Word word : sentence) { + Float bigramScore = bigramScores.get(word.getText()); + Float twoBigramScore = twoBigramScores.get(word.getText()); + Float[] array = {bigramScore, twoBigramScore}; + for (Float score : array) { + if (score != null && score > 0) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug(word.getText() + " 获得分值:" + score); + } + float value = map.get(sentence); + value += score; + map.put(sentence, value); } - return match/total; + } } - return 0; + } } - /** - * 计算分词结果的二元模型分值 - * @param words 分词结果 - * @return 二元模型分值 - */ - public static float bigram(List words){ - if(words.size() > 1){ - float score=0; - for(int i=0; i words) { + if (words.size() > 1) { + float total = words.size() - 1; + float match = 0; + for (int i = 0; i < words.size() - 1; i++) { + if (getScore(words.get(i).getText(), words.get(i + 1).getText()) > 0) { + match++; } - return 0; + } + return match / total; } - /** - * 获取两个词一前一后紧挨着同时出现在语料库中的分值 - * 分值被归一化了: - * 完全没有出现分值为0 - * 出现频率最高的分值为1 - * @param first 前一个词 - * @param second 后一个词 - * @return 同时出现的分值 - */ - public static float getScore(String first, String second) { - int frequency = getFrequency(first, second); - float score = frequency/(float)maxFrequency; - if(LOGGER.isDebugEnabled()) { - if(score>0) { - LOGGER.debug("二元模型 " + first + ":" + second + " 获得分值:" + score); - } - } - return score; + return 0; + } + + /** + * 计算分词结果的二元模型分值 + * + * @param words 分词结果 + * @return 二元模型分值 + */ + public static float bigram(List words) { + if (words.size() > 1) { + float score = 0; + for (int i = 0; i < words.size() - 1; i++) { + score += getScore(words.get(i).getText(), words.get(i + 1).getText()); + } + return score; } + return 0; + } - public static int getFrequency(String first, String second) { - Integer value = DOUBLE_ARRAY_GENERIC_TRIE.get(first+":"+second); - if(value == null || value < 0){ - return 0; - } - return value; + /** + * 获取两个词一前一后紧挨着同时出现在语料库中的分值 分值被归一化了: 完全没有出现分值为0 出现频率最高的分值为1 + * + * @param first 前一个词 + * @param second 后一个词 + * @return 同时出现的分值 + */ + public static float getScore(String first, String second) { + int frequency = getFrequency(first, second); + float score = frequency / (float) maxFrequency; + if (LOGGER.isDebugEnabled()) { + if (score > 0) { + LOGGER.debug("二元模型 " + first + ":" + second + " 获得分值:" + score); + } + } + return score; + } + + public static int getFrequency(String first, String second) { + Integer value = DOUBLE_ARRAY_GENERIC_TRIE.get(first + ":" + second); + if (value == null || value < 0) { + return 0; } -} \ No newline at end of file + return value; + } +} diff --git a/src/main/java/org/apdplat/word/util/AtomicFloat.java b/src/main/java/org/apdplat/word/util/AtomicFloat.java index 2e53b0ff..75afceac 100644 --- a/src/main/java/org/apdplat/word/util/AtomicFloat.java +++ b/src/main/java/org/apdplat/word/util/AtomicFloat.java @@ -1,120 +1,120 @@ /** + * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, + * yang-shangchuan@qq.com * - * APDPlat - Application Product Development Platform - * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com + *

This program is free software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + *

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * + *

You should have received a copy of the GNU General Public License along with this program. If + * not, see . */ - package org.apdplat.word.util; import java.util.concurrent.atomic.AtomicInteger; /** - * 因为Java没有提供AtomicFloat - * 所以自己实现一个 + * 因为Java没有提供AtomicFloat 所以自己实现一个 + * * @author 杨尚川 */ public class AtomicFloat extends Number { - private AtomicInteger bits; - - public AtomicFloat() { - this(0f); - } - - public AtomicFloat(float initialValue) { - bits = new AtomicInteger(Float.floatToIntBits(initialValue)); - } - - public final float addAndGet(float delta){ - float expect; - float update; - do { - expect = get(); - update = expect + delta; - } while(!this.compareAndSet(expect, update)); - - return update; - } - - public final float getAndAdd(float delta){ - float expect; - float update; - do { - expect = get(); - update = expect + delta; - } while(!this.compareAndSet(expect, update)); - - return expect; - } - - public final float getAndDecrement(){ - return getAndAdd(-1); - } - - public final float decrementAndGet(){ - return addAndGet(-1); - } - - public final float getAndIncrement(){ - return getAndAdd(1); - } - - public final float incrementAndGet(){ - return addAndGet(1); - } - - public final float getAndSet(float newValue) { - float expect; - do { - expect = get(); - } while(!this.compareAndSet(expect, newValue)); - - return expect; - } - - public final boolean compareAndSet(float expect, float update) { - return bits.compareAndSet(Float.floatToIntBits(expect), Float.floatToIntBits(update)); - } - - public final void set(float newValue) { - bits.set(Float.floatToIntBits(newValue)); - } - - public final float get() { - return Float.intBitsToFloat(bits.get()); - } - - public float floatValue() { - return get(); - } - - public double doubleValue() { - return (double) floatValue(); - } - - public int intValue() { - return (int) get(); - } - - public long longValue() { - return (long) get(); - } - - public String toString() { - return Float.toString(get()); - } + private AtomicInteger bits; + + public AtomicFloat() { + this(0f); + } + + public AtomicFloat(float initialValue) { + bits = new AtomicInteger(Float.floatToIntBits(initialValue)); + } + + public final float addAndGet(float delta) { + float expect; + float update; + do { + expect = get(); + update = expect + delta; + } while (!this.compareAndSet(expect, update)); + + return update; + } + + public final float getAndAdd(float delta) { + float expect; + float update; + do { + expect = get(); + update = expect + delta; + } while (!this.compareAndSet(expect, update)); + + return expect; + } + + public final float getAndDecrement() { + return getAndAdd(-1); + } + + public final float decrementAndGet() { + return addAndGet(-1); + } + + public final float getAndIncrement() { + return getAndAdd(1); + } + + public final float incrementAndGet() { + return addAndGet(1); + } + + public final float getAndSet(float newValue) { + float expect; + do { + expect = get(); + } while (!this.compareAndSet(expect, newValue)); + + return expect; + } + + public final boolean compareAndSet(float expect, float update) { + return bits.compareAndSet(Float.floatToIntBits(expect), Float.floatToIntBits(update)); + } + + public final void set(float newValue) { + bits.set(Float.floatToIntBits(newValue)); + } + + public final float get() { + return Float.intBitsToFloat(bits.get()); + } + + @Override + public float floatValue() { + return get(); + } + + @Override + public double doubleValue() { + return (double) floatValue(); + } + + @Override + public int intValue() { + return (int) get(); + } + + @Override + public long longValue() { + return (long) get(); + } + + @Override + public String toString() { + return Float.toString(get()); + } }