atupal · Byang119 · Mar 28, 2013 · Aug 23, 2013 · Aug 6, 2017
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
 nlp
 ===
 
-自然语言处理，目前想识别的是帮短信分级，如垃圾短信，重要短信，一般短信，群发短信等等
+自然语言处理，目前想识别的是帮短信分级，如垃圾短信，重要短信，一般短信，群发短信等等
diff --git a/split/.idea/inspectionProfiles/profiles_settings.xml b/split/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/split/.idea/misc.xml b/split/.idea/misc.xml
diff --git a/split/.idea/modules.xml b/split/.idea/modules.xml
diff --git a/split/.idea/split.iml b/split/.idea/split.iml
diff --git a/split/.idea/workspace.xml b/split/.idea/workspace.xml
diff --git a/split/README.md b/split/README.md
@@ -0,0 +1,4 @@
+nlp
+===
+
+自然语言处理，目前想识别的是帮短信分级，如垃圾短信，重要短信，一般短信，群发短信等等
diff --git a/split/__init__.py b/split/__init__.py
diff --git a/split/bayes.py b/split/bayes.py
@@ -0,0 +1,71 @@
+#coding=utf-8
+"""Interactive labelling and word counting for SMS messages.
+
+Reads message bodies from ``message.xml``, asks the operator for a label for
+each one, records the labelled messages in ``message_flag`` and accumulates
+counts for the words yielded by mmseg's ``Algorithm``.
+"""
+import collections
+import re
+import xml.dom.minidom
+from mmseg import Algorithm, dict_load_defaults
+
+# NOTE(review): dict_load_defaults is imported but never called in this file;
+# confirm whether mmseg needs it invoked before Algorithm is used.
+
+
+class Bayes:
+    """Collects word counts from the SMS messages stored in message.xml."""
+
+    def __init__(self):
+        """Load the raw contents of message.xml and create empty counters."""
+        # The whole file is read up front, so the handle is closed right away;
+        # the attribute itself is kept so existing references still resolve.
+        self.message_xml = open('message.xml', "r")
+        try:
+            self.message = self.message_xml.read()
+        finally:
+            self.message_xml.close()
+        self.good = {}
+        self.bad = {}
+        # Unseen words default to 1, so a word seen k times is stored as k + 1.
+        self.total = collections.defaultdict(lambda: 1)
+
+    def get_messages(self):
+        """Label every message body interactively and count its words.
+
+        Each body found in ``self.message`` is printed, a label is read from
+        standard input, and the line ``<body> <label>`` is written to the
+        freshly truncated file ``message_flag``. Every word yielded by
+        ``Algorithm`` for the body increments ``self.total``. Finally, all
+        words and their counts are printed.
+
+        Raises:
+            UnicodeDecodeError: if ``self.message`` is not valid UTF-8.
+        """
+        pattern_message = re.compile(
+            r'(<Body><\!\[CDATA\[)(.+?)(\]\]></Body>)')
+        # Decode once at the boundary so every match is already unicode.
+        text = self.message.decode('utf-8')
+        # The with-block guarantees the labels are flushed and the file closed.
+        with open('message_flag', 'w') as message_flag:
+            for mes in pattern_message.findall(text):
+                body = mes[1]
+                print(body)
+                flag = raw_input()
+                message_flag.write(body.encode('utf-8') + ' ' + flag + '\n')
+                for word in Algorithm(body):
+                    self.total[word] += 1
+        for word, count in self.total.items():
+            print("%s %s" % (word, count))
+
+    def cuttest(self, text):
+        """Print the words of *text* joined by "/" and a separator line."""
+        words = list(Algorithm(text))
+        print("/".join(words))
+        print("================================")
+
+
+if __name__ == '__main__':
+    bayes = Bayes()
+    bayes.get_messages()