DenoisingUtil.java
5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/**
 * @author Sherlock_yb
 * Created: 2014-09-25
 */
package nlp.whu.utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.log4j.Logger;
/**
 * Keyword denoising utility: converts the filter-keyterm dictionary between a
 * plain-text form (one keyterm per line) and a Java-serialized {@code Set<String>}.
 *
 * <p>The class is a singleton; obtain the instance through {@link #getInstance()}.
 * I/O failures are logged through log4j and are not propagated to the caller.
 */
public class DenoisingUtil {
    private static final Logger log = Log4jUtil.getInstance().getLogger(DenoisingUtil.class);
    private static final DenoisingUtil denoisingUtil = new DenoisingUtil();

    /** Default path of the plain-text keyterm file used by {@link #main(String[])}. */
    public static final String txtFilename = "filterKeytemsTxt";
    /** Default path of the serialized keyterm file used by {@link #main(String[])}. */
    public static final String objectFilename = "filterKeytemsDict";
    /** Default character set used for the plain-text keyterm file. */
    public static final String charsetName = "UTF-8";

    private DenoisingUtil() {
        // singleton, nothing to initialise
    }

    /**
     * @return the single shared instance of this utility
     */
    public static DenoisingUtil getInstance() {
        return denoisingUtil;
    }

    /**
     * Reads a text file line by line, trims every line, drops empty lines and
     * duplicates, and serializes the resulting set to {@code objectFilename}.
     * The object file is not written when the text file cannot be read completely.
     *
     * @param txtFilename    path of the text file to read
     * @param charsetName    name of the character set of the text file
     * @param objectFilename path of the object file to write
     */
    public void txtFile2ObjectFile(String txtFilename, String charsetName, String objectFilename) {
        // Insertion-ordered, so converting back to text keeps the original line order.
        Set<String> keyterms = new LinkedHashSet<String>();
        FileInputStream in = null;
        BufferedReader br = null;
        try {
            in = new FileInputStream(txtFilename);
            br = new BufferedReader(new InputStreamReader(in, charsetName));
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // A byte-order mark is not whitespace for trim(); without this it
                    // would become part of the first keyterm.
                    if (line.length() > 0 && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                String keyterm = line.trim();
                if (keyterm.length() > 0) {
                    keyterms.add(keyterm);
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            log.error("read txt file failed: " + txtFilename, e);
            return;
        } finally {
            closeQuietly(br, in, "close br failed");
        }
        outputObjectFile(keyterms, objectFilename);
    }

    /**
     * Loads the serialized keyterm set from {@code objectFilename} and writes it to
     * {@code txtFilename}, one keyterm per line terminated by {@code "\n"}.
     * The text file is left untouched when the object file cannot be loaded.
     *
     * @param objectFilename path of the object file to read
     * @param txtFilename    path of the text file to write
     * @param charsetName    name of the character set used for the text file
     */
    public void objectFile2TxtFile(String objectFilename, String txtFilename, String charsetName) {
        Set<String> filterKeyterms = new LinkedHashSet<String>();
        if (!readObjectFile(filterKeyterms, objectFilename)) {
            // Opening the output would truncate it; do not replace it with an empty file.
            log.error("skip writing " + txtFilename + ": could not load " + objectFilename);
            return;
        }
        FileOutputStream out = null;
        BufferedWriter bw = null;
        try {
            out = new FileOutputStream(txtFilename);
            bw = new BufferedWriter(new OutputStreamWriter(out, charsetName));
            for (String keyterm : filterKeyterms) {
                bw.write(keyterm + "\n");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            log.error("write txt file failed: " + txtFilename, e);
        } finally {
            closeQuietly(bw, out, "close bw failed");
        }
    }

    /**
     * Serializes the given set to a file using Java object serialization.
     *
     * @param filterKeyterms the set to write
     * @param filename       path of the object file to write
     */
    public void outputObjectFile(Set<String> filterKeyterms, String filename) {
        FileOutputStream out = null;
        ObjectOutputStream oos = null;
        try {
            out = new FileOutputStream(filename);
            oos = new ObjectOutputStream(out);
            oos.writeObject(filterKeyterms);
        } catch (IOException e) {
            log.error("write object file failed: " + filename, e);
        } finally {
            closeQuietly(oos, out, "close oos failed");
        }
    }

    /**
     * Reads a serialized set of strings from a file and adds its elements to
     * {@code filterKeyterms}. On any failure the error is logged and
     * {@code filterKeyterms} is left unchanged.
     *
     * @param filterKeyterms the set that receives the loaded keyterms
     * @param filename       path of the object file to read
     */
    public void loadObjectFile(Set<String> filterKeyterms, String filename) {
        readObjectFile(filterKeyterms, filename);
    }

    /**
     * Does the work of {@link #loadObjectFile(Set, String)} and reports the outcome.
     *
     * @return {@code true} if the file was read and its content added to
     *         {@code filterKeyterms}; {@code false} if reading failed or the file
     *         did not hold a set of strings
     */
    private boolean readObjectFile(Set<String> filterKeyterms, String filename) {
        FileInputStream in = null;
        ObjectInputStream ois = null;
        try {
            in = new FileInputStream(filename);
            // NOTE: Java native deserialization; only use this on trusted dictionary files.
            ois = new ObjectInputStream(in);
            Object stored = ois.readObject();
            if (!(stored instanceof Set)) {
                log.error("object file does not contain a Set: " + filename);
                return false;
            }
            // Validate every element before touching the caller's set, so a bad file
            // neither pollutes it with non-String values nor leaves it half filled.
            Set<String> validated = new LinkedHashSet<String>();
            for (Object element : (Set<?>) stored) {
                if (element != null && !(element instanceof String)) {
                    log.error("object file contains a non-String element: " + filename);
                    return false;
                }
                validated.add((String) element);
            }
            filterKeyterms.addAll(validated);
            return true;
        } catch (IOException e) {
            log.error("read object file failed: " + filename, e);
            return false;
        } catch (ClassNotFoundException e) {
            log.error("read object file failed: " + filename, e);
            return false;
        } finally {
            closeQuietly(ois, in, "close ois failed");
        }
    }

    /**
     * Closes {@code wrapper}, or {@code raw} when the wrapper was never created
     * (its constructor threw), and logs instead of throwing if closing fails.
     */
    private static void closeQuietly(java.io.Closeable wrapper, java.io.Closeable raw, String failureMessage) {
        java.io.Closeable target = wrapper != null ? wrapper : raw;
        if (target == null) {
            return;
        }
        try {
            target.close();
        } catch (IOException e) {
            log.error(failureMessage, e);
        }
    }

    /**
     * Writes a built-in keyterm list to the default object file and then dumps
     * that object file to the default text file. Also prints the list to stdout.
     */
    public void test() {
        String filter = "在这里|版纳|流连依旧|流连忘返|遥远|难听|西双版纳|树立|zW|这里|那里|按时睡|美丽|打扰|转载|觉得|少吃盐 |家和比啥都重要|"
                + "人们|你们|我们|他们|她们|它们|咱们|"
                + "那家|哪家|这家|一条|哪里|"
                + "今天|昨天|明天|后天|前天|本周|上周|下周|"
                + "召开|"
                + "详情|多云|阵雨|阴有小|多云间晴|晴转多云|微风|中雨|间晴|阴有大雨|天气预报|周边天气|遥远|截至|请有关|预计未来12小时内|小雨|"
                + "想去|只需|油腻|不错|感觉|特级|哈哈|嘻嘻|呜呜|嘿嘿|呵呵|谢谢|"
                + "如果想|就去|过桥|"
                + "爸爸|妈妈|朋友|"
                + "微博|博文|网站|阅读|全文|分享|信息网|县至网传|第一门户|推荐|咨询|热线|地址|县至|地貌|路上|江边|论坛|帮转|"
                + "什么|怎么|不会|告诉|自己|那么|这么|今晚|早安|午安|晚安|"
                + "泸沽湖|白药|艳遇|上传|下载|本网讯|72拐|"
                + "百度贴|"
                + "mdash|url|http|null|nbsp|"
                + "转发理由|味道|鲜花饼";
        String[] strArray = filter.split("\\|");
        System.out.println(Arrays.toString(strArray));
        Set<String> strs = new LinkedHashSet<String>();
        strs.addAll(Arrays.asList(strArray));
        outputObjectFile(strs, objectFilename);
        objectFile2TxtFile(objectFilename, txtFilename, charsetName);
    }

    /**
     * Command-line entry point. Expects exactly one argument:
     * {@code 1} converts the default text file to the default object file,
     * {@code 2} converts the default object file to the default text file.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        DenoisingUtil util = DenoisingUtil.getInstance();
        if (args.length != 1) {
            log.info("need just one argument: 1 or 2");
            log.info("1 : txtFile to ObjectFile; 2 : ObjectFile to txtFile, exit");
            System.exit(0);
        }
        if ("1".equals(args[0])) {
            log.info("convert txtFile to ObjectFile.");
            util.txtFile2ObjectFile(txtFilename, charsetName, objectFilename);
        } else if ("2".equals(args[0])) {
            log.info("convert ObjectFile to txtFile");
            util.objectFile2TxtFile(objectFilename, txtFilename, charsetName);
        } else {
            log.info("argument must be 1 or 2");
            // Nothing was converted, so do not report "done".
            return;
        }
        log.info("done");
    }
}