// DenoisingUtil.java 5.83 KB
/**
 * @author Sherlock_yb
 * Created: 2014-09-25
 */
package nlp.whu.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.log4j.Logger;

/**
 * Keyword denoising tool.
 * <p>
 * Maintains the list of "filter keyterms" in two interchangeable on-disk forms:
 * a plain-text file with one keyterm per line, and a Java-serialized
 * {@code Set<String>} ("object file"). The class converts between the two forms
 * and can be run from the command line (see {@link #main(String[])}).
 * <p>
 * I/O failures are logged through log4j and are not rethrown to the caller.
 */
public class DenoisingUtil {
	private static final Logger log = Log4jUtil.getInstance().getLogger(DenoisingUtil.class);
	/** The single shared instance; the class declares no instance fields. */
	private static final DenoisingUtil denoisingUtil = new DenoisingUtil();
	
	/** Default path of the plain-text keyterm file used by {@link #main(String[])} and {@link #test()}. */
	public static final String txtFilename = "filterKeytemsTxt";
	/** Default path of the serialized keyterm file used by {@link #main(String[])} and {@link #test()}. */
	public static final String objectFilename = "filterKeytemsDict";
	/** Default charset name used for the plain-text keyterm file. */
	public static final String charsetName = "UTF-8";

	private DenoisingUtil(){
		// singleton, nothing to initialize
	}

	/**
	 * @return the shared {@code DenoisingUtil} instance
	 */
	public static DenoisingUtil getInstance(){
		return denoisingUtil;
	}

	/**
	 * Converts a plain-text keyterm file into a serialized object file.
	 * <p>
	 * Every line is trimmed, blank lines are skipped and duplicate keyterms are
	 * stored once. The keyterms are stored in a {@link LinkedHashSet}, so the
	 * order of first appearance in the text file is kept. When the text file
	 * cannot be read completely the failure is logged and no object file is written.
	 *
	 * @param txtFilename    path of the text file to read
	 * @param charsetName    name of the charset the text file is encoded in
	 * @param objectFilename path of the object file to write
	 */
	public void txtFile2ObjectFile(String txtFilename, String charsetName, String objectFilename){
		Set<String> keyterms = readKeyterms(txtFilename, charsetName);
		if(keyterms == null){
			// The read failure has been logged already; never persist a partial list.
			return;
		}
		outputObjectFile(keyterms, objectFilename);
	}

	/**
	 * Converts a serialized object file back into a plain-text file, one keyterm
	 * per line, each line terminated by {@code "\n"}.
	 * <p>
	 * The text file is only opened (and thereby truncated) after the object file
	 * has been loaded successfully, so a missing or corrupt object file does not
	 * wipe an existing text file.
	 *
	 * @param objectFilename path of the object file to read
	 * @param txtFilename    path of the text file to write
	 * @param charsetName    name of the charset used to encode the text file
	 */
	public void objectFile2TxtFile(String objectFilename, String txtFilename, String charsetName){
		Set<String> filterKeyterms = new LinkedHashSet<String>();
		if(!readObjectFileInto(filterKeyterms, objectFilename)){
			log.error("skip writing " + txtFilename + ": " + objectFilename + " could not be loaded");
			return;
		}
		FileOutputStream fos = null;
		BufferedWriter bw = null;
		try {
			fos = new FileOutputStream(txtFilename);
			bw = new BufferedWriter(new OutputStreamWriter(fos, charsetName));
			for(String keyterm : filterKeyterms){
				bw.write(keyterm);
				bw.write('\n');
			}
		} catch (UnsupportedEncodingException e) {
			log.error("unsupported charset " + charsetName + " for " + txtFilename, e);
		} catch (FileNotFoundException e) {
			log.error("cannot open " + txtFilename + " for writing", e);
		} catch (IOException e) {
			log.error("failed to write " + txtFilename, e);
		}finally{
			closeQuietly(bw, fos, "writer for " + txtFilename);
		}
	}

	/**
	 * Serializes the given keyterm set to a file using Java object serialization.
	 * Failures are logged, not rethrown.
	 *
	 * @param filterKeyterms the set to serialize
	 * @param filename       path of the object file to write
	 */
	public void outputObjectFile(Set<String> filterKeyterms, String filename){
		FileOutputStream fos = null;
		ObjectOutputStream oos = null;
		try {
			fos = new FileOutputStream(filename);
			// The ObjectOutputStream constructor already writes a stream header and may throw.
			oos = new ObjectOutputStream(fos);
			oos.writeObject(filterKeyterms);
		} catch (FileNotFoundException e) {
			log.error("cannot open " + filename + " for writing", e);
		} catch (IOException e) {
			log.error("failed to write object file " + filename, e);
		}finally{
			closeQuietly(oos, fos, "object output stream for " + filename);
		}
	}

	/**
	 * Loads the keyterms stored in a serialized object file and adds them to the
	 * given set. On failure the problem is logged and the set is left unchanged.
	 * <p>
	 * NOTE(review): this uses Java native deserialization; only point it at
	 * object files from a trusted source.
	 *
	 * @param filterKeyterms destination set that receives the loaded keyterms
	 * @param filename       path of the object file to read
	 */
	public void loadObjectFile(Set<String> filterKeyterms, String filename){
		readObjectFileInto(filterKeyterms, filename);
	}

	/**
	 * Development helper: builds the built-in filter list, writes it to
	 * {@link #objectFilename} and then exports it to {@link #txtFilename}.
	 * The split list is also printed to standard output.
	 */
	public void test(){
		// NOTE(review): entries are stored exactly as written here (no trimming),
		// so "少吃盐 " keeps its trailing space — confirm that is intended.
		String filter = "在这里|版纳|流连依旧|流连忘返|遥远|难听|西双版纳|树立|zW|这里|那里|按时睡|美丽|打扰|转载|觉得|少吃盐 |家和比啥都重要|"
				+ "人们|你们|我们|他们|她们|它们|咱们|"
				+ "那家|哪家|这家|一条|哪里|"
				+ "今天|昨天|明天|后天|前天|本周|上周|下周|"
				+ "召开|"
				+ "详情|多云|阵雨|阴有小|多云间晴|晴转多云|微风|中雨|间晴|阴有大雨|天气预报|周边天气|遥远|截至|请有关|预计未来12小时内|小雨|"
				+ "想去|只需|油腻|不错|感觉|特级|哈哈|嘻嘻|呜呜|嘿嘿|呵呵|谢谢|"
				+ "如果想|就去|过桥|"
				+ "爸爸|妈妈|朋友|"
				+ "微博|博文|网站|阅读|全文|分享|信息网|县至网传|第一门户|推荐|咨询|热线|地址|县至|地貌|路上|江边|论坛|帮转|"
				+ "什么|怎么|不会|告诉|自己|那么|这么|今晚|早安|午安|晚安|"
				+ "泸沽湖|白药|艳遇|上传|下载|本网讯|72拐|"
				+ "百度贴|"
				+ "mdash|url|http|null|nbsp|"
				+ "转发理由|味道|鲜花饼";
		String[] strArray = filter.split("\\|");
		System.out.println(Arrays.toString(strArray));
		Set<String> strs = new LinkedHashSet<String>(Arrays.asList(strArray));
		outputObjectFile(strs, objectFilename);
		objectFile2TxtFile(objectFilename, txtFilename, charsetName);
	}

	/**
	 * Command-line entry point. Expects exactly one argument:
	 * {@code 1} converts the default text file to the default object file,
	 * {@code 2} converts the default object file to the default text file.
	 *
	 * @param args command-line arguments
	 */
	public static void main(String[] args){
		DenoisingUtil util = DenoisingUtil.getInstance();
		// To regenerate both files from the built-in list, call util.test() here instead.
		if(args.length != 1){
			log.info("need just one argument: 1 or 2");
			log.info("1 : txtFile to ObjectFile; 2 : ObjectFile to txtFile, exit");
			System.exit(0);
		}
		String mode = args[0];
		if("1".equals(mode)){
			log.info("convert txtFile to ObjectFile.");
			util.txtFile2ObjectFile(txtFilename, charsetName, objectFilename);
		}else if("2".equals(mode)){
			log.info("convert ObjectFile to txtFile");
			util.objectFile2TxtFile(objectFilename, txtFilename, charsetName);
		}else{
			log.info("argument must be 1 or 2");
		}
		log.info("done");
	}

	/**
	 * Reads the trimmed, non-blank lines of a text file in order of first appearance.
	 *
	 * @return the keyterms, or {@code null} when reading failed (the failure is logged)
	 */
	private Set<String> readKeyterms(String txtFilename, String charsetName){
		FileInputStream fis = null;
		BufferedReader br = null;
		try {
			fis = new FileInputStream(txtFilename);
			br = new BufferedReader(new InputStreamReader(fis, charsetName));
			Set<String> keyterms = new LinkedHashSet<String>();
			String line;
			while((line = br.readLine()) != null){
				String keyterm = line.trim();
				if(keyterm.length() > 0){
					keyterms.add(keyterm);
				}
			}
			return keyterms;
		} catch (UnsupportedEncodingException e) {
			log.error("unsupported charset " + charsetName + " for " + txtFilename, e);
		} catch (FileNotFoundException e) {
			log.error("cannot open " + txtFilename, e);
		} catch (IOException e) {
			log.error("failed to read " + txtFilename, e);
		}finally{
			closeQuietly(br, fis, "reader for " + txtFilename);
		}
		return null;
	}

	/**
	 * Deserializes a keyterm set from an object file and adds its String elements
	 * to the destination set. Elements that are not Strings are skipped with a warning.
	 *
	 * @return {@code true} when the file was read and contained a {@code Set},
	 *         {@code false} otherwise (the failure is logged)
	 */
	private boolean readObjectFileInto(Set<String> filterKeyterms, String filename){
		FileInputStream fis = null;
		ObjectInputStream ois = null;
		try {
			fis = new FileInputStream(filename);
			// The ObjectInputStream constructor reads and verifies the stream header and may throw.
			ois = new ObjectInputStream(fis);
			Object loaded = ois.readObject();
			if(!(loaded instanceof Set)){
				String actual = (loaded == null) ? "null" : loaded.getClass().getName();
				log.error(filename + " does not contain a Set but " + actual);
				return false;
			}
			for(Object element : (Set<?>) loaded){
				if(element instanceof String){
					filterKeyterms.add((String) element);
				}else{
					log.warn("ignore non-String element in " + filename + ": " + element);
				}
			}
			return true;
		} catch (FileNotFoundException e) {
			log.error("cannot open " + filename, e);
		} catch (IOException e) {
			log.error("failed to read object file " + filename, e);
		} catch (ClassNotFoundException e) {
			log.error("unknown class in object file " + filename, e);
		}finally{
			closeQuietly(ois, fis, "object input stream for " + filename);
		}
		return false;
	}

	/**
	 * Closes the wrapping stream when it was created (closing it also closes the
	 * file stream it wraps); otherwise closes the raw file stream, which is still
	 * open when the wrapper's constructor threw. A close failure is only logged.
	 */
	private static void closeQuietly(java.io.Closeable wrapper, java.io.Closeable rawStream, String description){
		java.io.Closeable target = (wrapper != null) ? wrapper : rawStream;
		if(target == null){
			return;
		}
		try {
			target.close();
		} catch (IOException e) {
			log.error("close " + description + " failed", e);
		}
	}
}