CVBB ACM Team

词频统计的一些做法

主要难点在大词典的读入与存储，和大文本在大词典中的搜索。
个人有几个思路，整理归纳一下。

存储

普通的数组直接pass，链表搜索起来太慢。

二叉查找树

可以把搜索的时间复杂度降到logn。
不过需要注意的是读入的字典是有序的，在创建二叉查找树时不经过一些处理会构建成最劣的链式结构。
因此需要应用到平衡二叉树的一些手段，具体可以参考自平衡二叉查找树
不过字典的情况比较特殊，字典是严格有序的，不需要运用到上面那些复杂的结构，有比较简单的处理方法
有序链表自然地将小元素划分到了左边，大元素划分到了右边。我们只要找出中间节点提为父节点即可。
找中间节点的方法也有很多，我这里用了快慢指针。

代码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include<ctype.h>
#define MAXWORD 32
int n[3]={0},now=0;
 
struct lnode{
	char word[MAXWORD];
	int count;
	struct lnode* next;
};
typedef struct lnode lNode;
typedef struct lnode* lNodeptr;
 
lNodeptr wl[3],r[3];
 
struct tnode{
	char word[MAXWORD];
	int count;
	struct tnode* left;
	struct tnode* right;
};
typedef struct tnode tNode;
typedef struct tnode* tNodeptr;
 
int getWord(FILE *fp,char *w)
{
	int c;
	while(!isalpha(c=fgetc(fp)))
		if(c==EOF) return c;
		else continue;
	do{
		*w++=tolower(c);
	}while(isalpha(c=fgetc(fp)));
	*w='\0';
	return 1;
}
 
tNodeptr ListToBst(lNodeptr head, lNodeptr tail){
	if (head==tail) return NULL;
	lNodeptr fast=head;
	lNodeptr slow=head;
	while (fast!=tail&&fast->next!=tail)
	{
		slow=slow->next;
		fast=fast->next->next;
	}
	tNodeptr root=(tNodeptr)malloc(sizeof(tNode));
	strcpy(root->word,slow->word);
	root->left=ListToBst(head,slow);
	root->right=ListToBst(slow->next,tail);
	return root;
}
 
int isword(tNodeptr list,char *w)
{
	if (strcmp(list->word,w)==0) return 1;
	else if (strcmp(list->word,w)<0&&list->right!=NULL) return isword(list->right,w);
	else if (strcmp(list->word,w)>0&&list->left!=NULL) return isword(list->left,w);
	return -1;
}
 
void insert(tNodeptr r,char x[1000],int t)
{
	if (strcmp(r->word,x)==0){
		r->count++;
	}
	else if (strcmp(r->word,x)>0) {
		if (r->left==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			strcpy(p->word,x);
			p->count=1;
			p->left=NULL;
			p->right=NULL;
			r->left=p;
		}
		else insert(r->left,x,t);
	}
	else {
		if (r->right==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			p->count=1;
			strcpy(p->word,x);
			p->left=NULL;
			p->right=NULL;
			r->right=p;
		}
		else insert(r->right,x,t); 
	}
}
 
void search(tNodeptr r,int n,int t)
{
	if (r!=NULL){
		search(r->left,n,t);
		if (wl[t]==NULL) {
			wl[t]=(lNodeptr)malloc(sizeof(lNode));
			strcpy(wl[t]->word,r->word);
			wl[t]->count=r->count;
			wl[t]->next=NULL;
		}
		else {
			int i=0;
			lNodeptr p=wl[t],pr;
			lNodeptr tmp=(lNodeptr)malloc(sizeof(lNode));
			strcpy(tmp->word,r->word);
			tmp->count=r->count;
			while (p->next!=NULL&&p->count>=r->count&&i<n)
			{
				pr=p;
				p=p->next;
				i++;
			}
			if (p->next==NULL) {
				if (p->count>=r->count) 
				{
					p->next=tmp;
					tmp->next=NULL;
				}
				else if (p==wl[t]){
					tmp->next=p;
					wl[t]=tmp;
				}
				else {
					pr->next=tmp;
					tmp->next=p;
				}
			}
			else if (p==wl[t]){
				tmp->next=p;
				wl[t]=tmp;
			}
			else {
				pr->next=tmp;
				tmp->next=p;
			}
		}
		search(r->right,n,t);
	}
}
 
double min1(double x,double y)
{
	if (x<y) return x;
	else return y;
}
 
double max1(double x,double y)
{
	if (x>y) return x;
	else return y;
}
 
int main()
{
	FILE *fp1,*fp2,*fstop,*dic,*res;
	fp1=fopen("article1.txt","r");
	fp2=fopen("article2.txt","r");
	fstop=fopen("stopwords.txt","r");
	dic=fopen("dictionary.txt","r");
	res=fopen("results.txt","w");
	lNodeptr Wordlist,Wordlists,end;
	Wordlist=NULL;
	Wordlists=NULL;
	char word[MAXWORD];
	int nn;
	scanf("%d",&nn);
	while (getWord(dic,word)!=EOF)
	{
		if (Wordlist==NULL) {
			Wordlist=(lNodeptr)malloc(sizeof(lNode));
			strcpy(Wordlist->word,word);
			Wordlist->next=NULL;
			end=Wordlist;
		}
		else {
			lNodeptr p=(lNodeptr)malloc(sizeof(lNode));
			strcpy(p->word,word);
			p->next=NULL;
			end->next=p;
			end=p;
		}
	}
	lNodeptr ends;
	while (getWord(fstop,word)!=EOF)
	{
		if (Wordlists==NULL) {
			Wordlists=(lNodeptr)malloc(sizeof(lNode));
			strcpy(Wordlists->word,word);
			Wordlists->next=NULL;
			ends=Wordlists;
		}
		else {
			lNodeptr p=(lNodeptr)malloc(sizeof(lNode));
			strcpy(p->word,word);
			p->next=NULL;
			ends->next=p;
			ends=p;
		}
	}
	tNodeptr wordlist,wordlists;
	wordlist=ListToBst(Wordlist, end->next);
	wordlists=ListToBst(Wordlists, end->next);
	tNodeptr wordlist1,wordlist2;
	if(fp1!=NULL)
	{
		while(getWord(fp1,word)!=EOF){
			if(isword(wordlist,word)==1&&isword(wordlists,word)==-1)
			{
				if (n[1]==0)
				{
					wordlist1=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist1->word,word);
					wordlist1->count=1;
					wordlist1->left=NULL;
					wordlist1->right=NULL;
					n[1]=1;
				}
				else insert(wordlist1,word,1);
			}
		}
	}
	if(fp2!=NULL)
	{
		while(getWord(fp2,word)!=EOF){
			if(isword(wordlist,word)==1&&isword(wordlists,word)==-1)
			{
				if (n[2]==0)
				{
					wordlist2=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist2->word,word);
					wordlist2->count=1;
					wordlist2->left=NULL;
					wordlist2->right=NULL;
					n[2]=1;
				}
				else insert(wordlist2,word,2);
			}
		}
	}
	int n1=min1(nn,n[1]);
	int n2=min1(nn,n[2]);
	search(wordlist1,n1,1);
	now=0;
	search(wordlist2,n2,2);
	int freq1=0,freq2=0,freqm1=0,freqm2=0;
	double pro1,pro2;
	int i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++)
	{
		freq1=freq1+r->count;
		int j=0;
		for (lNodeptr rr=wl[2];j<n2;rr=rr->next,j++)
			if (strcmp(r->word,rr->word)==0){
				freqm1=freqm1+r->count;
				freqm2=freqm2+rr->count;
			}
	}
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) freq2=freq2+r->count;
	pro1=(double)freqm1/(double)freq1;
	pro2=(double)freqm2/(double)freq2;
	double sim=min1(pro1,pro2)/max1(pro1,pro2);
	printf("%.5lf",sim);
	fprintf(res,"%3.5lf\n\n",sim);
	i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	fprintf(res,"\n");
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	getchar();
	getchar();
	return 0;
}

不过实测下来效果并不理想，算上比较字符串的时间即使是logn也是不可接受的。

Trie树

应对大词典的一个算法，以字母作为节点进行存储，节省存储空间的同时也能加快存储速度。
时间上只与最长单词长度相关。

代码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include<ctype.h>
#define MAXWORD 32
#define MULT  37
int n[3]={0},now=0;
 
struct lnode{
	char word[MAXWORD];
	int count;
	struct lnode* next;
};
typedef struct lnode lNode;
typedef struct lnode* lNodeptr;
 
struct trie{
	char isword;
	char isleaf;
	struct trie *ptr[26];
};
typedef struct trie Trie;
typedef struct trie* Trieptr;
 
Trieptr root,roots;
 
lNodeptr wl[3],r[3];
 
struct tnode{
	char word[MAXWORD];
	int count;
	struct tnode* left;
	struct tnode* right;
};
typedef struct tnode tNode;
typedef struct tnode* tNodeptr;
 
int getWord(FILE *fp,char *w)
{
	int c;
	while(!isalpha(c=fgetc(fp)))
		if(c==EOF) return c;
		else continue;
	do{
		*w++=tolower(c);
	}while(isalpha(c=fgetc(fp)));
	*w='\0';
	return 1;
}
 
int isword(Trieptr list,char *w)
{
	Trieptr p;
	for (p=list;*w!='\0';w++){
		if (p->ptr[*w-'a']==NULL) return -1;
		p=p->ptr[*w-'a'];
	}
	if (p->isword==1) return 1;
	else return -1;
}
 
void insert(tNodeptr r,char x[1000],int t)
{
	if (strcmp(r->word,x)==0){
		r->count++;
	}
	else if (strcmp(r->word,x)>0) {
		if (r->left==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			strcpy(p->word,x);
			p->count=1;
			p->left=NULL;
			p->right=NULL;
			r->left=p;
		}
		else insert(r->left,x,t);
	}
	else {
		if (r->right==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			p->count=1;
			strcpy(p->word,x);
			p->left=NULL;
			p->right=NULL;
			r->right=p;
		}
		else insert(r->right,x,t); 
	}
}
 
void search(tNodeptr r,int n,int t)
{
	if (r!=NULL){
		search(r->left,n,t);
		if (wl[t]==NULL) {
			wl[t]=(lNodeptr)malloc(sizeof(lNode));
			strcpy(wl[t]->word,r->word);
			wl[t]->count=r->count;
			wl[t]->next=NULL;
		}
		else {
			int i=0;
			lNodeptr p=wl[t],pr;
			lNodeptr tmp=(lNodeptr)malloc(sizeof(lNode));
			strcpy(tmp->word,r->word);
			tmp->count=r->count;
			while (p->next!=NULL&&p->count>=r->count&&i<n)
			{
				pr=p;
				p=p->next;
				i++;
			}
			if (p->next==NULL) {
				if (p->count>=r->count) 
				{
					p->next=tmp;
					tmp->next=NULL;
				}
				else if (p==wl[t]){
					tmp->next=p;
					wl[t]=tmp;
				}
				else {
					pr->next=tmp;
					tmp->next=p;
				}
			}
			else if (p==wl[t]){
				tmp->next=p;
				wl[t]=tmp;
			}
			else {
				pr->next=tmp;
				tmp->next=p;
			}
		}
		search(r->right,n,t);
	}
}
 
double min1(double x,double y)
{
	if (x<y) return x;
	else return y;
}
 
double max1(double x,double y)
{
	if (x>y) return x;
	else return y;
}
 
Trieptr talloc()
{
	int i;
	Trieptr p;
	p=(Trieptr)malloc(sizeof(Trie));
	p->isword=0;
	p->isleaf=1;
	for (i=0;i<26;i++)
		p->ptr[i]=NULL;
	return p;
}
 
void wordTree(Trieptr root,char *w)
{
	Trieptr p;
	for (p=root;*w!='\0';w++){
		if (p->ptr[*w-'a']==NULL){
			p->ptr[*w-'a']=talloc();
			p->isleaf=0;
		}
		p=p->ptr[*w-'a'];
	}
	p->isword=1;
}
 
int main()
{
	FILE *fp1,*fp2,*fstop,*dic,*res;
	fp1=fopen("article1.txt","r");
	fp2=fopen("article2.txt","r");
	fstop=fopen("stopwords.txt","r");
	dic=fopen("dictionary.txt","r");
	res=fopen("results.txt","w");
	char word[MAXWORD];
	int nn;
	scanf("%d",&nn);
	root=talloc();
	roots=talloc();
	while (getWord(dic,word)!=EOF)
	{
		wordTree(root,word);
	}
	while (getWord(fstop,word)!=EOF)
	{
		wordTree(roots,word);
	}
	tNodeptr wordlist1,wordlist2;
	if(fp1!=NULL)
	{
		while(getWord(fp1,word)!=EOF){
			if(isword(root,word)==1&&isword(roots,word)==-1)
			{
				if (n[1]==0)
				{
					wordlist1=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist1->word,word);
					wordlist1->count=1;
					wordlist1->left=NULL;
					wordlist1->right=NULL;
					n[1]=1;
				}
				else insert(wordlist1,word,1);
			}
		}
	}
	if(fp2!=NULL)
	{
		while(getWord(fp2,word)!=EOF){
			if(isword(root,word)==1&&isword(roots,word)==-1)
			{
				if (n[2]==0)
				{
					wordlist2=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist2->word,word);
					wordlist2->count=1;
					wordlist2->left=NULL;
					wordlist2->right=NULL;
					n[2]=1;
				}
				else insert(wordlist2,word,2);
			}
		}
	}
	int n1=min1(nn,n[1]);
	int n2=min1(nn,n[2]);
	search(wordlist1,n1,1);
	now=0;
	search(wordlist2,n2,2);
	int freq1=0,freq2=0,freqm1=0,freqm2=0;
	double pro1,pro2;
	int i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++)
	{
		freq1=freq1+r->count;
		int j=0;
		for (lNodeptr rr=wl[2];j<n2;rr=rr->next,j++)
			if (strcmp(r->word,rr->word)==0){
				freqm1=freqm1+r->count;
				freqm2=freqm2+rr->count;
			}
	}
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) freq2=freq2+r->count;
	pro1=(double)freqm1/(double)freq1;
	pro2=(double)freqm2/(double)freq2;
	double sim=min1(pro1,pro2)/max1(pro1,pro2);
	printf("%.5lf",sim);
	fprintf(res,"%3.5lf\n\n",sim);
	i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	fprintf(res,"\n");
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	getchar();
	getchar();
	return 0;
}

理应是最快的，但也许是我优化的不够好吧，实际效果比不上hash。

Hash

比较容易想到的一个方法，做起来也比较简单。
用一定的算法将单词转为数字，存在hash表里，达到理论上O(1)的速度，不过实际上会有重复，需要一个链表来存储重复单词。

代码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include<ctype.h>
#define MAXWORD 32
#define NHASH  1000000
#define MULT  37
int n[3]={0},now=0;
 
struct lnode{
	char word[MAXWORD];
	int count;
	struct lnode* next;
};
typedef struct lnode lNode;
typedef struct lnode* lNodeptr;
 
lNodeptr hash[1000010],hashstop[1000010];
 
lNodeptr wl[3],r[3];
 
struct tnode{
	char word[MAXWORD];
	int count;
	struct tnode* left;
	struct tnode* right;
};
typedef struct tnode tNode;
typedef struct tnode* tNodeptr;
 
int getWord(FILE *fp,char *w)
{
	int c;
	while(!isalpha(c=fgetc(fp)))
		if(c==EOF) return c;
		else continue;
	do{
		*w++=tolower(c);
	}while(isalpha(c=fgetc(fp)));
	*w='\0';
	return 1;
}
 
int isword(lNodeptr *list,char *w)
{
	char *p;
	unsigned int h=0;
	for (p=w;*p!='\0';p++)
		h= MULT*h+*p;
	h=h%NHASH;
	if (list[h]==NULL) return -1;
	else {
		lNodeptr p=list[h];
		while (p!=NULL&&strcmp(p->word,w)<0)
		{
			p=p->next;
		}
		if (p==NULL) return -1;
		else if (strcmp(p->word,w)>0) return -1;
		else if (strcmp(p->word,w)==0) return 1;
	}
	return -1;
}
 
void insert(tNodeptr r,char x[1000],int t)
{
	if (strcmp(r->word,x)==0){
		r->count++;
	}
	else if (strcmp(r->word,x)>0) {
		if (r->left==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			strcpy(p->word,x);
			p->count=1;
			p->left=NULL;
			p->right=NULL;
			r->left=p;
		}
		else insert(r->left,x,t);
	}
	else {
		if (r->right==NULL) {
			n[t]++;
			tNodeptr p=(tNodeptr)malloc(sizeof(tNode));
			p->count=1;
			strcpy(p->word,x);
			p->left=NULL;
			p->right=NULL;
			r->right=p;
		}
		else insert(r->right,x,t); 
	}
}
 
void search(tNodeptr r,int n,int t)
{
	if (r!=NULL){
		search(r->left,n,t);
		if (wl[t]==NULL) {
			wl[t]=(lNodeptr)malloc(sizeof(lNode));
			strcpy(wl[t]->word,r->word);
			wl[t]->count=r->count;
			wl[t]->next=NULL;
		}
		else {
			int i=0;
			lNodeptr p=wl[t],pr;
			lNodeptr tmp=(lNodeptr)malloc(sizeof(lNode));
			strcpy(tmp->word,r->word);
			tmp->count=r->count;
			while (p->next!=NULL&&p->count>=r->count&&i<n)
			{
				pr=p;
				p=p->next;
				i++;
			}
			if (p->next==NULL) {
				if (p->count>=r->count) 
				{
					p->next=tmp;
					tmp->next=NULL;
				}
				else if (p==wl[t]){
					tmp->next=p;
					wl[t]=tmp;
				}
				else {
					pr->next=tmp;
					tmp->next=p;
				}
			}
			else if (p==wl[t]){
				tmp->next=p;
				wl[t]=tmp;
			}
			else {
				pr->next=tmp;
				tmp->next=p;
			}
			//printf("%d\n",now);
		}
		search(r->right,n,t);
	}
}
 
double min1(double x,double y)
{
	if (x<y) return x;
	else return y;
}
 
double max1(double x,double y)
{
	if (x>y) return x;
	else return y;
}
 
int main()
{
	FILE *fp1,*fp2,*fstop,*dic,*res;
	fp1=fopen("article1.txt","r");
	fp2=fopen("article2.txt","r");
	fstop=fopen("stopwords.txt","r");
	dic=fopen("dictionary.txt","r");
	res=fopen("results.txt","w");
	char word[MAXWORD];
	int nn;
	scanf("%d",&nn);
	while (getWord(dic,word)!=EOF)
	{
		char *p;
		unsigned int h=0;
		for (p=word;*p!='\0';p++)
			h= MULT*h+*p;
		h=h%NHASH;
		if (hash[h]==NULL)
		{
			hash[h]=(lNodeptr)malloc(sizeof(lNode));
			strcpy(hash[h]->word,word);
			hash[h]->next=NULL;
		}
		else {
			lNodeptr p=hash[h];
			lNodeptr q=(lNodeptr)malloc(sizeof(lNode));;
			strcpy(q->word,word);
			if (strcmp(p->word,word)>0) 
			{
				q->next=hash[h];
				hash[h]=q;
			}
			else {
				lNodeptr r=p;
				while (p!=NULL&&strcmp(p->word,word)<0)
				{
					r=p;
					p=p->next;
				}
				if (p==NULL) {
					r->next=q;
					q->next=NULL;
				}
				else {
					q->next=r->next;
					r->next=q;
				}
			}
		}
	}
	while (getWord(fstop,word)!=EOF)
	{
		char *p;
		unsigned int h=0;
		for (p=word;*p!='\0';p++)
			h= MULT*h+*p;
		h=h%NHASH;
		if (hashstop[h]==NULL)
		{
			hashstop[h]=(lNodeptr)malloc(sizeof(lNode));
			strcpy(hashstop[h]->word,word);
			hashstop[h]->next=NULL;
		}
		else {
			lNodeptr p=hashstop[h];
			lNodeptr q=(lNodeptr)malloc(sizeof(lNode));;
			strcpy(q->word,word);
			if (strcmp(p->word,word)>0) 
			{
				q->next=hashstop[h];
				hashstop[h]=q;
			}
			else {
				lNodeptr r=p;
				while (p!=NULL&&strcmp(p->word,word)<0)
				{
					r=p;
					p=p->next;
				}
				if (p==NULL) {
					r->next=q;
					q->next=NULL;
				}
				else {
					q->next=r->next;
					r->next=q;
				}
			}
		}
	}
	tNodeptr wordlist1,wordlist2;
	if(fp1!=NULL)
	{
		while(getWord(fp1,word)!=EOF){
			if(isword(hash,word)==1&&isword(hashstop,word)==-1)
			{
				if (n[1]==0)
				{
					wordlist1=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist1->word,word);
					wordlist1->count=1;
					wordlist1->left=NULL;
					wordlist1->right=NULL;
					n[1]=1;
				}
				else insert(wordlist1,word,1);
			}
		}
	}
	if(fp2!=NULL)
	{
		while(getWord(fp2,word)!=EOF){
			if(isword(hash,word)==1&&isword(hashstop,word)==-1)
			{
				if (n[2]==0)
				{
					wordlist2=(tNodeptr)malloc(sizeof(tNode));
					strcpy(wordlist2->word,word);
					wordlist2->count=1;
					wordlist2->left=NULL;
					wordlist2->right=NULL;
					n[2]=1;
				}
				else insert(wordlist2,word,2);
			}
		}
	}
	int n1=min1(nn,n[1]);
	int n2=min1(nn,n[2]);
	search(wordlist1,n1,1);
	now=0;
	search(wordlist2,n2,2);
	int freq1=0,freq2=0,freqm1=0,freqm2=0;
	double pro1,pro2;
	int i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++)
	{
		freq1=freq1+r->count;
		int j=0;
		for (lNodeptr rr=wl[2];j<n2;rr=rr->next,j++)
			if (strcmp(r->word,rr->word)==0){
				freqm1=freqm1+r->count;
				freqm2=freqm2+rr->count;
			}
	}
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) freq2=freq2+r->count;
	pro1=(double)freqm1/(double)freq1;
	pro2=(double)freqm2/(double)freq2;
	double sim=min1(pro1,pro2)/max1(pro1,pro2);
	printf("%.5lf",sim);
	fprintf(res,"%3.5lf\n\n",sim);
	i=0;
	for (lNodeptr r=wl[1];i<n1;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	fprintf(res,"\n");
	i=0;
	for (lNodeptr r=wl[2];i<n2;r=r->next,i++) fprintf(res,"%s %d\n",r->word,r->count);
	getchar();
	getchar();
	return 0;
}

查找

查找的方法依据存储方法而定。特别的是这次有两个字典：dictionary与stopwords，需要去除dictionary中的stopwords。
我个人的处理办法是将两个单词表都记录下来，对于文本中的一个单词在两个表中都执行搜索操作，若在dictionary中而不在stopwords中则判断为合法单词。
这个应该不是时间上最快的处理方法，但在编写上是相当简单的。
对于文本的存储方法我采用的是简单的二叉查找树，因为文本量比起字典量来说小很多，所以不同方法理应影响不大（大概），时间原因更好地方法没有来得及编写。

CVBB ACM Team

用户工具

站点工具

目录

词频统计的一些做法

存储

二叉查找树

Trie树

Hash

查找

页面工具