Python is a powerful programming language, and it offers many libraries for text mining. Below are some of the commonly used Python text mining libraries:
# re -- regular expressions, used for processing raw text
import re

text = "Hello, my name is John. I live in New York."
pattern = r'\w+'
result = re.findall(pattern, text)
print(result)  # Output: ['Hello', 'my', 'name', 'is', 'John', 'I', 'live', 'in', 'New', 'York']

# nltk -- Natural Language Toolkit, used for analyzing natural language
import nltk

text = "This is an example sentence."
tokens = nltk.word_tokenize(text)  # may require the tokenizer data: nltk.download('punkt')
print(tokens)  # Output: ['This', 'is', 'an', 'example', 'sentence', '.']

# scikit-learn -- CountVectorizer extracts text features by turning documents into count vectors
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is the first document.',
          'This is the second document.',
          'And this is the third document.']
cv = CountVectorizer()
X = cv.fit_transform(corpus)
print(X.toarray())
# Output (columns follow the alphabetical vocabulary
# ['and', 'document', 'first', 'is', 'second', 'the', 'third', 'this']):
# [[0 1 1 1 0 1 0 1]
#  [0 1 0 1 1 1 0 1]
#  [1 1 0 1 0 1 1 1]]

# gensim -- used for training word/document vectors
import gensim

doc = ['This is the first document',
       'This is the second document',
       'This is the third document']
tokenized_doc = [d.lower().split() for d in doc]
# In gensim 4.x the parameter is vector_size ('size' in older 3.x releases)
model = gensim.models.Word2Vec(tokenized_doc, vector_size=300, window=5, min_count=1, workers=4)
# Word vectors are accessed through model.wv rather than by indexing the model directly
print(model.wv['document'])  # Output: array([...], dtype=float32)
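Beyond looking up a single word vector, gensim's Word2Vec model also supports similarity queries through its wv attribute. The snippet below is a minimal, self-contained sketch of that usage; the three-sentence corpus is a made-up toy example, so the similarity scores are essentially noise and only serve to illustrate the calls (most_similar and similarity are part of gensim's public API).

import gensim

# Toy corpus: in practice you would train on a much larger collection of text.
sentences = [
    ['this', 'is', 'the', 'first', 'document'],
    ['this', 'is', 'the', 'second', 'document'],
    ['this', 'is', 'the', 'third', 'document'],
]

model = gensim.models.Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=1)

# Words most similar to 'document' by cosine similarity of their vectors.
print(model.wv.most_similar('document', topn=3))

# Cosine similarity between two individual words.
print(model.wv.similarity('first', 'second'))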