# NLP Data Preprocessing: Tokenization, Vectorization, and Feature Engineering

## 1. Technical Analysis

### 1.1 The NLP Preprocessing Pipeline

Data preprocessing is a critical stage of any NLP pipeline:

Raw text → Cleaning → Tokenization → Vectorization → Feature engineering

### 1.2 Preprocessing Steps Compared

| Step | Purpose | Typical methods |
| --- | --- | --- |
| Text cleaning | Remove noise | Regular expressions |
| Tokenization | Split text into units | Rule-based / statistical / deep learning |
| Stopword filtering | Drop uninformative words | Stopword lists |
| Stemming / lemmatization | Normalize word forms | NLTK / SpaCy |
| Vectorization | Convert text to numbers | TF-IDF / Word2Vec / BERT |

### 1.3 Levels of Text Representation

- Character level: character sequences
- Word level: bag-of-words
- Sentence level: sentence vectors
- Document level: document vectors

## 2. Core Implementation

### 2.1 Text Cleaning

```python
import re
import string


class TextCleaner:
    def __init__(self):
        self.patterns = {
            "url": r"https?://\S+|www\.\S+",
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "html": r"<.*?>",
            "special_chars": r"[^a-zA-Z0-9\s]",
            "extra_spaces": r"\s+",
        }

    def clean(self, text):
        text = text.lower()
        text = re.sub(self.patterns["url"], "", text)
        text = re.sub(self.patterns["email"], "", text)
        text = re.sub(self.patterns["html"], "", text)
        text = re.sub(self.patterns["special_chars"], "", text)
        text = re.sub(self.patterns["extra_spaces"], " ", text)
        return text.strip()


class ChineseTextCleaner:
    def __init__(self):
        self.patterns = {
            "url": r"https?://\S+|www\.\S+",
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "html": r"<.*?>",
            # Common full-width Chinese punctuation.
            "punctuation": r"[，。！？、；：【】《》]",
            "extra_spaces": r"\s+",
        }

    def clean(self, text):
        text = re.sub(self.patterns["url"], "", text)
        text = re.sub(self.patterns["email"], "", text)
        text = re.sub(self.patterns["html"], "", text)
        text = re.sub(self.patterns["punctuation"], "", text)
        text = re.sub(self.patterns["extra_spaces"], " ", text)
        return text.strip()
```

### 2.2 Tokenization

```python
import jieba


class Tokenizer:
    def __init__(self, language="english"):
        self.language = language
        if language == "chinese":
            self.tokenizer = jieba
        elif language == "english":
            from nltk.tokenize import word_tokenize
            self.tokenizer = word_tokenize

    def tokenize(self, text):
        if self.language == "chinese":
            return self.tokenizer.lcut(text)
        return self.tokenizer(text)

    def tokenize_batch(self, texts):
        return [self.tokenize(text) for text in texts]


class StopwordFilter:
    def __init__(self, language="english"):
        if language == "english":
            from nltk.corpus import stopwords
            self.stopwords = set(stopwords.words("english"))
        elif language == "chinese":
            self.stopwords = self._load_chinese_stopwords()

    def _load_chinese_stopwords(self):
        # A small built-in list of common Chinese stopwords;
        # in practice, load a full stopword file instead.
        common_stopwords = [
            "的", "是", "在", "和", "有", "我", "他", "她", "它", "这",
            "那", "个", "都", "就", "也", "很", "到", "说", "要", "去",
            "会", "着", "没有", "看", "好", "自己", "又",
        ]
        return set(common_stopwords)

    def filter(self, tokens):
        return [token for token in tokens if token not in self.stopwords]

    def filter_batch(self, tokenized_texts):
        return [self.filter(tokens) for tokens in tokenized_texts]
```

### 2.3 Vectorization

```python
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer


class TFIDFVectorizer:
    def __init__(self, max_features=5000):
        self.vectorizer = TfidfVectorizer(max_features=max_features)

    @staticmethod
    def _to_strings(texts):
        # Accept raw strings or pre-tokenized token lists
        # (the pipeline in section 4 passes token lists).
        return [" ".join(t) if isinstance(t, list) else t for t in texts]

    def fit(self, texts):
        self.vectorizer.fit(self._to_strings(texts))

    def transform(self, texts):
        matrix = self.vectorizer.transform(self._to_strings(texts))
        return torch.tensor(matrix.toarray(), dtype=torch.float32)

    def fit_transform(self, texts):
        matrix = self.vectorizer.fit_transform(self._to_strings(texts))
        return torch.tensor(matrix.toarray(), dtype=torch.float32)


class WordEmbeddingVectorizer:
    def __init__(self, embedding_dim=100):
        self.embedding_dim = embedding_dim
        self.word_to_idx = {}
        self.embedding = None

    def fit(self, tokenized_texts):
        vocab = set()
        for tokens in tokenized_texts:
            vocab.update(tokens)
        # Index 0 is reserved for unknown/padding tokens.
        self.word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
        self.word_to_idx["<UNK>"] = 0
        self.embedding = nn.Embedding(len(self.word_to_idx), self.embedding_dim)

    def transform(self, tokenized_texts, max_len=50):
        sequences = []
        for tokens in tokenized_texts:
            sequence = [self.word_to_idx.get(token, 0) for token in tokens[:max_len]]
            sequence += [0] * (max_len - len(sequence))  # pad to max_len
            sequences.append(sequence)
        return torch.tensor(sequences, dtype=torch.long)
```
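Before moving on to contextual encoders, a quick usage sketch of the two vectorizers above may help. The toy corpus is invented for illustration, and the English path assumes NLTK's Punkt tokenizer data has been downloaded:

```python
# Toy corpus, purely for illustration.
# Requires: nltk.download("punkt") for word_tokenize.
corpus = [
    "the cat sat on the mat",
    "dogs and cats are popular pets",
]

tokenizer = Tokenizer(language="english")
tokenized = tokenizer.tokenize_batch(corpus)

tfidf = TFIDFVectorizer(max_features=100)
tfidf_features = tfidf.fit_transform(corpus)   # shape: (2, vocabulary size)

embedder = WordEmbeddingVectorizer(embedding_dim=100)
embedder.fit(tokenized)
id_sequences = embedder.transform(tokenized)   # shape: (2, 50), token ids
embedded = embedder.embedding(id_sequences)    # shape: (2, 50, 100)
```

Note that `WordEmbeddingVectorizer.transform` returns token-id sequences rather than final vectors; they are meant to be passed through its `nn.Embedding` layer (randomly initialized here) inside a downstream model.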
Finally, contextual sentence embeddings can be produced with a pretrained BERT encoder:

```python
class BERTVectorizer:
    def __init__(self, model_name="bert-base-uncased"):
        from transformers import BertModel, BertTokenizer
        self.model = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def encode(self, texts):
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the [CLS] token embedding as the sentence representation.
        return outputs.last_hidden_state[:, 0, :]
```

### 2.4 Feature Engineering

```python
class TextFeatureExtractor:
    def __init__(self):
        self.features = []

    def add_length_feature(self, texts):
        lengths = [len(text) for text in texts]
        self.features.append(torch.tensor(lengths, dtype=torch.float32).unsqueeze(1))

    def add_word_count_feature(self, tokenized_texts):
        word_counts = [len(tokens) for tokens in tokenized_texts]
        self.features.append(torch.tensor(word_counts, dtype=torch.float32).unsqueeze(1))

    def add_punctuation_feature(self, texts):
        punctuation_ratios = []
        for text in texts:
            punctuation_count = sum(1 for char in text if char in string.punctuation)
            ratio = punctuation_count / len(text) if len(text) > 0 else 0.0
            punctuation_ratios.append(ratio)
        self.features.append(torch.tensor(punctuation_ratios, dtype=torch.float32).unsqueeze(1))

    def get_features(self):
        if not self.features:
            return None
        return torch.cat(self.features, dim=1)


class FeaturePipeline:
    def __init__(self, steps):
        self.steps = steps

    def _apply(self, step, data, fit):
        # The stages defined above expose different method names,
        # so dispatch by interface.
        if hasattr(step, "clean"):
            return [step.clean(text) for text in data]
        if hasattr(step, "tokenize_batch"):
            return step.tokenize_batch(data)
        if hasattr(step, "filter_batch"):
            return step.filter_batch(data)
        return step.fit_transform(data) if fit else step.transform(data)

    def fit_transform(self, texts):
        data = texts
        for step in self.steps:
            data = self._apply(step, data, fit=True)
        return data

    def transform(self, texts):
        data = texts
        for step in self.steps:
            data = self._apply(step, data, fit=False)
        return data
```

## 3. Performance Comparison

### 3.1 Vectorization Methods Compared

| Method | Dimensionality | Information content | Compute cost | Best suited for |
| --- | --- | --- | --- | --- |
| TF-IDF | vocabulary size | medium | low | traditional models |
| Word2Vec | fixed dimension | high | medium | deep learning |
| BERT | 768/1024 | very high | high | pretrained-model setups |

### 3.2 Tokenizers Compared

| Tokenizer | Language | Accuracy | Speed |
| --- | --- | --- | --- |
| jieba | Chinese | 95% | fast |
| THULAC | Chinese | 97% | medium |
| HanLP | Chinese | 98% | slow |
| NLTK | English | 95% | fast |
| SpaCy | English | 98% | medium |

### 3.3 Impact of Preprocessing Steps

| Step | Typical gain | Compute overhead |
| --- | --- | --- |
| Text cleaning | +2% | low |
| Stopword filtering | +1% | low |
| Stemming | +1% | medium |
| Vectorization choice | +5-10% | high |

## 4. Best Practices

### 4.1 Building a Preprocessing Pipeline

```python
def build_preprocessing_pipeline(language="english"):
    steps = [
        TextCleaner() if language == "english" else ChineseTextCleaner(),
        Tokenizer(language=language),
        StopwordFilter(language=language),
        TFIDFVectorizer(),
    ]
    return FeaturePipeline(steps)


class PreprocessingFactory:
    @staticmethod
    def create(config):
        if config["type"] == "tfidf":
            return TFIDFVectorizer(**config["params"])
        elif config["type"] == "word2vec":
            return WordEmbeddingVectorizer(**config["params"])
        elif config["type"] == "bert":
            return BERTVectorizer(**config["params"])
```

### 4.2 The Preprocessing Flow

```python
class NLPPreprocessor:
    def __init__(self, tokenizer, vectorizer, cleaner=None):
        self.tokenizer = tokenizer
        self.vectorizer = vectorizer
        self.cleaner = cleaner

    def process(self, texts):
        """Fit the vectorizer on this batch and return its features."""
        if self.cleaner:
            texts = [self.cleaner.clean(text) for text in texts]
        tokenized = self.tokenizer.tokenize_batch(texts)
        if hasattr(self.vectorizer, "encode"):
            # Contextual encoders (e.g. BERT) handle raw strings themselves.
            return self.vectorizer.encode(texts)
        if hasattr(self.vectorizer, "fit_transform"):
            return self.vectorizer.fit_transform(tokenized)
        # WordEmbeddingVectorizer exposes separate fit/transform steps.
        self.vectorizer.fit(tokenized)
        return self.vectorizer.transform(tokenized)

    def transform(self, texts):
        """Vectorize new texts with the already-fitted vectorizer."""
        if self.cleaner:
            texts = [self.cleaner.clean(text) for text in texts]
        tokenized = self.tokenizer.tokenize_batch(texts)
        if hasattr(self.vectorizer, "encode"):
            return self.vectorizer.encode(texts)
        return self.vectorizer.transform(tokenized)
```

## 5. Summary

NLP data preprocessing is a key driver of model quality:

- Text cleaning removes noise and improves data quality.
- Tokenization splits text into its basic units.
- Vectorization turns text into numerical representations.
- Feature engineering extracts complementary signals.

From the comparisons above:

- Good preprocessing can improve model performance by roughly 5-10%.
- BERT embeddings generally outperform traditional vectorization methods.
- Chinese text requires dedicated tokenizers.
- The preprocessing pipeline should be tailored to the task.
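To make the full flow concrete, here is a minimal end-to-end sketch wiring together the classes defined above; the sample texts are invented, and TF-IDF is chosen only as the simplest vectorizer:

```python
texts = [
    "Check out https://example.com for the dataset!!!",
    "Contact someone@example.com about the NLP meetup.",
]

preprocessor = NLPPreprocessor(
    tokenizer=Tokenizer(language="english"),
    vectorizer=TFIDFVectorizer(max_features=500),
    cleaner=TextCleaner(),
)

# process() cleans, tokenizes, and fits the vectorizer on this batch ...
train_features = preprocessor.process(texts)
print(train_features.shape)  # (2, vocabulary size)

# ... while transform() reuses the fitted vectorizer on unseen text.
new_features = preprocessor.transform(["an unseen document about nlp"])
```

In a real project, the fitted preprocessor would be serialized alongside the model so that training and inference share exactly the same pipeline.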