对txt文件的操作

junlan
3
2024-11-25

1、逐行读取txt文件

def read_txt_file_by_line(filepath):
    """Print a UTF-8 text file to stdout, one stripped line at a time.

    Args:
        filepath: Path of the text file to read.

    Returns:
        None. Each line is printed with leading/trailing whitespace
        (including the line ending) removed; blank lines print as empty lines.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        # Iterating the handle streams the file lazily instead of loading it whole.
        for stripped in map(str.strip, handle):
            print(stripped)

# Example call: prints every line of 'example.txt'; the relative path is
# resolved against the current working directory.
read_txt_file_by_line('example.txt')

2、统计单词出现次数

from collections import Counter

def count_word_frequency(txt_file):
    """Count how often each whitespace-separated token occurs in a UTF-8 text file.

    Tokens are produced by ``str.split()`` with no arguments, so they are
    case-sensitive and keep any attached punctuation.

    Args:
        txt_file: Path of the text file to read.

    Returns:
        A ``collections.Counter`` mapping each token to its number of occurrences.
    """
    tally = Counter()
    with open(txt_file, encoding='utf-8') as source:
        # The whole file is read at once; split() treats any run of whitespace
        # (spaces, tabs, newlines) as a single separator.
        tally.update(source.read().split())
    return tally

# Example call: count the tokens in a file given by a Windows-style absolute
# path. The raw-string prefix keeps the backslash from being read as an escape.
word_freq = count_word_frequency(r'D:\1.txt')
# most_common() with no argument yields every (token, count) pair, highest count first.
for word, freq in word_freq.most_common():
    print(f'{word}: {freq}')

2.1运行演示

（运行结果截图 image-20240906202240310 未随文本导出）

3、过滤空行和注释行

def filter_empty_and_comment_lines(filepath):
    """Return the lines of a UTF-8 text file that are neither blank nor comments.

    A line is dropped when it is empty after stripping whitespace, or when its
    first non-whitespace character is ``#``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of the kept lines in file order, each exactly as read
        (indentation and trailing newline preserved).
    """
    kept = []
    with open(filepath, encoding='utf-8') as source:
        for raw in source:
            text = raw.strip()
            # Skip whitespace-only lines and lines whose content starts with '#'.
            if not text or text.startswith('#'):
                continue
            kept.append(raw)
    return kept

# Example call: print the non-blank, non-comment lines of 'example.txt'.
filtered = filter_empty_and_comment_lines('example.txt')
for line in filtered:
    # The returned lines still carry their line endings, so strip before printing.
    print(line.strip())

4、提取邮件地址或URL

def extract_emails_and_urls(txt_file):
    """Extract e-mail addresses and HTTP(S) URLs from a UTF-8 text file.

    Args:
        txt_file: Path of the text file to scan.

    Returns:
        A tuple ``(emails, urls)`` of two lists of strings. Each list holds the
        non-overlapping regex matches in order of appearance; duplicates are kept.
    """
    # Imported here so the snippet is self-contained: nothing else in the file
    # imports ``re``, and without it the findall calls raise NameError.
    import re

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # NOTE: inside the third character class, ``$-_`` is a range from '$' to '_'.
    # That range is what lets '/', ':', '?', and '=' match inside a URL, but it
    # also admits characters such as ')' and '>', so punctuation directly
    # following a URL can be captured as part of it.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    with open(txt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    emails = re.findall(email_pattern, content)
    urls = re.findall(url_pattern, content)

    return emails, urls

# Example call: extract and print the e-mail addresses and URLs found in 'example.txt'.
emails, urls = extract_emails_and_urls('example.txt')
print("Emails found:", emails)
print("URLs found:", urls)