Files
Pars_Lynx/main.py

42 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from bs4 import BeautifulSoup
import csv
import re
from tqdm import tqdm
# Root directory that is scanned for the saved HTML pages.
directory = r'D:\Работа\DATA\lynx'
# Recursively collect the full path of every .html file below the root directory.
html_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(directory)
    for name in names
    if name.endswith(".html")
]
# Parse every collected page and write the rows of its parts table into one CSV file.
with open('output.csv', 'w', newline='', encoding='windows-1251') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ['Производитель', 'Модель', 'Двигатель', 'Мощность кВт (л. с.)', 'Год выпуска', 'Тип детали', 'No.', 'Link',
         'Description', 'Information'])
    for path in tqdm(html_files, desc="Processing", unit="iteration"):
        try:
            # The context manager closes the page even if reading or decoding fails.
            with open(path, 'r', encoding='utf-8') as html_file:
                text = html_file.read()
            soup_path = BeautifulSoup(text, 'html.parser')

            # Values shared by every row of the page, taken from the selector widgets.
            vendor = soup_path.find('span', id='select2-vendor-container').text.strip()
            car = soup_path.find('span', id='select2-car-container').text.strip()
            # The modification label must consist of exactly three ' | '-separated parts;
            # anything else raises ValueError and the page is skipped.
            modification_label = soup_path.find('span', id='select2-modification-container').text.strip()
            modification, power_engine, year = modification_label.split(' | ')
            category_block = soup_path.find('div', class_='header-info-block-category')
            category_type = category_block.find('span').text.strip()
            page_columns = [vendor, car, modification, power_engine, year, category_type]

            table = soup_path.find('table', class_='list_showtable')
            # The first <tr> is skipped (treated as the table header).
            for row in table.find_all('tr')[1:]:
                row_data = []
                for count, cell in enumerate(row.find_all(['td', 'th'])):
                    if count == 3:
                        # Fourth cell: drop tabs and the character below, then join the
                        # remaining lines with ' | ' so the value stays on one CSV line.
                        # NOTE(review): the removed character is reproduced as it appears
                        # in the source; the hosting page warns about ambiguous Unicode
                        # characters, so confirm whether it is a plain space or e.g. a
                        # non-breaking space.
                        cleaned = cell.get_text().strip().replace('\t', '').replace(' ','')
                        row_data.append(re.sub(r'\n+', ' | ', cleaned))
                    else:
                        row_data.append(cell.get_text(strip=True))
                # The second column is replaced by the row's data-link attribute.
                row_data[1] = row['data-link']
                writer.writerow(page_columns + row_data)
        except Exception as e:
            # Best effort: a page that cannot be read or parsed (missing element,
            # unexpected label format, encoding error, ...) must not abort the whole
            # export, but the failure is reported instead of being silently dropped.
            # Rows already written for this page stay in the CSV.
            # tqdm.write prints the message without corrupting the progress bar.
            tqdm.write('Skipped {}: {!r}'.format(path, e))