42 lines
2.1 KiB
Python
42 lines
2.1 KiB
Python
import csv
import os
import re
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm

# Export script: walks `directory`, parses every saved .html page and appends
# one CSV row per parts-table row to output.csv. Each row is prefixed with the
# vendor / model / modification / category values read from the page header.

# Root directory that is searched recursively for saved HTML pages.
directory = r'D:\Работа\DATA\lynx'

# Full paths of every file under `directory` whose name ends with ".html".
html_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(directory)
    for name in names
    if name.endswith(".html")
]

# Compiled once instead of on every table row: matches a run of newlines.
newline_runs = re.compile(r'\n+')

with open('output.csv', 'w', newline='', encoding='windows-1251') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ['Производитель', 'Модель', 'Двигатель', 'Мощность кВт (л. с.)', 'Год выпуска', 'Тип детали', 'No.', 'Link',
         'Description', 'Information'])

    for path in tqdm(html_files, desc="Processing", unit="iteration"):
        try:
            # Context manager closes the handle even if decoding fails.
            with open(path, 'r', encoding='utf-8') as page:
                text = page.read()
            soup_path = BeautifulSoup(text, 'html.parser')

            # Page-level values shared by every row of this page's table.
            vendor = soup_path.find('span', id='select2-vendor-container').text.strip()
            car = soup_path.find('span', id='select2-car-container').text.strip()
            # The modification label must split into exactly three ' | '
            # separated parts; anything else raises and the page is reported.
            modification, power_engine, year = soup_path.find(
                'span', id='select2-modification-container').text.strip().split(' | ')
            category_type = soup_path.find(
                'div', class_='header-info-block-category').find('span').text.strip()

            table = soup_path.find('table', class_='list_showtable')
            # The first <tr> is skipped (presumably the table header row).
            for row in table.find_all('tr')[1:]:
                row_data = []
                for count, cell in enumerate(row.find_all(['td', 'th'])):
                    if count == 3:
                        # Fourth cell (lines up with the 'Information' header
                        # column): drop tabs and spaces, then join the
                        # remaining lines with ' | '.
                        raw = cell.get_text().strip().replace('\t', '').replace(' ', '')
                        row_data.append(newline_runs.sub(' | ', raw))
                    else:
                        row_data.append(cell.get_text(strip=True))
                # The second column is replaced by the row's data-link attribute.
                row_data[1] = row['data-link']
                row_data = [vendor, car, modification, power_engine, year, category_type] + row_data
                writer.writerow(row_data)
        except Exception as e:
            # Best-effort export: one malformed page must not abort the run,
            # but it is reported so missing data can be traced. Rows written
            # before the failure stay in the CSV. stderr is used so the
            # message never fails on characters the console cannot encode.
            tqdm.write(
                'Failed to process {}: {}: {}'.format(path, type(e).__name__, e),
                file=sys.stderr)
|
||
|
||
|