Загрузить файлы в «/»

This commit is contained in:
2024-08-01 03:33:25 +00:00
parent ab4882e39f
commit 7bfd735f1f
5 changed files with 35116 additions and 0 deletions

41
main.py Normal file
View File

@@ -0,0 +1,41 @@
import os
from bs4 import BeautifulSoup
import csv
import re
from tqdm import tqdm
# Root directory that is searched recursively for saved HTML pages.
directory = r'D:\Работа\DATA\lynx'

# Full path of every .html file found anywhere under the directory.
html_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(directory)
    for name in names
    if name.endswith(".html")
]

# Compiled once, outside the loops: runs of newlines inside the fourth cell
# of a table row are collapsed into a ' | ' separator.
_newline_runs = re.compile(r'\n+')

# Parse every page and write one CSV row per table row. Rows are written as
# they are produced, so a page that fails part-way keeps the rows that were
# already written; the failure itself is reported instead of being hidden.
with open('output.csv', 'w', newline='', encoding='windows-1251') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ['Производитель', 'Модель', 'Двигатель', 'Мощность кВт (л. с.)', 'Год выпуска', 'Тип детали', 'No.', 'Link',
         'Description', 'Information'])
    for path in tqdm(html_files, desc="Processing", unit="iteration"):
        try:
            # The context manager closes the page even if reading/decoding fails.
            with open(path, 'r', encoding='utf-8') as page:
                text = page.read()
            soup_path = BeautifulSoup(text, 'html.parser')

            # Page-level fields shared by every row of this page. A missing
            # element makes find() return None, which raises AttributeError
            # and is handled by the except clause below.
            vendor = soup_path.find('span', id='select2-vendor-container').text.strip()
            car = soup_path.find('span', id='select2-car-container').text.strip()
            # The modification label must split into exactly three parts on
            # ' | '; any other count raises ValueError on unpacking.
            modification_label = soup_path.find('span', id='select2-modification-container').text.strip()
            modification, power_engine, year = modification_label.split(' | ')
            category_type = soup_path.find('div', class_='header-info-block-category').find('span').text.strip()
            page_fields = [vendor, car, modification, power_engine, year, category_type]

            table = soup_path.find('table', class_='list_showtable')
            # The first <tr> is skipped (presumably the table header - confirm).
            for row in table.find_all('tr')[1:]:
                row_data = []
                for count, cell in enumerate(row.find_all(['td', 'th'])):
                    if count == 3:
                        # NOTE(review): the second replaced character is copied
                        # as it appears in the source; confirm whether a plain
                        # space or a non-breaking space was intended.
                        cleaned = cell.get_text().strip().replace('\t', '').replace(' ', '')
                        row_data.append(_newline_runs.sub(' | ', cleaned))
                    else:
                        row_data.append(cell.get_text(strip=True))
                # The second column is replaced by the row's data-link attribute.
                row_data[1] = row['data-link']
                writer.writerow(page_fields + row_data)
        except Exception as exc:
            # Best-effort processing: one bad page must not stop the run, but
            # the failure is reported so lost data does not go unnoticed.
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Failed while processing {path}: {type(exc).__name__}: {exc}")