Загрузить файлы в «/»
This commit is contained in:
41
main.py
Normal file
41
main.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
import csv
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
# Root directory that is scanned for saved HTML pages.
directory = r'D:\Работа\DATA\lynx'

# Walk the whole tree below `directory` and remember the full path of every
# file whose name ends in ".html".
html_files = []
for folder, _subfolders, filenames in os.walk(directory):
    for filename in filenames:
        if filename.endswith(".html"):
            html_files.append(os.path.join(folder, filename))
|
||||
|
||||
# Export the parts table of every page listed in `html_files` into
# 'output.csv' (written as windows-1251). Each data row consists of six
# page-level fields (vendor, model, engine, power, year, part category)
# followed by the cells of one row of the page's 'list_showtable' table.
with open('output.csv', 'w', newline='', encoding='windows-1251') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ['Производитель', 'Модель', 'Двигатель', 'Мощность кВт (л. с.)', 'Год выпуска', 'Тип детали', 'No.', 'Link',
         'Description', 'Information'])
    for path in tqdm(html_files, desc="Processing", unit="iteration"):
        try:
            # The context manager closes the page even when reading or
            # decoding it raises.
            with open(path, 'r', encoding='utf-8') as page:
                text = page.read()
            soup_path = BeautifulSoup(text, 'html.parser')

            # Page-level fields taken from the select2 widgets and the header.
            vendor = soup_path.find('span', id='select2-vendor-container').text.strip()
            car = soup_path.find('span', id='select2-car-container').text.strip()
            # The modification label must split into exactly three parts on
            # ' | '; any other count raises ValueError and the page is skipped.
            modification_label = soup_path.find('span', id='select2-modification-container').text.strip()
            modification, power_engine, year = modification_label.split(' | ')
            category_type = soup_path.find('div', class_='header-info-block-category').find('span').text.strip()

            table = soup_path.find('table', class_='list_showtable')
            # [1:] skips the first <tr> of the table.
            for row in table.find_all('tr')[1:]:
                row_data = []
                for count, cell in enumerate(row.find_all(['td', 'th'])):
                    if count == 3:
                        # Fourth cell: drop tabs and spaces, then turn each
                        # run of newlines into a ' | ' separator.
                        # NOTE(review): replace(' ', '') removes every space
                        # in the cell text - confirm that is intended.
                        raw = cell.get_text().strip().replace('\t', '').replace(' ', '')
                        row_data.append(re.sub(r'\n+', ' | ', raw))
                    else:
                        row_data.append(cell.get_text(strip=True))
                # The second column is replaced by the row's 'data-link'
                # attribute instead of the cell text.
                row_data[1] = row['data-link']
                row_data = [vendor, car, modification, power_engine, year, category_type] + row_data
                writer.writerow(row_data)
        except Exception as e:
            # Best-effort export: a page that cannot be read or parsed must
            # not abort the run, but the failure is reported instead of being
            # silently discarded. Rows already written for this page before
            # the failure stay in the CSV. tqdm.write prints without
            # corrupting the progress bar.
            tqdm.write("Skipped {}: {!r}".format(path, e))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user