"""Scrape product cards into ``oen.csv`` and regroup its cross-reference columns.

This module holds the code of the two scripts added by the patch:

* ``pars_oem.py``  -> :func:`scrape_catalog` (download pages, write ``oen.csv``)
* ``ДОПАРСИНГ.py`` -> :func:`chort_text` / :func:`regroup_csv`
  (read ``oen.csv``, write ``oem2.csv``)

Running the module executes both steps in that order.
"""
import csv
import re
import time

# NOTE: the third-party packages (requests, bs4, tqdm) are imported inside the
# functions that use them so that the pure text helpers, chort_text in
# particular, can be imported with nothing but the standard library installed.

# Placeholder written to a CSV cell when the page has no such element.
MISSING = 'None'
# Both CSV files are written (and therefore must be read) in this encoding.
CSV_ENCODING = 'windows-1251'
CSV_HEADER = ['url', 'ID', 'Название', 'Описание', 'Фото', 'OeN', 'Аналоги']
# Image URL that is treated the same as "no photo".
NO_IMAGE_URL = 'https://lynxauto.info/image/trumb/400x300/no_image.jpg'
# Seconds to wait for a response before giving up on a URL.
REQUEST_TIMEOUT = 30
# Pause after a failed request before moving on to the next URL.
ERROR_BACKOFF_SECONDS = 60

_NEWLINES = re.compile(r'\n+')


# --------------------------------------------------------------------------
# pars_oem.py
# --------------------------------------------------------------------------

def _text_or_missing(node):
    """Return the stripped text of *node*, or ``'None'`` if absent or empty."""
    if node is None:
        return MISSING
    return node.get_text(strip=True) or MISSING


def _props_text(soup):
    """Return the ``pcard-props`` rows as ``'title value | title value'``."""
    container = soup.find("div", id="pcard-props")
    if container is None:
        return MISSING
    pairs = []
    for row in container.find_all("tr"):
        title = row.find("td", class_="title")
        value = row.find("td", class_="value")
        # Rows without both cells cannot be formatted; skip them instead of
        # failing the whole page.
        if title is None or value is None:
            continue
        pairs.append(
            f'{title.get_text(strip=True)} {value.get_text(strip=True)}')
    return ' | '.join(pairs)


def _images_text(soup):
    """Return the image URLs in ``pcard-view-images`` joined with ``' | '``."""
    wrapper = soup.find("div", id="pcard-view-images")
    if wrapper is None:
        return MISSING
    # src=True ignores <img> tags that carry no src attribute.
    joined = ' | '.join(
        img['src'] for img in wrapper.find_all('img', src=True))
    return MISSING if joined == NO_IMAGE_URL else joined


def _table_text(soup, div_id):
    """Flatten the table inside ``<div id=div_id>`` into one CSV cell.

    Cells of a row are joined with ``' | '`` and rows with ``' <> '``; the
    first ``<tr>`` is skipped. Returns ``'None'`` when the div is missing.
    """
    container = soup.find('div', id=div_id)
    if container is None:
        return MISSING
    rows = []
    for row in container.find_all('tr')[1:]:
        cells = [
            # Tabs and spaces are removed, then runs of newlines inside a
            # cell become the ' | ' field separator.
            _NEWLINES.sub(
                ' | ',
                cell.get_text().strip().replace('\t', '').replace(' ', ''))
            for cell in row.find_all(['td', 'th'])
        ]
        rows.append(' | '.join(cells))
    return ' <> '.join(rows)


def _page_row(url, soup):
    """Build the seven-column CSV row (see ``CSV_HEADER``) for one page."""
    return [
        url,
        _text_or_missing(soup.find("div", class_="pcard-model")),
        _text_or_missing(soup.find("div", class_="pcard-name")),
        _props_text(soup),
        _images_text(soup),
        _table_text(soup, 'pcard-oeno'),
        _table_text(soup, 'pcard-analog'),
    ]


def scrape_catalog(input_path='lynx_pn_input.txt', output_path='oen.csv'):
    """Download every URL listed in *input_path* and write one CSV row each.

    Args:
        input_path: Text file with whitespace-separated URLs.
        output_path: CSV file to create (windows-1251, header row first).

    A URL whose request fails (connection error, timeout or HTTP error
    status) is reported, followed by a pause of ``ERROR_BACKOFF_SECONDS``,
    and then skipped. Any other per-URL error is reported and skipped
    without pausing, so one bad page never aborts the run.
    """
    import requests
    from bs4 import BeautifulSoup
    from tqdm import tqdm

    with open(input_path, 'r') as url_file:
        urls = url_file.read().split()

    # errors='replace' keeps a row whose text has characters that
    # windows-1251 cannot represent instead of losing the whole row.
    with open(output_path, 'w', newline='', encoding=CSV_ENCODING,
              errors='replace') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(CSV_HEADER)
        for url in tqdm(urls, desc="Processing", unit="iteration"):
            try:
                # Send a GET request for the page.
                response = requests.get(url, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                writer.writerow(_page_row(url, soup))
            except requests.RequestException as exc:
                print(f'{url}: {exc}')
                time.sleep(ERROR_BACKOFF_SECONDS)
            except Exception as exc:  # best effort: report and keep going
                print(f'{url}: {exc}')


# --------------------------------------------------------------------------
# ДОПАРСИНГ.py
# --------------------------------------------------------------------------

def chort_text(parts):
    """Group ``'manufacturer | number [| info]'`` entries by manufacturer.

    Args:
        parts: List of strings, each holding ``|``-separated fields:
            manufacturer, part number and optionally extra info.

    Returns:
        ``'None'`` when the first entry is the ``'None'`` placeholder;
        otherwise one ``'maker : n1 | n2 <> '`` chunk per manufacturer (with
        ``' : info'`` inserted before ``' <> '`` when the first entry of that
        manufacturer carried info), concatenated in first-seen order. Entries
        with fewer than two fields are ignored, so an empty list or an empty
        cell yields ``''``.
    """
    if not parts:
        return ''
    if parts[0].strip() == MISSING:
        return MISSING

    numbers_by_maker = {}
    info_by_maker = {}
    for part in parts:
        # Split the entry on the | separator.
        fields = [field.strip() for field in part.split('|')]
        if len(fields) < 2:
            # No part number (e.g. an empty cell) -> nothing to record.
            continue
        manufacturer, number = fields[0], fields[1]
        if manufacturer not in numbers_by_maker:
            numbers_by_maker[manufacturer] = []
            # Only the info of the first entry per manufacturer is kept.
            info_by_maker[manufacturer] = fields[2] if len(fields) > 2 else ''
        numbers_by_maker[manufacturer].append(number)

    # Render the grouped data; every chunk keeps its trailing ' <> '.
    chunks = []
    for manufacturer, numbers in numbers_by_maker.items():
        chunk = f'{manufacturer} : {" | ".join(numbers)}'
        info = info_by_maker[manufacturer]
        if info:
            chunk += f' : {info}'
        chunks.append(chunk + ' <> ')
    return ''.join(chunks)


def regroup_csv(input_path='oen.csv', output_path='oem2.csv'):
    """Rewrite columns 5 and 6 of *input_path* through :func:`chort_text`.

    Args:
        input_path: CSV file whose first row is a header.
        output_path: CSV file to create.

    The first row of the input is skipped and the output carries no header
    row. The input is read with the same encoding it is written in
    (windows-1251) rather than the platform default.
    """
    from tqdm import tqdm

    with open(input_path, newline='', encoding=CSV_ENCODING) as csvfile:
        rows = list(csv.reader(csvfile))

    with open(output_path, 'w', newline='', encoding=CSV_ENCODING) as csvfile:
        writer = csv.writer(csvfile)
        for row in tqdm(rows[1:], desc="Processing", unit="iteration"):
            row[5] = chort_text(row[5].split('<>'))
            row[6] = chort_text(row[6].split('<>'))
            writer.writerow(row)


if __name__ == "__main__":
    scrape_catalog()
    regroup_csv()