From 0c18720ac6f0bacee408388568d1e77a85a94d3b Mon Sep 17 00:00:00 2001 From: wonipapa Date: Wed, 9 Nov 2016 18:36:30 +0900 Subject: [PATCH] =?UTF-8?q?KODI=EB=A5=BC=20=EC=9C=84=ED=95=B4=EC=84=9C=20C?= =?UTF-8?q?DATA=20=EC=82=AD=EC=A0=9C=20=EC=84=9C=EB=B8=8C=ED=83=80?= =?UTF-8?q?=EC=9D=B4=ED=8B=80=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- epg2xml.py | 286 ++++++++++++++++++++++++++--------------------------- 1 file changed, 141 insertions(+), 145 deletions(-) diff --git a/epg2xml.py b/epg2xml.py index ddb8c8c..e3b11fe 100644 --- a/epg2xml.py +++ b/epg2xml.py @@ -7,17 +7,16 @@ import httplib import urllib import json import datetime -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, SoupStrainer import codecs import socket import re from xml.sax.saxutils import escape, unescape import argparse - reload(sys) sys.setdefaultencoding('utf-8') -__version__ = '1.0.3' +__version__ = '1.0.4' # Set My Configuration default_icon_url = '' # TV channel icon url (ex : http://www.example.com/Channels) @@ -48,15 +47,14 @@ def getEpg(): # Print Channel information for ChannelInfo in ChannelInfos: ChannelId = ChannelInfo[0] - ChannelName = ChannelInfo[1] + ChannelName = escape(ChannelInfo[1]) ChannelSource = ChannelInfo[2] ChannelServiceId = ChannelInfo[3] - writeXML('\t' % (ChannelId)) - writeXML('\t\t' % (ChannelName)) + writeXML(' ' % (ChannelId)) + writeXML(' %s' % (ChannelName)) if IconUrl: - writeXML('\t\t' % (IconUrl, ChannelId)) - writeXML('\t') - + writeXML(' ' % (IconUrl, ChannelId)) + writeXML(' ') # Print Program Information for ChannelInfo in ChannelInfos: @@ -71,15 +69,13 @@ def getEpg(): elif ChannelSource == 'LG': GetEPGFromLG(ChannelInfo) elif ChannelSource == 'SK': - GetEPGFromSK(ChannelInfo) + GetEPGFromSK(ChannelInfo) elif ChannelSource == 'SKY': - GetEPGFromSKY(ChannelInfo) + GetEPGFromSKY(ChannelInfo) GetEPGFromEPG(SiteEPG) # Get EPG data from epg.co.kr def GetEPGFromEPG(ChannelInfos): - pattern = "Preview\('(.*?)','(.*?)','(.*?)','(.*?)','(.*?)','(.*?)','(.*?)'\)\">.*?<\/a>(.*?)<\/td>" - p = re.compile(pattern) ChannelInfo = [ChannelInfos[i:i+5] for i in range(0, len(ChannelInfos),5)] html = [] @@ -92,44 +88,37 @@ def GetEPGFromEPG(ChannelInfos): url = 'http://schedule.epg.co.kr/php/guide/schedule_day_on.php?%snext=&old_sub_channel_group=110&old_sub_channel_group=110&old_top_channel_group=2&search_sub_category=&search_sub_channel_group=110&search_top_category=&search_top_channel_group=2&selectday=%s&selectday2=%s&weekchannel=&ymd=%s' % (churl, day, day, day) u = urllib.urlopen(url).read() data = unicode(u, 'euc-kr', 'ignore').encode('utf-8', 'ignore') - soup = BeautifulSoup(data,'lxml', from_encoding='utf-8') + strainer = SoupStrainer('table', {"width" : "125"}) + soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8') html.append(soup.select('td > a[href^="JavaScript:ViewContent"]')) for row in html: - for i, cell in enumerate(row): + for cell in row: td = cell.parent - epgdata = p.findall(str(td)) - programName = unescape(epgdata[0][1].decode('string_escape')) - channelId = epgdata[0][2] - startTime, endTime = unescape(epgdata[0][3]).split('
~') + epgdata = re.findall("[\(]?'(.*?)'[,\)]", str(td)) + programName = unescape(epgdata[2].decode('string_escape')) + subprogramName = '' + channelId = epgdata[3] + startTime, endTime = unescape(epgdata[4]).split('
~') startTime = str(today.year) + '/' + startTime startTime = datetime.datetime.strptime(startTime, '%Y/%m/%d %p %I:%M') startTime = startTime.strftime('%Y%m%d%H%M%S') endTime = str(today.year) + '/' + endTime endTime = datetime.datetime.strptime(endTime, '%Y/%m/%d %p %I:%M') endTime = endTime.strftime('%Y%m%d%H%M%S') - category = escape(epgdata[0][4]) - actors = escape(epgdata[0][5]) - producer = escape(epgdata[0][6]) - image = epgdata[0][7] - checkRebroadcast = re.search('rebroadcast', image) - if not (checkRebroadcast is None) : - programName = programName + ' (재방송)' - checkRating = re.findall('7|12|15|19', image) - if len(checkRating) == 0: - rating = '전체 연령 시청가' - else: - rating = '%s세 이상 시청가' % (checkRating[0]) - episode = None - checkEpisode = re.search('(?<=\()[\d]+', programName) - if not (checkEpisode is None): - episode = int(checkEpisode.group()) - desc = programName - if episode : desc = desc + '\n회차 : ' + str(episode) + '회' - desc = desc + '\n장르 : ' + category - if actors : desc = desc + '\n출연 : ' + actors - if producer : desc = desc + '\n제작 : ' + producer - desc = desc + '\n등급 : ' + rating - programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'desc':desc, 'actors':actors, 'producer':producer, 'category':category, 'episode':episode, 'rating':rating} + category = epgdata[5].split('-')[0].strip() + actors = epgdata[6] + producers = epgdata[7] + matches = re.match('^(.*?)\s*(<(.*)>)?(\(([\d,]+)회\))?$', programName) + if not (matches is None): + programName = matches.group(1) if matches.group(1) else '' + subprogramName = matches.group(3) if matches.group(3) else '' + episode = matches.group(5) if matches.group(5) else '' + rating = 0 + for image in td.findAll('img'): + if 'rebroadcast' in image.get('src') : programName = programName + '재방송' + if 'grade' in image.get('src') : rating = int(image.get('src')[22:].replace('.gif','')) + desc = '' + programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'subprogramName':subprogramName, 'desc':desc, 'actors':actors, 'producers':producers, 'category':category, 'episode':episode, 'rating':rating} writeProgram(programdata) # Get EPG data from KT @@ -142,30 +131,33 @@ def GetEPGFromKT(ChannelInfo): url = 'http://tv.olleh.com/renewal_sub/liveTv/pop_schedule_week.asp?ch_name=&ch_no=%s&nowdate=%s&seldate=%s&tab_no=1' % (ServiceId, day, day) u = urllib.urlopen(url).read() data = unicode(u, 'euc-kr', 'ignore').encode('utf-8', 'ignore') - soup = BeautifulSoup(data,'lxml', from_encoding='utf-8') - html = soup.find('table', {'id':'pop_day'}).tbody.findAll('tr') + strainer = SoupStrainer('table', {'id':'pop_day'}) + soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8') + html = soup.find('table', {'id':'pop_day'}).tbody.findAll('tr') if soup.find('table', {'id':'pop_day'}) else '' for row in html: for cell in [row.findAll('td')]: epginfo.append([cell[1].text, str(day) + ' ' + cell[0].text, cell[4].text, cell[2].text]) for epg1, epg2 in zip(epginfo, epginfo[1:]): - programName = epg1[0].decode('string_escape') + programName = '' + subprogrmaName = '' + matches = re.match('^(.*?)( <(.*)>)?$', epg1[0].decode('string_escape')) + if not (matches is None): + programName = matches.group(1) if matches.group(1) else '' + subprogramName = matches.group(3) if matches.group(3) else '' startTime = datetime.datetime.strptime(epg1[1], '%Y-%m-%d %H:%M') startTime = startTime.strftime('%Y%m%d%H%M%S') endTime = datetime.datetime.strptime(epg2[1], '%Y-%m-%d %H:%M') endTime = endTime.strftime('%Y%m%d%H%M%S') - category = escape(epg1[2]) - rating = escape(epg1[3]) - if rating == 'all세 이상': - rating = '전체 연령 시청가' - else: - rating = rating + ' 시청가' - desc = programName + '\n장르 : ' + category + '\n등급 : ' + rating - actors = ''; - producer = ''; - episode = ''; - programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'desc':desc, 'actors':actors, 'producer':producer, 'category':category, 'episode':episode, 'rating':rating} + category = epg1[2] + rating = 0 + matches = re.match('(\d+)', epg1[3]) + if not(matches is None): rating = int(matches.group()) + desc = '' + actors = '' + producers = '' + episode = '' + programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'subprogramName':subprogramName, 'desc':desc, 'actors':actors, 'producers':producers, 'category':category, 'episode':episode, 'rating':rating} writeProgram(programdata) - # Get EPG data from LG def GetEPGFromLG(ChannelInfo): channelId = ChannelInfo[0] @@ -173,30 +165,35 @@ def GetEPGFromLG(ChannelInfo): epginfo = [] for k in range(period): day = today + datetime.timedelta(days=k) - url = 'https://www.uplus.co.kr/css/chgi/chgi/RetrieveTvSchedule.hpi?chnlCd=%s&evntCmpYmd=%s' % (ServiceId, day.strftime('%Y%m%d')) + url = 'http://www.uplus.co.kr/css/chgi/chgi/RetrieveTvSchedule.hpi?chnlCd=%s&evntCmpYmd=%s' % (ServiceId, day.strftime('%Y%m%d')) u = urllib.urlopen(url).read() data = unicode(u, 'euc-kr', 'ignore').encode('utf-8', 'ignore') - soup = BeautifulSoup(data,'lxml', from_encoding='utf-8') + strainer = SoupStrainer('table') + soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8') html = soup.find('table', {'class':'datatable06'}).tbody.findAll('tr') if soup.find('table', {'class':'datatable06'}) else '' for row in html: for cell in [row.findAll('td')]: epginfo.append([cell[1].text.strip(), str(day) + ' ' + cell[0].text, cell[2].text.strip(), cell[1].find('img', alt=True)['alt'].strip()]) for epg1, epg2 in zip(epginfo, epginfo[1:]): - programName = epg1[0].decode('string_escape') + programName = '' + subprogramName = '' + episode = '' + matches = re.match('^(.*?)(\(([\d,]+)회\))?$', epg1[0].decode('string_escape')) + if not (matches is None): + programName = matches.group(1) if matches.group(1) else '' + episode = int(matches.group(3)) if matches.group(3) else '' startTime = datetime.datetime.strptime(epg1[1], "%Y-%m-%d %H:%M") startTime = startTime.strftime("%Y%m%d%H%M%S") endTime = datetime.datetime.strptime(epg2[1], "%Y-%m-%d %H:%M") endTime = endTime.strftime("%Y%m%d%H%M%S") - category = escape(epg1[2]) - rating = escape(epg1[3]) - desc = programName + '\n장르 : ' + category + '\n등급 : ' + rating - actors = ''; - producer = ''; - episode = None - checkEpisode = re.search('(?<=\()[\d]+', programName) - if not (checkEpisode is None): - episode = int(checkEpisode.group()) - programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'desc':desc, 'actors':actors, 'producer':producer, 'category':category, 'episode':episode, 'rating':rating} + category = epg1[2] + rating = 0 + matches = re.match('(\d+)세이상 관람가', epg1[3].encode('utf-8')) + if not(matches is None): rating = int(matches.group(1)) + desc = '' + actors = '' + producers = '' + programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'subprogramName':subprogramName, 'desc':desc, 'actors':actors, 'producers':producers, 'category':category, 'episode':episode, 'rating':rating} writeProgram(programdata) # Get EPG data from SK @@ -209,36 +206,28 @@ def GetEPGFromSK(ChannelInfo): data = json.loads(u, encoding='utf-8') programs = data['channel']['programs'] for program in programs: - programName = program['programName'] - if programName: - programName = programName.replace('(재)', ' (재방송)') - actors = program['actorName'] - if actors: actors = escape(actors) - producer = program['directorName'] - if producer: producer = escape(producer) + programName = '' + subprogramName = '' + episode = '' + rebroadcast = '' + matches = re.match('^(.*?)(?:\s*[\(<]([\d,회]+)[\)>])?(?:\s*<([^<]*?)>)?(\((재)\))?$', program['programName'].replace('...', '>').encode('utf-8')) + if not (matches is None): + programName = matches.group(1).strip() if matches.group(1) else '' + subprogramName = matches.group(3).strip() if matches.group(3) else '' + episode = matches.group(2).replace('회', '') if matches.group(2) else '' + rebroadcast = 'Y' if matches.group(5) else 'N' + if rebroadcast == 'Y': programName = programName + ' (재방송)' + actors = program['actorName'].replace('...','').strip(', ') if program['actorName'] else '' + producers = program['directorName'].replace('...','').strip(', ') if program['directorName'] else '' startTime = datetime.datetime.fromtimestamp(int(program['startTime'])/1000) startTime = startTime.strftime('%Y%m%d%H%M%S') endTime = datetime.datetime.fromtimestamp(int(program['endTime'])/1000) endTime = endTime.strftime('%Y%m%d%H%M%S') - category = program['mainGenreName'] + '-' + program['subGenreName'] - if category: category = escape(category) - rating = program['ratingCd'] - if rating == '0': - rating = '전체 시청가' - else : - rating = '%s세 이상 시청가' % (rating) - episode = None - checkEpisode = re.search('(?<=\()[\d]+', programName) - if not (checkEpisode is None): - episode = int(checkEpisode.group()) - desc = programName - if episode : desc = desc + '\n회차 : ' + str(episode) + '회' - desc = desc + '\n장르 : ' + category - if actors : desc = desc + '\n출연 : ' + actors - if producer : desc = desc + '\n제작 : ' + producer - desc = desc + '\n등급 : ' + rating - if program['synopsis'] : desc = desc + '\n' + program['synopsis'] - programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'desc':desc, 'actors':actors, 'producer':producer, 'category':category, 'episode':episode, 'rating':rating} + category = program['mainGenreName'] + rating = int(program['ratingCd']) if program['programName'] else 0 + desc = '' + if program['synopsis'] : desc = program['synopsis'] + programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'subprogramName':subprogramName, 'desc':desc, 'actors':actors, 'producers':producers, 'category':category, 'episode':episode, 'rating':rating} writeProgram(programdata) # Get EPG data from SKY @@ -252,37 +241,25 @@ def GetEPGFromSKY(ChannelInfo): data = json.loads(u, encoding='utf-8') programs = data['scheduleListIn'] for program in {v['starttime']:v for v in programs}.values(): - programName = unescape(program['program_name']).replace('lt;','<').replace('gt;','>').replace('amp;','&') - rebroadcast = program['rebroad'] + programName = unescape(program['program_name']).replace('lt;','<').replace('gt;','>').replace('amp;','&') if program['program_name'] else '' + subprogramName = unescape(program['program_subname']).replace('lt;','<').replace('gt;','>').replace('amp;','&') if program['program_subname'] else '' + rebroadcast = program['rebroad'] if program['rebroad'] else '' if rebroadcast == 'Y': programName = programName + ' (재방송)' - actors = program['cast'] - if actors: actors = escape(actors) - producer = program['dirt'] - if producer: producer = escape(producer) + actors = program['cast'].replace('...','').strip(', ') if program['cast'] else '' + producers = program['dirt'].replace('...','').strip(', ') if program['dirt'] else '' startTime = program['starttime'] endTime = program['endtime'] - category = program['program_category1'] + '/' + program['program_category2'] - if category: category = escape(category) - rating = escape(program['grade']) - if rating == '0': - rating = '전체 시청가' - else : - rating = '%s세 이상 시청가' % (rating) - episode = program['episode_id'] + category = program['program_category1'] + rating = int(program['grade']) if program['grade'] else '' + episode = program['episode_id'] if program['episode_id'] else '' if episode : episode = int(episode) - description = program['description'] + description = unescape(program['description']).replace('lt;','<').replace('gt;','>').replace('amp;','&') if program['description'] else '' if description: description = unescape(description).replace('lt;','<').replace('gt;','>').replace('amp;','&') - summary = program['summary'] - if summary: summary = unescape(summary).replace('lt;','<').replace('gt;','>').replace('amp;','&') - desc = programName - if episode : desc = desc + '\n회차 : ' + str(episode) + '회' - desc = desc + '\n장르 : ' + category - if actors : desc = desc + '\n출연 : ' + actors - if producer : desc = desc + '\n제작 : ' + producer - desc = desc + '\n등급 : ' + rating - if description: desc = desc + '\n' + description + summary = unescape(program['summary']).replace('lt;','<').replace('gt;','>').replace('amp;','&') if program['summary'] else '' + desc = '' + if description: desc = description if summary : desc = desc + '\n' + summary - programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'desc':desc, 'actors':actors, 'producer':producer, 'category':category, 'episode':episode, 'rating':rating} + programdata = {'channelId':channelId, 'startTime':startTime, 'endTime':endTime, 'programName':programName, 'subprogramName':subprogramName, 'desc':desc, 'actors':actors, 'producers':producers, 'category':category, 'episode':episode, 'rating':rating} writeProgram(programdata) # Write Program @@ -290,33 +267,53 @@ def writeProgram(programdata): channelId = programdata['channelId'] startTime = programdata['startTime'] endTime = programdata['endTime'] - programName = programdata['programName'] - desc = programdata['desc'] - actors = programdata['actors'] - producer = programdata['producer'] - category = programdata['category'] + programName = escape(programdata['programName']) + subprogramName = escape(programdata['subprogramName']) + actors = escape(programdata['actors']) + producers = escape(programdata['producers']) + category = escape(programdata['category']) episode = programdata['episode'] - rating = programdata['rating'] + if programdata['rating'] == 0 : + rating = '전체 관람가' + else : + rating = '%s세 이상 관람가' % (programdata['rating']) + + desc = programName + if subprogramName : desc = desc + '\n부제 : ' + subprogramName + if episode : desc = desc + '\n회차 : ' + str(episode) + '회' + desc = desc + '\n장르 : ' + category + if actors : desc = desc + '\n출연 : ' + actors + if producers : desc = desc + '\n제작 : ' + producers + desc = desc + '\n등급 : ' + rating + if programdata['desc'] : desc = desc + '\n' + escape(programdata['desc']) contentTypeDict={'교양':'Arts / Culture (without music)', '만화':'Cartoons / Puppets', '교육':'Education / Science / Factual topics', '취미':'Leisure hobbies', '드라마':'Movie / Drama', '영화':'Movie / Drama', '음악':'Music / Ballet / Dance', '뉴스':'News / Current affairs', '다큐':'Documentary', '시사/다큐':'Documentary', '연예':'Show / Game show', '스포츠':'Sports', '홈쇼핑':'Advertisement / Shopping'} contentType = '' for key, value in contentTypeDict.iteritems(): if category.startswith(key): contentType = value - print '\t' % (startTime, endTime,channelId) - print '\t\t<![CDATA[%s]]>' % (programName) - print '\t\t' % (desc) - if actors or producer: - print '\t\t' - if actors: print '\t\t\t%s' % (actors) - if producer: print '\t\t\t%s' % (producer) - print '\t\t' - print '\t\t%s' % (category) - print '\t\t%s' % (contentType) + print ' ' % (startTime, endTime,channelId) + print ' %s' % (programName) + if subprogramName : + print ' %s' % (subprogramName) + print ' %s' % (desc) + if actors or producers: + print ' ' + if actors: + for actor in actors.split(','): + if actor: print ' %s' % (actor) + if producers: + for producer in producers.split(','): + if producer: print ' %s' % (producer) + print ' ' + if category: print ' %s' % (category) + if contentType: print ' %s' % (contentType) if episode: - print '\t\t%s' % (episode) - print '\t\t\n\t\t\t%s\n\t\t' % (rating) - print '\t' - + print ' %s' % (episode) + if rating: + print ' ' + print ' %s' % (rating) + print ' ' + print ' ' # Write XML def writeXML(data): print data @@ -344,7 +341,7 @@ if args.iptv: if args.limit: period = args.limit else: - period = default_fetch_limit; + period = default_fetch_limit if args.icon: IconUrl = args.icon @@ -357,9 +354,8 @@ elif args.socket: sock.connect(args.socket) sockfile = sock.makefile('w+') sys.stdout = sockfile - writeXML('') -writeXML('') -writeXML('') +writeXML('\n') +writeXML('') getEpg() writeXML('')