diff --git a/epg2xml.php b/epg2xml.php
index 99b1c85..6e7c82a 100644
--- a/epg2xml.php
+++ b/epg2xml.php
@@ -588,10 +588,10 @@ function GetEPGFromLG($ChannelInfo) {
printError($ChannelName.HTTP_ERROR);
else :
$response = ''.$response;
- $dom = new DomDocument;
- libxml_use_internal_errors(True);
$response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
$response = str_replace(array('<재>', ' [..', ' (..'), array('<재>', '', ''), $response);
+ $dom = new DomDocument;
+ libxml_use_internal_errors(True);
if($dom->loadHTML($response)):
$xpath = new DomXPath($dom);
$query = "//div[@class='tblType list']/table/tbody/tr";
@@ -603,7 +603,7 @@ function GetEPGFromLG($ChannelInfo) {
$cells = $row->getElementsByTagName('td');
$startTime = date("YmdHis", strtotime($day." ".trim($cells->item(0)->nodeValue)));
$programName = trim($cells->item(1)->childNodes->item(0)->nodeValue);
- $pattern = '/(<재>?)?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$/';
+ $pattern = '/(<재>)?\s?(?:\[.*?\])?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$/';
preg_match($pattern, $programName, $matches);
if ($matches != NULL) :
if(isset($matches[2])) $programName = trim($matches[2]) ?: "";
@@ -726,9 +726,13 @@ function GetEPGFromSKB($ChannelInfo) {
printError($ChannelName.HTTP_ERROR);
else :
$response = str_replace('charset="euc-kr"', 'charset="utf-8"', $response);
+ $response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
+ $response = preg_replace('//is', '', $response);
+ $response = preg_replace('/<\/span>/is', '', $response);
+ $pattern = '/(.*)<\/span>/';
+ $response = preg_replace_callback($pattern, function($matches) { return ''.htmlspecialchars($matches[1], ENT_NOQUOTES).'';}, $response);
$dom = new DomDocument;
libxml_use_internal_errors(True);
- $response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
if($dom->loadHTML($response)):
$xpath = new DomXPath($dom);
$query = "//span[@class='caption' or @class='explan' or @class='fullHD' or @class='UHD' or @class='nowon']";
@@ -758,7 +762,6 @@ function GetEPGFromSKB($ChannelInfo) {
//ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating
$epginfo[] = array($ChannelId, $startTime, $programName, $subprogramName, $desc, $actors, $producers, $category, $episode, $rebroadcast, $rating);
endforeach;
- epgzip($epginfo);
else :
if($GLOBALS['debug']) printError($ChannelName.CONTENT_ERROR);
endif;
@@ -767,6 +770,7 @@ function GetEPGFromSKB($ChannelInfo) {
if($GLOBALS['debug']) printError($e->getMessage());
}
endforeach;
+ epgzip($epginfo);
}
// Get EPG data from SKY
@@ -1535,7 +1539,7 @@ function writeProgram($programdata) {
$rating = sprintf("%s세 이상 관람가", $programdata['rating']);
endif;
if($GLOBALS['addverbose'] == 'y') :
- $desc = htmlspecialchars($programdata['programName'], ENT_XML1);
+ $desc = trim(htmlspecialchars($programdata['programName'], ENT_XML1));
if($subprogramName) $desc = $desc."\n부제 : ".$subprogramName;
if($rebroadcast == True && $GLOBALS['addrebroadcast'] == 'y') $desc = $desc."\n방송 : 재방송";
if($episode) $desc = $desc."\n회차 : ".$episode."회";
diff --git a/epg2xml.py b/epg2xml.py
index eca42fa..63038b9 100644
--- a/epg2xml.py
+++ b/epg2xml.py
@@ -163,7 +163,7 @@ def GetEPGFromEPG(ChannelInfo):
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
pattern = '
(.*)<\/td>'
- data = re.sub(pattern, partial(replacement,txt='title'), data)
+ data = re.sub(pattern, partial(replacement, tag='td'), data)
strainer = SoupStrainer('table', {'style':'margin-bottom:30'})
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find_all('table', {'style':'margin-bottom:30'})
@@ -270,7 +270,7 @@ def GetEPGFromLG(ChannelInfo):
response.raise_for_status()
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
- data = data.replace('<재>', '<재>')
+ data = data.replace('<재>', '<재>').replace(' [..','').replace(' (..', '')
strainer = SoupStrainer('table')
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find('table').tbody.find_all('tr') if soup.find('table') else ''
@@ -285,7 +285,7 @@ def GetEPGFromLG(ChannelInfo):
startTime = startTime.strftime('%Y%m%d%H%M%S')
rating = 0 if cell[1].find('span', {'class': 'tag cte_all'}).text.strip()=="All" else int(cell[1].find('span', {'class': 'tag cte_all'}).text.strip())
cell[1].find('span', {'class': 'tagGroup'}).decompose()
- pattern = '(<재>?)?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$'
+ pattern = '(<재>)?\s?(?:\[.*?\])?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$'
matches = re.match(pattern, cell[1].text.strip().decode('string_escape'))
if not (matches is None):
programName = matches.group(2).strip() if matches.group(2) else ''
@@ -368,6 +368,17 @@ def GetEPGFromSKB(ChannelInfo):
response.raise_for_status()
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
+ data = re.sub('', '', data, 0, re.I|re.S)
+ data = re.sub('', '', data)
+ data = re.sub('', '', data)
+ data = re.sub('화면해설','',data)
+ data = re.sub('자막방송','',data)
+ data = re.sub('Full HD','',data)
+ data = re.sub('UHD','',data)
+ data = re.sub('now on','',data)
+ pattern = '(.*)<\/span>'
+ data = re.sub(pattern, partial(replacement, tag='span'), data)
+ #print(data)
strainer = SoupStrainer('div', {'id':'dawn'})
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find_all('li') if soup.find_all('li') else ''
@@ -379,9 +390,7 @@ def GetEPGFromSKB(ChannelInfo):
startTime = str(day) + ' ' + row.find('span', {'class':'time'}).text
startTime = datetime.datetime.strptime(startTime, '%Y-%m-%d %H:%M')
startTime = startTime.strftime('%Y%m%d%H%M%S')
- if row.find('span', {'class':['caption', 'explan', 'fullHD', 'UHD', 'nowon']}) :
- row.find('span', {'class':['caption', 'explan', 'fullHD', 'UHD', 'nowon']}).decompose()
- cell = row.find('span', {'class':None}).text.decode('string_escape').strip()
+ cell = row.find('span', {'class':'title'}).text.decode('string_escape').strip()
pattern = "^(.*?)(\(([\d,]+)회\))?(<(.*)>)?(\((재)\))?$"
matches = re.match(pattern, cell)
if not(matches is None) :
@@ -394,13 +403,13 @@ def GetEPGFromSKB(ChannelInfo):
rating = int(rating.text.decode('string_escape').replace('세','').strip())
#ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating
epginfo.append([ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating])
- epgzip(epginfo)
else:
if(debug): printError(ChannelName + CONTENT_ERROR)
else: pass
except (requests.exceptions.RequestException) as e:
if(debug): printError(ChannelName + str(e))
else: pass
+ epgzip(epginfo)
# Get EPG data from SKY
def GetEPGFromSKY(ChannelInfo):
@@ -510,7 +519,7 @@ def GetEPGFromIscs(ChannelInfo):
try:
data = json.loads(json_data, encoding='utf-8')
pattern = ' | (.*)<\/td>'
- data['html'] = re.sub(pattern, partial(replacement, txt='name'), data['html'])
+ data['html'] = re.sub(pattern, partial(replacement, tag='td'), data['html'])
strainer = SoupStrainer('tbody')
soup = BeautifulSoup(data['html'], 'lxml', parse_only=strainer)
html = soup.find_all('tr') if soup.find_all('tr') else ''
@@ -522,7 +531,7 @@ def GetEPGFromIscs(ChannelInfo):
startTime = str(day) + ' ' + row.find('td', {'class':'time'}).text
startTime = datetime.datetime.strptime(startTime, '%Y-%m-%d %H:%M')
startTime = startTime.strftime('%Y%m%d%H%M%S')
- programName = row.find('td', {'class':'name'}).text.decode('string_escape').strip()
+ programName = row.find('td', {'class':'title'}).text.decode('string_escape').strip()
rating = row.find('span', {'class':'year'}).text.decode('string_escape').strip()
if rating == '전체관람' : rating = 0
else : rating = rating.replace('세이상', ' ')
@@ -888,7 +897,7 @@ def writeProgram(programdata):
subprogramName = escape(programdata['subprogramName']).strip()
matches = re.match('(.*) \(?(\d+부)\)?', unescape(programName.encode('utf-8', 'ignore')))
if not(matches is None):
- programName = escape(matches.group(1));
+ programName = escape(matches.group(1)).strip();
subprogramName = escape(matches.group(2)) + ' ' + subprogramName
subprogramName = subprogramName.strip()
if programName is None:
@@ -905,7 +914,7 @@ def writeProgram(programdata):
else :
rating = '%s세 이상 관람가' % (programdata['rating'])
if addverbose == 'y':
- desc = escape(programdata['programName'])
+ desc = escape(programdata['programName']).strip()
if subprogramName : desc = desc + '\n부제 : ' + subprogramName
if rebroadcast == True and addrebroadcast == 'y' : desc = desc + '\n방송 : 재방송'
if episode : desc = desc + '\n회차 : ' + str(episode) + '회'
@@ -917,11 +926,10 @@ def writeProgram(programdata):
desc =''
if programdata['desc'] : desc = desc + '\n' + escape(programdata['desc'])
desc = re.sub(' +',' ', desc)
- #desc = re.sub('\s+','\s', desc)
contentTypeDict={'교양':'Arts / Culture (without music)', '만화':'Cartoons / Puppets', '교육':'Education / Science / Factual topics', '취미':'Leisure hobbies', '드라마':'Movie / Drama', '영화':'Movie / Drama', '음악':'Music / Ballet / Dance', '뉴스':'News / Current affairs', '다큐':'Documentary', '라이프':'Documentary', '시사/다큐':'Documentary', '연예':'Show / Game show', '스포츠':'Sports', '홈쇼핑':'Advertisement / Shopping'}
contentType = ''
for key, value in contentTypeDict.iteritems():
- if category.startswith(key):
+ if key in category:
contentType = value
print(' ' % (startTime, endTime, ChannelId))
print(' %s' % (programName))
@@ -956,10 +964,11 @@ def printLog(*args):
def printError(*args):
print("Error : ", *args, file=sys.stderr)
-def replacement(match, txt):
+def replacement(match, tag):
if not(match is None):
+ tag = tag.strip()
programName = unescape(match.group(1)).replace('<','<').replace('>','>').strip()
- programName = '' + programName + ' | '
+ programName = '<'+ tag + ' class="title">' + programName + '' + tag + '>'
return programName
else:
return '';
|