SKB 함수까지 체크

This commit is contained in:
wonipapa 2017-09-01 12:23:52 +09:00
parent 5059499479
commit f40c20e977
2 changed files with 34 additions and 21 deletions

View File

@ -588,10 +588,10 @@ function GetEPGFromLG($ChannelInfo) {
printError($ChannelName.HTTP_ERROR);
else :
$response = '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'.$response;
$dom = new DomDocument;
libxml_use_internal_errors(True);
$response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
$response = str_replace(array('<재>', ' [..', ' (..'), array('&lt;재&gt;', '', ''), $response);
$dom = new DomDocument;
libxml_use_internal_errors(True);
if($dom->loadHTML($response)):
$xpath = new DomXPath($dom);
$query = "//div[@class='tblType list']/table/tbody/tr";
@ -603,7 +603,7 @@ function GetEPGFromLG($ChannelInfo) {
$cells = $row->getElementsByTagName('td');
$startTime = date("YmdHis", strtotime($day." ".trim($cells->item(0)->nodeValue)));
$programName = trim($cells->item(1)->childNodes->item(0)->nodeValue);
$pattern = '/(<재>?)?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$/';
$pattern = '/(<재>)?\s?(?:\[.*?\])?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$/';
preg_match($pattern, $programName, $matches);
if ($matches != NULL) :
if(isset($matches[2])) $programName = trim($matches[2]) ?: "";
@ -726,9 +726,13 @@ function GetEPGFromSKB($ChannelInfo) {
printError($ChannelName.HTTP_ERROR);
else :
$response = str_replace('charset="euc-kr"', 'charset="utf-8"', $response);
$response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
$response = preg_replace('/<!--(.*?)-->/is', '', $response);
$response = preg_replace('/<span><\/span>/is', '', $response);
$pattern = '/<span>(.*)<\/span>/';
$response = preg_replace_callback($pattern, function($matches) { return '<span class="title">'.htmlspecialchars($matches[1], ENT_NOQUOTES).'</span>';}, $response);
$dom = new DomDocument;
libxml_use_internal_errors(True);
$response = mb_convert_encoding($response, "UTF-8", "EUC-KR");
if($dom->loadHTML($response)):
$xpath = new DomXPath($dom);
$query = "//span[@class='caption' or @class='explan' or @class='fullHD' or @class='UHD' or @class='nowon']";
@ -758,7 +762,6 @@ function GetEPGFromSKB($ChannelInfo) {
//ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating
$epginfo[] = array($ChannelId, $startTime, $programName, $subprogramName, $desc, $actors, $producers, $category, $episode, $rebroadcast, $rating);
endforeach;
epgzip($epginfo);
else :
if($GLOBALS['debug']) printError($ChannelName.CONTENT_ERROR);
endif;
@ -767,6 +770,7 @@ function GetEPGFromSKB($ChannelInfo) {
if($GLOBALS['debug']) printError($e->getMessage());
}
endforeach;
epgzip($epginfo);
}
// Get EPG data from SKY
@ -1535,7 +1539,7 @@ function writeProgram($programdata) {
$rating = sprintf("%s세 이상 관람가", $programdata['rating']);
endif;
if($GLOBALS['addverbose'] == 'y') :
$desc = htmlspecialchars($programdata['programName'], ENT_XML1);
$desc = trim(htmlspecialchars($programdata['programName'], ENT_XML1));
if($subprogramName) $desc = $desc."\n부제 : ".$subprogramName;
if($rebroadcast == True && $GLOBALS['addrebroadcast'] == 'y') $desc = $desc."\n방송 : 재방송";
if($episode) $desc = $desc."\n회차 : ".$episode."";

View File

@ -163,7 +163,7 @@ def GetEPGFromEPG(ChannelInfo):
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
pattern = '<td height="25" valign=top >(.*)<\/td>'
data = re.sub(pattern, partial(replacement,txt='title'), data)
data = re.sub(pattern, partial(replacement, tag='td'), data)
strainer = SoupStrainer('table', {'style':'margin-bottom:30'})
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find_all('table', {'style':'margin-bottom:30'})
@ -270,7 +270,7 @@ def GetEPGFromLG(ChannelInfo):
response.raise_for_status()
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
data = data.replace('<재>', '&lt;재&gt;')
data = data.replace('<재>', '&lt;재&gt;').replace(' [..','').replace(' (..', '')
strainer = SoupStrainer('table')
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find('table').tbody.find_all('tr') if soup.find('table') else ''
@ -285,7 +285,7 @@ def GetEPGFromLG(ChannelInfo):
startTime = startTime.strftime('%Y%m%d%H%M%S')
rating = 0 if cell[1].find('span', {'class': 'tag cte_all'}).text.strip()=="All" else int(cell[1].find('span', {'class': 'tag cte_all'}).text.strip())
cell[1].find('span', {'class': 'tagGroup'}).decompose()
pattern = '(<재>?)?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$'
pattern = '(<재>)?\s?(?:\[.*?\])?(.*?)(?:\[(.*)\])?\s?(?:\(([\d,]+)회\))?$'
matches = re.match(pattern, cell[1].text.strip().decode('string_escape'))
if not (matches is None):
programName = matches.group(2).strip() if matches.group(2) else ''
@ -368,6 +368,17 @@ def GetEPGFromSKB(ChannelInfo):
response.raise_for_status()
html_data = response.content
data = unicode(html_data, 'euc-kr', 'ignore').encode('utf-8', 'ignore')
data = re.sub('<!--(.*?)-->', '', data, 0, re.I|re.S)
data = re.sub('<span></span>', '', data)
data = re.sub('<span class="title">', '<span>', data)
data = re.sub('<span class="explan">화면해설</span>','',data)
data = re.sub('<span class="caption">자막방송</span>','',data)
data = re.sub('<span class="fullHD">Full HD</span>','',data)
data = re.sub('<span class="UHD">UHD</span>','',data)
data = re.sub('<span class="nowon">now on</span>','',data)
pattern = '<span>(.*)<\/span>'
data = re.sub(pattern, partial(replacement, tag='span'), data)
#print(data)
strainer = SoupStrainer('div', {'id':'dawn'})
soup = BeautifulSoup(data, 'lxml', parse_only=strainer, from_encoding='utf-8')
html = soup.find_all('li') if soup.find_all('li') else ''
@ -379,9 +390,7 @@ def GetEPGFromSKB(ChannelInfo):
startTime = str(day) + ' ' + row.find('span', {'class':'time'}).text
startTime = datetime.datetime.strptime(startTime, '%Y-%m-%d %H:%M')
startTime = startTime.strftime('%Y%m%d%H%M%S')
if row.find('span', {'class':['caption', 'explan', 'fullHD', 'UHD', 'nowon']}) :
row.find('span', {'class':['caption', 'explan', 'fullHD', 'UHD', 'nowon']}).decompose()
cell = row.find('span', {'class':None}).text.decode('string_escape').strip()
cell = row.find('span', {'class':'title'}).text.decode('string_escape').strip()
pattern = "^(.*?)(\(([\d,]+)회\))?(<(.*)>)?(\((재)\))?$"
matches = re.match(pattern, cell)
if not(matches is None) :
@ -394,13 +403,13 @@ def GetEPGFromSKB(ChannelInfo):
rating = int(rating.text.decode('string_escape').replace('','').strip())
#ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating
epginfo.append([ChannelId, startTime, programName, subprogramName, desc, actors, producers, category, episode, rebroadcast, rating])
epgzip(epginfo)
else:
if(debug): printError(ChannelName + CONTENT_ERROR)
else: pass
except (requests.exceptions.RequestException) as e:
if(debug): printError(ChannelName + str(e))
else: pass
epgzip(epginfo)
# Get EPG data from SKY
def GetEPGFromSKY(ChannelInfo):
@ -510,7 +519,7 @@ def GetEPGFromIscs(ChannelInfo):
try:
data = json.loads(json_data, encoding='utf-8')
pattern = '<td class="name">(.*)<\/td>'
data['html'] = re.sub(pattern, partial(replacement, txt='name'), data['html'])
data['html'] = re.sub(pattern, partial(replacement, tag='td'), data['html'])
strainer = SoupStrainer('tbody')
soup = BeautifulSoup(data['html'], 'lxml', parse_only=strainer)
html = soup.find_all('tr') if soup.find_all('tr') else ''
@ -522,7 +531,7 @@ def GetEPGFromIscs(ChannelInfo):
startTime = str(day) + ' ' + row.find('td', {'class':'time'}).text
startTime = datetime.datetime.strptime(startTime, '%Y-%m-%d %H:%M')
startTime = startTime.strftime('%Y%m%d%H%M%S')
programName = row.find('td', {'class':'name'}).text.decode('string_escape').strip()
programName = row.find('td', {'class':'title'}).text.decode('string_escape').strip()
rating = row.find('span', {'class':'year'}).text.decode('string_escape').strip()
if rating == '전체관람' : rating = 0
else : rating = rating.replace('세이상', ' ')
@ -888,7 +897,7 @@ def writeProgram(programdata):
subprogramName = escape(programdata['subprogramName']).strip()
matches = re.match('(.*) \(?(\d+부)\)?', unescape(programName.encode('utf-8', 'ignore')))
if not(matches is None):
programName = escape(matches.group(1));
programName = escape(matches.group(1)).strip();
subprogramName = escape(matches.group(2)) + ' ' + subprogramName
subprogramName = subprogramName.strip()
if programName is None:
@ -905,7 +914,7 @@ def writeProgram(programdata):
else :
rating = '%s세 이상 관람가' % (programdata['rating'])
if addverbose == 'y':
desc = escape(programdata['programName'])
desc = escape(programdata['programName']).strip()
if subprogramName : desc = desc + '\n부제 : ' + subprogramName
if rebroadcast == True and addrebroadcast == 'y' : desc = desc + '\n방송 : 재방송'
if episode : desc = desc + '\n회차 : ' + str(episode) + ''
@ -917,11 +926,10 @@ def writeProgram(programdata):
desc =''
if programdata['desc'] : desc = desc + '\n' + escape(programdata['desc'])
desc = re.sub(' +',' ', desc)
#desc = re.sub('\s+','\s', desc)
contentTypeDict={'교양':'Arts / Culture (without music)', '만화':'Cartoons / Puppets', '교육':'Education / Science / Factual topics', '취미':'Leisure hobbies', '드라마':'Movie / Drama', '영화':'Movie / Drama', '음악':'Music / Ballet / Dance', '뉴스':'News / Current affairs', '다큐':'Documentary', '라이프':'Documentary', '시사/다큐':'Documentary', '연예':'Show / Game show', '스포츠':'Sports', '홈쇼핑':'Advertisement / Shopping'}
contentType = ''
for key, value in contentTypeDict.iteritems():
if category.startswith(key):
if key in category:
contentType = value
print(' <programme start="%s +0900" stop="%s +0900" channel="%s">' % (startTime, endTime, ChannelId))
print(' <title lang="kr">%s</title>' % (programName))
@ -956,10 +964,11 @@ def printLog(*args):
# Print the given values to standard error, prefixed with "Error : "
def printError(*args):
print("Error : ", *args, file=sys.stderr)
def replacement(match, tag):
    """Rebuild a matched element as ``<tag class="title">text</tag>``.

    Used as the ``repl`` callback of ``re.sub``, with ``tag`` bound through
    ``partial(replacement, tag=...)``. The captured text is unescaped first
    and then its literal ``<`` / ``>`` are re-encoded as ``&lt;`` / ``&gt;``
    (presumably so the HTML parser does not read programme titles such as
    ``<...>`` as markup -- confirm against the scraped pages).

    match -- regex match object whose group 1 is the element's inner text,
             or None
    tag   -- name of the element to emit; surrounding whitespace is ignored

    Returns the rebuilt element string, or '' when ``match`` is None.
    """
    if match is None:
        return ''
    tag = tag.strip()
    # Unescape before re-encoding so already-encoded entities are not doubled.
    programName = unescape(match.group(1)).replace('<', '&lt;').replace('>', '&gt;').strip()
    return '<' + tag + ' class="title">' + programName + '</' + tag + '>'