"""Clean the raw Google Local reviews dump and extract review fields to CSV.

Pipeline (three passes over intermediate files):

1. ``normalize_quoting`` - rewrite the Python-repr style quoting of the raw
   dump into (approximate) JSON quoting, one observation per line
   (``sample.txt``).
2. ``join_broken_lines`` - observations containing embedded line breaks are
   split over 2-3 physical lines; glue them back together
   (``file_without_line_breaks.txt``).
3. ``extract_reviews`` - regex-extract the individual fields, detect the
   language of each review text, and collect rows for pandas.

Fixes over the original script:

* ``codecs``/``re``/``detect`` imports were missing.
* Intermediate files are now closed (``with``) before being re-read -
  previously the unflushed write buffer made the next pass read an
  empty/incomplete file and silently skip all work.
* ``line.replace(...)`` return values were discarded (``str`` is immutable);
  the result is now re-assigned so the replacements actually take effect.
* The bare ``except:`` handler no longer crashes when the ``reviewText``
  regex did not match (``o`` is ``None``).
"""
import codecs
import re

import pandas as pd

try:
    # Third-party (pip install langdetect); used only for language detection.
    from langdetect import detect
except ImportError:  # keep the module importable without it
    def detect(text):
        raise RuntimeError("langdetect is not installed")

REVIEWS_FILE = ("/home/haida_niejie/work/xhx/zahose/recommender/mmnet/mmnet/"
                "data/google_local/reviews.clean.json")

# NOTE(review): comments in the original claim 1,000,000 observations, but
# the code has always stopped after 10; kept at 10 to preserve behavior.
SAMPLE_LIMIT = 10

COLUMNS = ['Rating', 'ReviewerName', 'ReviewerText', 'Lang', 'Categories',
           'gPlusPlaceId', 'UnixReviewTime', 'ReviewTime', 'gPlusUserId']

# (old, new) pairs applied in this exact order; order matters because later
# patterns match text produced by earlier replacements.
_QUOTE_FIXES = (
    (": u", ":"),
    ("[u", "["),
    (", u", ","),
    ("\n", ""),
    ("}", "}\n"),
    ('"', '\\"'),
    ("'rating':", '"rating":'),
    ("'reviewerName':'", '"reviewerName":"'),
    ("', '", '", "'),
    ("':'", '":"'),
    ("': ['", '": ["'),
    ("'], '", '"], "'),
    ("unixReviewTime': ", 'unixReviewTime": '),
    (", 'reviewTime", ', "reviewTime'),
    ("reviewText':", 'reviewText":'),
    ("','", '","'),
    ('.\\", ', '.", '),
    ('?\\", ', '?", '),
    ("'categories", '"categories'),
    ('":\\"', '":"'),
    ('\\", "', '", "'),
    ("Clothing Store',", 'Clothing Store",'),
    ('",\\"', '","'),
    ('\\",\\"', '","'),
    ('\\"],', '"],'),
    ('\\","', '","'),
    (", 'gPlusPlaceId", ', "gPlusPlaceId'),
    ("categories':", 'categories":'),
    ('": [\\"', '": ["'),
    ("'}", '"}'),
    ("'reviewerName':", '"reviewerName":'),
    (", 'reviewText", ', "reviewText'),
    ('":\\"', '":"'),   # duplicated in the original; harmless, kept verbatim
    ('\\", "', '", "'),
    ('\\"refrigerator\\"', "'refrigerator'"),
)


def clean_raw_line(line):
    """Normalize the quoting of one raw line; pure function of the input.

    NOTE(review): ``encode('ascii')`` raises UnicodeEncodeError on
    non-ASCII input lines, exactly as the original code did.
    """
    z = line.encode('ascii').decode('unicode-escape')
    for old, new in _QUOTE_FIXES:
        z = z.replace(old, new)
    return z


def normalize_quoting(src=REVIEWS_FILE, dst="sample.txt", limit=SAMPLE_LIMIT):
    """Pass 1: rewrite quoting discrepancies into *dst*.

    Stops after *limit* input lines because the raw file is too large to
    process in full.
    """
    with codecs.open(src, "r", encoding="UTF-8") as f, \
            open(dst, "w", encoding="UTF-8", newline="") as y:
        for count, line in enumerate(f, start=1):
            y.write(clean_raw_line(line))
            if count == limit:
                break


def join_broken_lines(src="sample.txt", dst="file_without_line_breaks.txt",
                      limit=SAMPLE_LIMIT):
    """Pass 2: merge observations that were split across physical lines.

    An observation is complete when the second-to-last character of the
    line is ``}``; incomplete lines are accumulated (with their line
    breaks stripped) until the closing line arrives.
    """
    with codecs.open(src, "r", encoding="UTF-8") as f, \
            open(dst, "w", encoding="UTF-8", newline="\n") as y:
        temp = ""
        count = 0
        for line in f:
            count += 1
            if len(line) <= 1:
                continue  # blank line: skipped (but still counted), as before
            if line[-2] == '}':
                # Complete observation: flush accumulator plus this line.
                y.write(temp.strip('\r\n') + line)
                if count == limit:
                    break
                temp = ""
            else:
                # Partial observation: accumulate without its line break.
                temp = temp + line.strip('\r\n')


def extract_reviews(src="file_without_line_breaks.txt"):
    """Pass 3: regex-extract the fields of each observation.

    Returns a list of 9-tuples matching ``COLUMNS``. Observations whose
    language cannot be detected, or whose fields fail to parse, are
    dropped (logged to stdout).
    """
    data = []
    with codecs.open(src, "r", encoding="UTF-8") as x:
        for count, line in enumerate(x, start=1):
            print(count)
            if line.find("rating") < 0 or line.find("gPlusUserId") < 0:
                continue
            # Re-assign: str.replace returns a new string. The original
            # discarded these six results, so they never took effect.
            for old, new in (("'rating'", '"rating"'),
                             ("'reviewerName'", '"reviewerName"'),
                             ("'reviewText'", '"reviewText"'),
                             ("'categories'", '"categories"'),
                             ("'gPlusPlaceId'", '"gPlusPlaceId"'),
                             ("'unixReviewTime'", '"unixReviewTime"')):
                line = line.replace(old, new)
            # Extract each field by the naming-convention pattern around it.
            m = re.search('"rating":(.+?), "reviewerName"', line)
            n = re.search('"reviewerName":(.+?), "reviewText"', line)
            o = re.search('"reviewText":(.+?), "categories"', line)
            p = re.search('"categories":(.+?), "gPlusPlaceId"', line)
            q = re.search('"gPlusPlaceId":(.+?), "unixReviewTime"', line)
            r = re.search('"unixReviewTime":(.+?), "reviewTime', line)
            s = re.search('"reviewTime(.+?)gPlusUserId"', line)
            t = re.search('gPlusUserId":"(.+?)"', line)
            try:
                data.append((m.group(1), n.group(1), o.group(1),
                             detect(o.group(1)), p.group(1), q.group(1),
                             r.group(1), s.group(1), t.group(1)))
            except Exception:
                # Dropped: either a regex failed to match (AttributeError on
                # None) or langdetect could not identify the language.
                print("This row throws and error:",
                      o.group(1) if o else line)
    return data


def main():
    """Run the three passes and write the result CSVs."""
    normalize_quoting()
    join_broken_lines()
    data = extract_reviews()
    result = pd.DataFrame(data, columns=COLUMNS)
    result.to_csv("Review_sentiment.csv")
    # Strip embedded double quotes from every text column (they hinder the
    # downstream text analysis); 'Rating' is left untouched, as before.
    for col in COLUMNS:
        if col != 'Rating':
            result[col] = result[col].replace({'"': ''}, regex=True)
    result = pd.DataFrame(result, columns=COLUMNS)
    result.to_csv("Reviews.csv")
    # print(data['ReviewerText'])


if __name__ == "__main__":
    main()

近期在复现 google local 相关代码的过程中,借用 GitHub 上某位博主的处理代码时,出现了其中一块代码在运行时被直接跳过、没有执行的情况。被跳过的代码块为:

# NOTE(review): snippet quoted verbatim from the article — this is the block
# that was silently skipped because the files written earlier were never
# closed, so their buffered content had not been flushed when this loop
# tried to read the file back.
for line in x:
    count = count+1
    print(count)
# when we parse the data we observe that the data is still not in correct format so we use the replace function and replace all the naming conventions in the format we need and make the data in the format needed.
    if(line.find("rating")>=0 and line.find("gPlusUserId")>=0):
        # NOTE(review): str.replace returns a NEW string (str is immutable);
        # these six return values are discarded, so the replacements below
        # never take effect — they would need `line = line.replace(...)`.
        line.replace("'rating'",'"rating"')
        line.replace("'reviewerName'",'"reviewerName"')
        line.replace("'reviewText'",'"reviewText"')
        line.replace("'categories'",'"categories"')
        line.replace("'gPlusPlaceId'",'"gPlusPlaceId"')
        line.replace("'unixReviewTime'",'"unixReviewTime"')
# so we use regular expression here and extract the data with the help of naming convention pattern.
        m = re.search('"rating":(.+?), "reviewerName"', line)
        n = re.search('"reviewerName":(.+?), "reviewText"', line)
        o = re.search('"reviewText":(.+?), "categories"', line)
        p = re.search('"categories":(.+?), "gPlusPlaceId"', line)
        q = re.search('"gPlusPlaceId":(.+?), "unixReviewTime"', line)
        r = re.search('"unixReviewTime":(.+?), "reviewTime', line)
        s = re.search('"reviewTime(.+?)gPlusUserId"', line)
        t = re.search('gPlusUserId":"(.+?)"', line)
        # we use try and except here because for some observations we are not able to identify the language. so we except those observations
        # NOTE(review): the bare except also swallows any NameError if
        # `detect` was never imported, and the handler's own o.group(1)
        # raises AttributeError when the reviewText regex did not match.
        try: #here we use detect() function to detect in which language the review text is in.
            output= m.group(1),n.group(1),o.group(1),detect(o.group(1)),p.group(1),q.group(1),r.group(1),s.group(1),t.group(1)
            data.append(output)
        except:
            language = "error"# if the function is not able to find the language we drop those exceptions.
            print("This row throws and error:", o.group(1))

该代码块在运行过程中没有被执行。经过排查和改进,只需在该循环的上面加上如下两行代码:

y.close()
f.close()

循环即可正常进行。原因是:上面写入文件之后没有调用 close(),写入的内容还停留在内存缓冲区里、尚未刷新(flush)到磁盘;下面的代码紧接着打开同一个文件读取时,读到的是空文件或不完整的内容,循环体自然一次也没有执行。调用 close()(或改用 with 语句)会把缓冲区刷新到磁盘并关闭文件,保证后续读取到完整数据。

源代码出处

当我们提到一门编程语言的效率时:通常有两层意思,第一是开发效率,这是对程序员而言,完成编码所需要的时间;另一个是运行效率,这是对计算机而言,完成计算任务所需要的时间。编码效率和运行效率往往是鱼与熊掌的关系,是很难同时兼顾的。不同的语言会有不同的侧重,python语言毫无疑问更在乎编码效率,life is short,we use python。 虽然使用python的编程人员都应该接受其运行效率低的事实,但python在越多越来的领域都有广泛应用,比如科学计算 、web服务器等。程序员当然也希望python能够运算得更快,希望python可以更强大。 首先,python相比其他语言具体有多慢,
for语句Python中for语句的特性是一种遍历循环的方法,利用该特性可以更灵活地进行循环设计。for循环的理解:该语句将遍历结构体中的元素一次赋值给循环变量,并执行一次语句块,因此循环次数理论上是有限的。下面通过代码理解几种for循环用法。print('常规遍历:') for i in range(5): print(i)常规遍历print('区间数值遍历:') for i in ra...
Python中for的用法跟C++有点区别,首先是把列表的元素传递给变量,并存储到这个变量中,然后循环打印存储在变量中的元素,别忘记了还有一个冒号,如果要循环输出,需要对后面的代码进行缩进Tab,没有缩进的话,就不属于for循环的范围。 for 变量名 in 列表名      .....语句      print(变量名)
这篇文章主要介绍了如何获取Python简单for循环索引,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下 Python的for循环相比其他语言而言更加简单,比如经常会有如下这样类似的例子。我们可以直接对列表进行遍历,获取列表的某个元素,并对这个元素进行相应的操作。 testList = ['nice', 'to', 'meet', 'you'] for x in testList: print(x) 运行结果如下: 但是,如果我们在上面的for循环中,除了想要获取对应元素x以外,还想知道对于这个元素在列表中的
不懂为啥第一次运行会出错,错了再运行一遍for就出来了! 以上这篇python实现画循环圆就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持软件开发网。 您可能感兴趣的文章:Python实现的圆形绘制(画圆)示例python实现画圆功能简单实现python画圆功能Python 用turtle实现用正方形画圆的例子 private static Scanner sc = new Scanner(System.in); public static void main(String[] args) { whiteMouseSortLogic(); private static void whiteMouseSortLogic() { System.out.println("请输入小白鼠只数:"); int x = sc.nextInt(); 张浩Java考试成绩未达到自己的目标。为了表明自己勤奋学习的决心,他决定写一百遍“好好学习,天天向上!”根据前面学习的内容我们知道System.out.println(“好好学习,天天向上!”)可以将这句话打印出来,但是现在是要打印100次,那是不是我们需要写100次呢?答案肯定是否定的,在java中我们要避免写重读的代码,我们要尽量“偷懒”让代码能重复使用。那如何才能写一次输出语句,让它运行100次呢?接下来我们就要学习流程控制中的--循环结构。 一、什么是循环结构 1.循环简单理解就是重读
要在 Python 中跳过 for 循环的某次迭代,您可以使用 `continue` 关键字。当代码执行到 `continue` 语句时,它会立即跳过当前循环迭代并继续下一个迭代。 以下是一个示例: ```python numbers = [1, 2, 3, 4, 5] for num in numbers: if num == 3: continue print(num) 在上面的代码中,当循环迭代到数字 3 时,`continue` 语句会跳过该次迭代,不执行后续的打印语句,然后继续下一个迭代。因此,输出将是: 希望这可以帮助到您!如果您有任何其他问题,请随时提问。
sklearn.exceptions.NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call ‘fit Refining Graph Representation for Cross-DomainRecommendation Based on EdgePruning in Latent Space笔记 sklearn.exceptions.NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call ‘fit XinyuRen_: 谢谢大大救我狗命 Refining Graph Representation for Cross-DomainRecommendation Based on EdgePruning in Latent Space笔记 jupyterlab误删.py文件恢复 循环中嵌套条件语句,print的内容不显示