blazegraph struggles under load #187
Comments
ERROR WITH JOB: blob-6603611803232263074
Traceback (most recent call last):
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/rq/worker.py", line 700, in perform_job
    rv = job.perform()
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/rq/job.py", line 500, in perform
    self._result = self.func(*self.args, **self.kwargs)
  File "./modules/blazeUploader/reserve_id.py", line 138, in write_reserve_id
    spfyid = reserve_id(query_file)
  File "./modules/blazeUploader/reserve_id.py", line 121, in reserve_id
    largest = check_largest_spfyid()
  File "./modules/blazeUploader/reserve_id.py", line 62, in check_largest_spfyid
    results = sparql.query().convert()
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/SPARQLWrapper/Wrapper.py", line 567, in query
    return QueryResult(self._query())
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/SPARQLWrapper/Wrapper.py", line 537, in _query
    response = urlopener(request)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 429, in open
    response = self._open(req, data)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 447, in _open
    '_open', req)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 407, in _call_chain
    result = func(*args)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 1228, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/opt/conda/envs/backend/lib/python2.7/urllib2.py", line 1201, in do_open
    r = h.getresponse(buffering=True)
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/raven/breadcrumbs.py", line 346, in getresponse
    rv = real_getresponse(self, *args, **kwargs)
  File "/opt/conda/envs/backend/lib/python2.7/httplib.py", line 1121, in getresponse
    response.begin()
  File "/opt/conda/envs/backend/lib/python2.7/httplib.py", line 438, in begin
    version, status, reason = self._read_status()
  File "/opt/conda/envs/backend/lib/python2.7/httplib.py", line 394, in _read_status
    line = self.fp.readline(_MAXLINE + 1)
  File "/opt/conda/envs/backend/lib/python2.7/socket.py", line 480, in readline
    data = self._sock.recv(self._rbufsize)
  File "/opt/conda/envs/backend/lib/python2.7/site-packages/rq/timeouts.py", line 51, in handle_death_penalty
    'value ({0} seconds)'.format(self._timeout))
JobTimeoutException: Job exceeded maximum timeout value (180 seconds) |
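For reference, the call that times out is a plain SPARQLWrapper round-trip; roughly the sketch below (the endpoint URL, predicate and MAX aggregate are placeholders inferred from the function name, not copied from reserve_id.py):
# Sketch of the kind of round-trip check_largest_spfyid() makes; the endpoint
# URL and the spfyId predicate below are placeholders, not the real ones.
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper('http://localhost:9999/blazegraph/sparql')
sparql.setQuery('''
    SELECT (MAX(?id) AS ?largest)
    WHERE { ?s <https://example.org/spfyId> ?id . }
''')
sparql.setReturnFormat(JSON)
# This blocks until Blazegraph answers; under load it outlives the 180 s
# RQ job timeout and the worker raises JobTimeoutException as above.
results = sparql.query().convert()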
It looks like our earlier concern about blazegraph not freeing up memory properly is also seen by other users https://www.linkedin.com/pulse/blazegraph-doesnt-do-windows-paul-houle |
Hoping this issue is only connected to speed of disks on |
To address Could do this via |
May also want to bump up the default timeout in any task that queries blazegraph; could do this by setting it when declaring the
Also, have to decide whether to address this in the production branches of spfy or in master, as this issue is localized to |
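A possible sketch of the timeout bump with RQ (queue name, the 600 s value and the query-file path are illustrative; older RQ takes timeout= on enqueue, newer releases take job_timeout=):
# Sketch only: raise the per-job timeout for anything that touches Blazegraph.
# Queue name, timeout value and file path are illustrative.
from redis import Redis
from rq import Queue
from modules.blazeUploader.reserve_id import write_reserve_id

redis_conn = Redis()
# Higher default for every job placed on this queue...
blaze_q = Queue('blazegraph', connection=redis_conn, default_timeout=600)

# ...or override per job at enqueue time (keyword is `timeout` in the RQ
# versions contemporary with this stack, `job_timeout` in newer releases).
query_file = '/tmp/reserve.rq'  # illustrative path for the query file argument
job = blaze_q.enqueue(write_reserve_id, query_file, timeout=600)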
Had to restart the blazegraph instance today: was hitting timeouts on sparql queries when submitting new subtyping tasks, though the db status option was working. |
As reported by @chadlaing, one of our collaborators recently started to get errors. Looks like the errors are linked to this issue.
|
Perhaps related to the main issue: https://jira.blazegraph.com/browse/BLZG-9003 |
Perhaps relevant: https://jira.blazegraph.com/browse/BLZG-9058 |
Looks like the issue has been solved as of https://github.com/superphy/backend/releases/tag/v5.0.3 with a caveat. Some notes from Slack:
The caveat is that the change to upload graphs via a queue needs to be modified to serialize the graph files, as we've run out of RAM (24 GB) on the VM. The other option would be to allow Redis DB to store temp data to disk. Will verify the inferencing setup and make this change over the weekend. |
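A rough sketch of the serialization idea, assuming the graphs are rdflib Graph objects and that the worker task can take a file path (the temp dir, the turtle format and the upload_graph_file helper are all illustrative):
# Sketch: enqueue a file path instead of an in-memory graph so Redis (and the
# worker) never hold the whole graph in memory.
import os
import tempfile
from rdflib import Graph

def enqueue_graph_upload(graph, queue):
    # Serialize to disk so the queue only stores a short file path.
    fd, path = tempfile.mkstemp(suffix='.ttl', dir='/tmp')
    os.close(fd)
    graph.serialize(destination=path, format='turtle')
    # upload_graph_file is a hypothetical worker task that reads the file,
    # pushes it to Blazegraph, then deletes it.
    return queue.enqueue(upload_graph_file, path)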
Looks like it didn't load the properties file correctly. Will have to debug. |
Must've started blazegraph (& created a bigdata.jnl file) before making proper changes to the defaults. Inferencing is working now. Here's a defaults file:
JETTY_HOME=/Warehouse/Users/claing/jetty
JETTY_USER=claing
JETTY_PORT=9999
JETTY_HOST=192.168.0.1
JETTY_LOGS=/Warehouse/Users/claing/jetty/logs/
JAVA_OPTIONS="-Dcom.bigdata.rdf.sail.webapp.ConfigParams.propertyFile=/Warehouse/Users/claing/RWStore.properties"
What the status should look like:
[claing@superphy claing]$ service jetty status
** WARNING: JETTY_LOGS is Deprecated. Please configure logging within the jetty base.
Jetty running pid=22135
JAVA = /bin/java
JAVA_OPTIONS = -Dcom.bigdata.rdf.sail.webapp.ConfigParams.propertyFile=/Warehouse/Users/claing/RWStore.properties -Djetty.home=/Warehouse/Users/claing/jetty -Djetty.base=/Warehouse/Users/claing/jetty -Djava.io.tmpdir=/tmp
JETTY_HOME = /Warehouse/Users/claing/jetty
JETTY_BASE = /Warehouse/Users/claing/jetty
START_D = /Warehouse/Users/claing/jetty/start.d
START_INI = /Warehouse/Users/claing/jetty/start.ini
JETTY_START = /Warehouse/Users/claing/jetty/start.jar
JETTY_CONF = /Warehouse/Users/claing/jetty/etc/jetty.conf
JETTY_ARGS = jetty.state=/Warehouse/Users/claing/jetty/jetty.state jetty-started.xml
JETTY_RUN = /Warehouse/Users/claing/jetty/jetty
JETTY_PID = /Warehouse/Users/claing/jetty/jetty/jetty.pid
JETTY_START_LOG= /Warehouse/Users/claing/jetty/jetty/jetty-start.log
JETTY_STATE = /Warehouse/Users/claing/jetty/jetty.state
RUN_CMD = /bin/java -Dcom.bigdata.rdf.sail.webapp.ConfigParams.propertyFile=/Warehouse/Users/claing/RWStore.properties -Djetty.home=/Warehouse/Users/claing/jetty -Djetty.base=/Warehouse/Users/claing/jetty -Djava.io.tmpdir=/tmp -jar /Warehouse/Users/claing/jetty/start.jar jetty.state=/Warehouse/Users/claing/jetty/jetty.state jetty-started.xml
Bulk loading time! |
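As a side note, pushing a single RDF file into Blazegraph over the REST endpoint looks roughly like the sketch below (endpoint URL and content type are assumptions based on a default NanoSparqlServer install, not the chunk loader actually used here; Python 2.7 to match the backend):
# Sketch: POST one turtle file straight to the Blazegraph SPARQL endpoint.
# URL and content type are assumptions; adjust for the actual deployment.
import urllib2

def load_turtle(path, endpoint='http://localhost:9999/blazegraph/sparql'):
    with open(path, 'rb') as fh:
        data = fh.read()
    req = urllib2.Request(endpoint, data,
                          {'Content-Type': 'application/x-turtle'})
    # Blazegraph answers with a small HTML body containing the mutationCount.
    return urllib2.urlopen(req).read()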
|
We have some IO optimization already implemented, as @chadlaing had suggested: https://wiki.blazegraph.com/wiki/index.php/IOOptimization https://github.com/superphy/docker-blazegraph/blob/master/2.1.4-inferencing/RWStore.properties
We're also currently using |
Suppressing truth maintenance via the REST API doesn't seem to work without a query parameter:
|
Not sure what this just gave me: https://gist.github.com/kevinkle/45268c4c84a996c2c5dd3d759c326077 |
Maybe this is only a one-time disable thing??? Looks like there's also some SPARQL UPDATE command for this https://wiki.blazegraph.com/wiki/index.php/SPARQL_Update#Manage_truth_maintenance_in_SPARQL_UPDATE |
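If the SPARQL UPDATE route works, sending those commands from Python would look roughly like this (the endpoint URL is an assumption; the command names are the ones documented on the wiki page above; Python 2.7 urllib2 to match the backend):
# Sketch: send the wiki's truth-maintenance commands over the SPARQL 1.1
# update protocol.
import urllib
import urllib2

ENDPOINT = 'http://localhost:9999/blazegraph/sparql'

def sparql_update(command):
    data = urllib.urlencode({'update': command})
    return urllib2.urlopen(urllib2.Request(ENDPOINT, data)).read()

sparql_update('DISABLE ENTAILMENTS;')  # suspend truth maintenance before a bulk load
# ... bulk load here ...
sparql_update('ENABLE ENTAILMENTS;')   # turn it back on
sparql_update('CREATE ENTAILMENTS;')   # recompute the full closure once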
re: #187 (comment) Looks like we're still generating timeouts at the same rate. Will have to check how this can be set. |
Nothing good old DevTools + cURL can't get around:
Response: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"><title>blazegraph™ by SYSTAP</title
></head
><body<p>totalElapsed=215ms, elapsed=13ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p
><hr><p>COMMIT: totalElapsed=1815ms, commitTime=1512928665654, mutationCount=0</p
></html
> |
Queuing up another batch for testing:
[claing@superphy ~]$ cd /docker/
[claing@superphy docker]$ sudo mkdir chunk4
[sudo] password for claing:
[claing@superphy docker]$ sudo chown -R claing:docker chunk4
[claing@superphy docker]$ cd /opt/chunk-pickles/
[claing@superphy chunk-pickles]$ screen
[claing@superphy chunk-pickles]$ python chunk.py -c batch_6310_9465.p -d /docker/chunk4/ |
Loaded this batch:
# nginx has to be stopped before touching the docker-compose
# think it has something to do with the way routing to the containers works
[claing@superphy backend-4.4.0]$ sudo systemctl stop nginx
# change mapping of volume
[claing@superphy backend-4.4.0]$ vim docker-compose.yml
# delete the redis persistence file
[claing@superphy backend-4.4.0]$ cd /docker/
[claing@superphy redis]$ sudo rm appendonly.aof
# bring everything back up
[claing@superphy redis]$ cd /opt/backend-4.4.0/
[claing@superphy backend-4.4.0]$ docker-compose up -d
[claing@superphy backend-4.4.0]$ sudo systemctl start nginx |
Going to work on tuning Blazegraph (likely via branching factors) on a local box while the above is testing on corefacility. |
Checking the branching factors (newM is the recommended value):
ubuntu@host-10-1-5-81:~$ curl -d 'dumpPages' -d 'dumpJournal' 'http://localhost:8080/bigdata/status'
name indexType m height nnodes nleaves nentries nrawRecs nerrors nodeBytes leafBytes rawRecBytes totalBytes avgNodeBytes avgLeafBytes avgRawRecBytes minNodeBytes maxNodeBytes minLeafBytes maxLeafBytes 64 128 192 320 512 768 1024 2048 3072 4096 8192 blobs newM curM
__globalRowStore BTree 32 1 1 3 66 0 0 139 6404 0 6543 139 2134 0139 139 971 3974 0.0 0.0 0.25 0.0 0.0 0.0 0.25 0.25 0.0 0.25 0.0 0.0 803 32
kb.lex.BLOBS BTree 400 1 1 105 27507 27507 0 1419 536517 391801311 392339247 1419 5109 14243 1419 1419 3950 7724 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.009433962264150943 0.0 0.0660377358490566 0.9245283018867925 0.0 1180 400
kb.lex.ID2TERM BTree 400 2 4 679 135940 126070 0 9186 2285517 11773580 14068283 2296 3366 93 84 3732 3265 5071 0.0 0.0014641288433382138 0.0 0.0 0.0 0.0 0.0 0.0 0.0029282576866764276 0.9941434846266471 0.0014641288433382138 0.0 959 400
kb.lex.TERM2ID BTree 400 2 3 566 135940 0 0 19549 7568016 0 7587565 6516 13371 0 163 10061 6319 47559 0.0 0.0 0.0017574692442882249 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.1265377855887522 0.8717047451669596 299 400
kb.spo.JUST BTree 1024 2 12 7438 4045350 0 0 289345 202792383 0 203081728 24112 27264 0 520 36563 8862 71349 0.0 0.0 0.0 0.0 0.0 1.3422818791946307E-4 0.0 0.0 0.0 0.0 0.0 0.9998657718120806 262 1024
kb.spo.OSP BTree 1024 2 10 5637 3354502 0 0 100552 47560227 0 47660779 10055 8437 0 231 15766 3871 24212 0.0 0.0 0.0 1.7708517797060386E-4 0.0 0.0 0.0 0.0 0.0 0.21073136178501858 0.21692934301398972 0.572162210023021 731 1024
kb.spo.POS BTree 1024 2 9 5552 3354502 0 0 100029 27106902 0 27206931 11114 4882 0 218 18414 2818 16447 0.0 0.0 0.0 1.798237727027513E-4 0.0 0.0 0.0 0.0 0.2138104657435713 0.24887610142060781 0.481568063297968 0.05556554576515015 988 1024
kb.spo.SPO BTree 1024 2 11 6056 3354502 0 0 123800 31663320 0 31787120 11254 5228 0 273 20896 3423 10633 0.0 0.0 0.0 1.6482610845557937E-4 0.0 0.0 0.0 0.0 0.0 0.21279050601615296 0.7667710565353552 0.020273611340036263 939 1024
from corefacility:
curl 'http://192.168.0.1:8080/blazegraph/status?dumpJournal&dumpPages' -H 'Pragma: no-cache' -H 'Origin: http://192.168.5.19:8080' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.9,la;q=0.8' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' -H 'Accept: */*' -H 'Cache-Control: no-cache' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' -H 'Referer: http://192.168.0.1:8080/blazegraph/' -H 'DNT: 1' --compressed
name indexType m height nnodes nleaves nentries nrawRecs nerrors nodeBytes leafBytes rawRecBytes totalBytes avgNodeBytes avgLeafBytes avgRawRecBytes minNodeBytes maxNodeBytes minLeafBytes maxLeafBytes 64 128 192 320 512 768 1024 2048 3072 4096 8192 blobs newM curM
__globalRowStore BTree 32 1 1 3 66 0 0 139 6404 0 6543 139 2134 0139 139 971 3974 0.0 0.0 0.25 0.0 0.0 0.0 0.25 0.25 0.0 0.25 0.0 0.0 803 32
kb.lex.BLOBS BTree 400 2 17 3845 1036630 1036630 0 51306 19495651 27581346105 27600893062 3018 5070 26606 269 3601 3768 7506 0.0 0.0 0.0 2.589331952356292E-4 0.0 0.0 0.0 0.0 0.0010357327809425167 0.10227861211807354 0.8964267219057483 0.0 692 400
kb.lex.ID2TERM BTree 400 2 128 25728 5145622 5139450 0 332519 84057361 516142833 600532713 2597 3267 100 2010 4388 3264 3737 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.8675742574257426E-5 0.004834467821782178 0.9950881806930693 3.8675742574257426E-5 0.0 905 400
kb.lex.TERM2ID BTree 400 2 78 19912 5145622 0 0 898267 325617776 0 326516043 11516 16352 0 3810 22772 3604 130533 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.504752376188094E-4 0.006053026513256629 0.9929964982491246 193 400
kb.spo.JUST BTree 1024 2 340 188967 103902351 0 0 6873183 5322237409 0 5329110592 20215 28164 0 7789 40163 8862 74882 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.1657941861632165E-4 0.9997834205813837 284 1024
kb.spo.OSP BTree 1024 2 251 143414 86535527 0 0 2624942 1239165246 0 1241790188 10457 8640 0 5477 19697 3764 23826 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.17524101207670623 0.23344586364110953 0.5913131242821843 708 1024
kb.spo.POS BTree 1024 2 252 145839 86535527 0 0 2648135 728544112 0 731192247 10508 4995 0 5450 21343 2917 15590 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.17228987411955562 0.25579946745521626 0.5135018584307042 0.05840879999452396 990 1024
kb.spo.SPO BTree 1024 2 266 155958 86535527 0 0 3588811 873139445 0 876728256 13491 5598 0 7388 22900 3423 11171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.21875 0.7392846169602622 0.04196538303973781 847 1024
Raw data is at: superphy/docker-blazegraph@c1a13b4
New branchingFactors:
# Set the branching factor for "__globalRowStore" to the specified value.
com.bigdata.namespace.__globalRowStore.com.bigdata.btree.BTree.branchingFactor=32
# Set the branching factor for "kb.lex.BLOBS" to the specified value.
com.bigdata.namespace.kb.lex.BLOBS.com.bigdata.btree.BTree.branchingFactor=692
# Set the branching factor for "kb.lex.ID2TERM" to the specified value.
com.bigdata.namespace.kb.lex.ID2TERM.com.bigdata.btree.BTree.branchingFactor=905
# Set the branching factor for "kb.lex.TERM2ID" to the specified value.
com.bigdata.namespace.kb.lex.TERM2ID.com.bigdata.btree.BTree.branchingFactor=193
# Set the branching factor for "kb.spo.JUST" to the specified value.
com.bigdata.namespace.kb.spo.JUST.com.bigdata.btree.BTree.branchingFactor=284
# Set the branching factor for "kb.spo.OSP" to the specified value.
com.bigdata.namespace.kb.spo.OSP.com.bigdata.btree.BTree.branchingFactor=708
# Set the branching factor for "kb.spo.POS" to the specified value.
com.bigdata.namespace.kb.spo.POS.com.bigdata.btree.BTree.branchingFactor=990
# Set the branching factor for "kb.spo.SPO" to the specified value.
com.bigdata.namespace.kb.spo.SPO.com.bigdata.btree.BTree.branchingFactor=1024
The new factors vs the current ones:
Tangential relation blazegraph/database#9 |
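Side note: the branchingFactor overrides above can be generated straight from the dumpPages output; a small sketch assuming the whitespace-separated layout shown earlier (name is the first column, newM the second-to-last):
# Sketch: derive branchingFactor overrides from the dumpPages table above.
def branching_factor_overrides(dump_text):
    out = []
    for row in dump_text.splitlines():
        fields = row.split()
        if len(fields) < 3 or fields[1] != 'BTree':
            continue  # skip the header row and any non-index lines
        name, new_m = fields[0], fields[-2]
        out.append('com.bigdata.namespace.%s.com.bigdata.btree.BTree.branchingFactor=%s'
                   % (name, new_m))
    return '\n'.join(out)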
Looks like no luck with the |
Swapped in the new branching factors and loaded a test set now. Also configured the |
|
The first 3155 genomes loaded fine, but this is as before. Loading in the second set. |
Looks like the errors we still see trace back to two things:
Before, our assumption was that we had too much concurrency for blazegraph to handle (uploads were handled by the task worker that completed the job), at least while it ran on our LFS. At the moment, we only have 2 concurrent processes which interface with blazegraph (above). An optimistic outlook would be that moving
Will make the changes to the uploading and see. A follow-up would be to cache some info on identifiers (or whatever we can) in redis and not go back and forth with blazegraph queries as much. |
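A rough sketch of that redis caching follow-up (key prefix, TTL and the query_blazegraph_for_spfyid helper are illustrative names, not existing code):
# Sketch: read-through Redis cache in front of a Blazegraph identifier lookup.
from redis import StrictRedis

r = StrictRedis()

def cached_spfyid(genome_uri, ttl=3600):
    key = 'spfyid:' + genome_uri
    hit = r.get(key)
    if hit is not None:
        return hit
    value = query_blazegraph_for_spfyid(genome_uri)  # hypothetical SPARQL helper
    r.set(key, value, ex=ttl)  # cache it so repeat requests skip Blazegraph
    return value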
|
We've now offshored the indexing of spfyids into MongoDB, as well as the storage of the largest current spfyid. Blazegraph retrieval will still be needed for tasks like phylotyper or group comparisons, but this should solve ~90% of the issues. PR: #284 |
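For reference, the usual MongoDB pattern for this is an atomic $inc counter, roughly as below (database, collection and field names are illustrative, not necessarily what the PR uses):
# Sketch: reserve the next spfyid from an atomic counter in MongoDB.
from pymongo import MongoClient, ReturnDocument

counters = MongoClient('mongodb://localhost:27017/').spfy.counters

def reserve_spfyid():
    # find_one_and_update with $inc is atomic, so concurrent workers can
    # never be handed the same id.
    doc = counters.find_one_and_update(
        {'_id': 'spfyid'},
        {'$inc': {'seq': 1}},
        upsert=True,
        return_document=ReturnDocument.AFTER,
    )
    return doc['seq']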