+ return;
+ }
+ pfl->u.r.dtor(pfl);
+ }
/* Free the field log chunk */
freeListFree(dbevFieldLogFreeList, pfl);
}
After making that change then one time when I started the IOC I got these messages right after iocInit, which is not when it usually fails.
iocexample> db_delete_field_log pfl->u.r.dtor is in A32 address space 0xbf490019
db_delete_C interp: unknown symbol name 'pfl'.
iocexample> field_log pfl->u.r.dtor is in A32 address space 0xbf490019
db_delete_field_C interp: unknown symbol name 'pfl'.
iocexample> log pfl->u.r.dtor is in A32 address space 0xbf518011
db_deletC interp: unknown symbol name 'pfl'.
iocexample> e_field_log pfl->u.r.dtor is in A32 address space 0xbf490019
db_delete_field_lC interp: unknown symbol name 'pfl'.
iocexample> og pfl->u.r.dtor is in A32 address space 0xbf240014
db_delete_field_log C interp: unknown symbol name 'pfl'.
iocexample> pfl->u.r.dtor is in A32 address space 0xbf3e001e
db_deletC interp: unknown symbol name 'pfl'.
iocexample> e_field_log pfl->u.r.dtor is in A32 address space 0xbf3e001e
db_delete_field_lC interp: unknown symbol name 'pfl'.
iocexample> og pfl->u.r.dtor is in A32 address space 0xbf518011
db_delete_field_log C interp: unknown symbol name 'pfl'.
iocexample> pfl->u.r.dtor is in A32 address space 0xbf490019
db_deletC interp: unknown symbol name 'pfl'.
iocexample> e_field_log pfl->u.r.dtor is in A32 address space 0xbf490019
db_delete_field_lC interp: unknown symbol name 'pfl'.
iocexample> og pfl->u.r.dtor is in A32 address space 0xbf3e001e
db_delete_field_log C interp: unknopfl->.r.dtor is in A32 address space 0xbf490019
db_delete_feld_log pfl->u.r.dtor is in A32 address space 0xbf3e001e
db_delete_field_log pfl->u.r.dtor is in A32 address space 0xbf24014
db_delete_field_log pfl->u.r.dtor is in A32 address space0xbf3e001e
db_delete_field_log pfl->u.r.dtor is in A32 address space 0xbf490019
wn symbol name 'pfl'.
Note that those addresses are all 0xbfxxxxxx, which is exactly the address range of all the A32 bus errors I have ever seen.
However, adding those tests has actually made it fail less frequently if at all. In fact, the only failure I have seen is that one time when I got the above
messages right after iocInit. Since then it has run for 1000 seconds without failing three times.
I then simply commented out those new lines and ran the test 3 times. It failed in less than 10 seconds each time. The failures were all in db_delete_field_log.
I then uncommented the address checks and ran it again. It also ran for >1000 s without failing.
I guess there are 2 possibilities:
1.
The address checks change the timing, and a race error is avoided. I wonder if there is a race that sets pfl->u.r.dtor in another thread, and now we are giving
it time to be set correctly?
2.
The address checks change the memory layout, avoiding an uninitialized variable error.
I also made another interesting observation. When I was initially adding the debugging above I was printing an error if pfl->u.r.dtor was in the A32 address space
outside the check for pfl->type == dbfl_type_ref. There were lots of messages, not every event but maybe 10 Hz out of ~400 events/s. That means that the “uninitialized” value of pfl->u.r.dtor when pfl->type != dbfl_tpye_ref is often in the A32 address space.
Mark
-----Original Message-----
From: Till Straumann <till.straumann at psi.ch>
Sent: Wednesday, June 1, 2022 2:57 PM
To: Michael Davidsaver <mdavidsaver at gmail.com>; Mark Rivers
<rivers at cars.uchicago.edu>; Torsten Bögershausen
<Torsten.Bogershausen at ess.eu>; Benjamin Franksen
<benjamin.franksen at helmholtz-berlin.de>
Cc: tech-talk at aps.anl.gov
Subject: Re: Bus errors accessing VME with base 7.0.6.1 and latest synApps modules
On 6/1/22 21:08, Michael Davidsaver wrote:
> On 6/1/22 11:20, Mark Rivers wrote:
>> Hi Till,
>>
>> ØCould you please provide 'dbEvent.o' (from commit 85822f305) ?
>>
>> I’m not sure what you mean. Do you mean send the .o file itself, or
>> do a vxWorks disassembly of that function?
>
> Either. The question is where this invalid address is coming from,
> and how it is being used. I've wondered if this is a jump. eg.
> if garbage address is being executed through "pfl->u.r.dtor(pfl)"
I have wondered the same thing. What chips away at this hypothesis is that Mark observed bus errors reporting addresses that can be read but not written. I don't see any obvious writes in the code leading up to the call to freeListFree().
>
>
>> void db_delete_field_log (db_field_log *pfl)
>> {
>> if (pfl) {
>> /* Free field if reference type field log and dtor is set */
>> if (pfl->type == dbfl_type_ref && pfl->u.r.dtor)
>> pfl->u.r.dtor(pfl);
>> /* Free the field log chunk */
>> freeListFree(dbevFieldLogFreeList, pfl);
>> }
>> }
>
>
>
>
>
>> Ø - check if the bus error is 'real', i.e., try to access the
>> reported VME address e.g., from the console
>> (keeping in mind that A32 as seen from the CPU may be at an
>> offset - depending on OS/BSP).
>>
>> This is the system VME memory map:
>>
>> iocexample> sysVmeMapShow
>>
>> VMEbus access from CPU:
>>
>> 0x80000000 - 0xBFFFFFFF => A32: 0x80000000 - 0xBFFFFFFF
>>
>> 0xF0000000 - 0xF0FFFFFF => A24: 0x000000 - 0xFFFFFF
>>
>> 0xFBFF0000 - 0xFBFFFFFF => A16: 0x0000 - 0xFFFF
>>
>> 0xF2000000 - 0xF2FFFFFF => CR/CSR:0x000000 - 0xFFFFFF
>>
>> PCIbus access from VMEbus:
>>
>> A32: 0x80000000 - 0x9FFFFFFF => RAM: 0x00000000 - 0x1FFFFFFF
>>
>> value = 0 = 0x0
>>
>> The A16 errors are all at addresses like this, i.e. address 0x34xx
>>
>> [Fri May 27 14:05:24 2022] VME Bus Error accessing A16: 0x345e
>>
>> [Fri May 27 14:20:20 2022] VME Bus Error accessing A16: 0x345e
>>
>> [Fri May 27 15:49:10 2022] VME Bus Error accessing A16: 0x345e
>>
>> [Fri May 27 15:49:57 2022] VME Bus Error accessing A16: 0x347e
>>
>> [Fri May 27 16:15:55 2022] VME Bus Error accessing A16: 0x345e
>>
>> [Fri May 27 16:39:39 2022] VME Bus Error accessing A16: 0x345e
>>
>> [Tue May 31 17:59:31 2022] VME Bus Error accessing A16: 0x347e
>>
>> [Tue May 31 22:09:43 2022] VME Bus Error accessing A16: 0x347e
>>
>> The A32 VME bus errors are all at addresses like these, i.e.
>> addresses 0xbfxxxxxx.
>>
>> [Tue May 31 16:09:11 2022] VME Bus Error accessing A32: 0xbfe3332c
>>
>> [Tue May 31 16:11:01 2022] VME Bus Error accessing A32: 0xbfdffff8
>>
>> [Tue May 31 18:47:53 2022] VME Bus Error accessing A32: 0xbffb3330
>>
>> [Tue May 31 18:47:54 2022] VME Bus Error accessing A32: 0xbfe99994
>>
>> [Tue May 31 18:47:54 2022] VME Bus Error accessing A32: 0xbfe692d4
>>
>> [Wed Jun 1 07:29:53 2022] VME Bus Error accessing A32: 0xbffcf67c
>>
>> [Wed Jun 1 11:29:14 2022] VME Bus Error accessing A32: 0xbfa8c418
>>
>> For these tests I only have a single VME card installed, other than
>> the MVME5100 CPU. That card is a TVME200 IP carrier. It is
>> configured with this command:
>>
>> # The second carrier in our system is a TEWS TVME200
>>
>> # The argument to ipacAddTVME200 is the values of the 6 switches on
>> the board
>>
>> # In this case 34 = base address 3400
>>
>> # 2 = interrupt mapping 4, 5, 2, 1, 4, 5, 2, 1
>>
>> # F = A32 address space, 8MB per slot, 32MB total
>>
>> # A2 = A2000000 base address in A32 space
>>
>> ipacAddTVME200("342FA2")
>>
>> This means that the card occupies 0x3400 to 0x37FF in the A16 address
>> space, each of the 4 IP slots used 0x100 bytes. The IP330 module is
>> in slot 0, so it uses addresses 0x3400 – 0x34ff.
>>
>> The VME bus errors are all at address 0x345e or 0x347e. This means
>> they are in the mailbox register space of the IP330 card. Those
>> addresses should not generate a bus error on a read operation. But
>> if it was a write operation then it should generate a bus error.
>> This is a memory dump of the A16 memory for the IP330. Note that we
>> can indeed read address 0x345e and 0x347e.
>>
>> iocexample> d 0xfbff3400
>>
>> NOTE: memory values are displayed in hexadecimal.
>>
>> 0xfbff3400: 2b02 4078 0021 0f00 0000 0000 0000 0000
*+.@x.!..........*
>>
>> 0xfbff3410: 0000 0000 0000 0000 0000 0000 0000 0000 *................*
>>
>> 0xfbff3420: 0000 0000 0000 0000 0000 0000 0000 0000 *................*
>>
>> 0xfbff3430: 0000 0000 0000 0000 0000 0000 0000 0000 *................*
>>
>> 0xfbff3440: 16b8 1701 1701 1700 171b 16fe 170b 16f9 *................*
>>
>> 0xfbff3450: 1703 1702 16fa 16f4 170d 16ed 16fe 16e3 *................*
>>
>> 0xfbff3460: 16b6 169c 16a7 1c2f 1c4a 1c2c 1c36 2138 *......./.J.,.6!8*
>>
>> 0xfbff3470: 2db9 2dae 2d9a 2d86 330a 32e6 32ea 32cc *-.-.-.-.3.2.2.2.*
>>
>> 0xfbff3480: ff49 ff50 ff41 ff43 ffa3 ff11 ff00 ff00 *.I.P.A.C........*
>>
>> 0xfbff3490: ff00 ff00 ff0c ff5a ff00 ff00 ff00 ff00 *.......Z........*
>>
>> 0xfbff34a0: ff00 ff00 ff00 ff00 ff00 ff00 ff00 ff00 *................*
>>
>> 0xfbff34b0: ff00 ff00 ff00 ff00 ff00 ff00 ff00 ff00 *................*
>>
>> 0xfbff34c0: 0054 0000 0054 0000 0054 0000 0054 0000 *.T...T...T...T..*
>>
>> 0xfbff34d0: 0054 0000 0054 0000 0054 0000 0054 0000 *.T...T...T...T..*
>>
>> 0xfbff34e0: 0054 0000 0054 0000 0054 0000 0054 0000 *.T...T...T...T..*
>>
>> 0xfbff34f0: 0054 0000 0054 0000 0054 0000 0054 0000 *.T...T...T...T..*
>>
>> value = 0 = 0x0
>>
>> The A32 address errors are at 0xbfxxxxxx. That is at the very top of
>> the A32 address space. That is not where the TVME200 card is
>> located, it is at 0xA2000000. Those addresses are not readable, and
>> do indeed generate an A32 bus error:
>>
>> iocexample> *(0xbfe3332c)
>>
>> VME Bus Error accessing A32: 0xbfe3332c
>>
>> machine check
>>
>> Exception next instruction address: 0x001fe4f4
>>
>> Machine Status Register: 0x0008b032
>>
>> Condition Register: 0x88000244
>>
>> 0x0012489c vxTaskEntry +0x48 : 0x0020c554 ()
>>
>> 0x0020c554 shellTask +0x4a8: shellExec ()
>>
>> 0x0020c020 shellExec +0x11c: 0x00201a78 ()
>>
>> 0x00201ca0 shellInterpCInit+0x640: shellInterpCparse ()
>>
>> 0x00201384 shellInterpCparse+0xa3c: 0x001fe9f4 ()
>>
>> 0x001fea24 shellInterpClex+0x1d6c: 0x001fe6dc ()
>>
>> 0x001fe7a0 shellInterpClex+0x1ae8: 0x001fe490 ()
>>
>> Shell task 'tShell0' restarted...
>>
>> So the A16 bus errors are at a “valid” address, but perhaps a write
>> rather than a read is being done. The A32 addresses cannot be read,
>> they do generate bus errors.
>>
>> Mark
>>
>> *From:* Till Straumann <till.straumann at psi.ch>
>> *Sent:* Wednesday, June 1, 2022 9:34 AM
>> *To:* Mark Rivers <rivers at cars.uchicago.edu>; Michael Davidsaver
>> <mdavidsaver at gmail.com>; Torsten Bögershausen
>> <Torsten.Bogershausen at ess.eu>; Benjamin Franksen
>> <benjamin.franksen at helmholtz-berlin.de>
>> *Cc:* tech-talk at aps.anl.gov
>> *Subject:* Re: Bus errors accessing VME with base 7.0.6.1 and latest
>> synApps modules
>>
>> Hi Mark.
>>
>> I'm sorry I missed the earlier post where you had indeed added the
>> 'ti' information.
>>
>> Could you please provide 'dbEvent.o' (from commit 85822f305) ?
>>
>> Things I would try:
>>
>> - check if the bus error is 'real', i.e., try to access the
>> reported VME address e.g., from the console
>> (keeping in mind that A32 as seen from the CPU may be at an
>> offset - depending on OS/BSP).
>> - if this is the case then you may deal (as suggested earlier by
>> Michael, iirc) that the error is reported
>> in a delayed fashion. I would try to insert some 'no-op' just
>> before the call to freeListFree() (make
>> sure it is not optimized away) and see where the exception happens.
>> - add a 'sanity-check' of 'pfl' and 'pfl->u.r.dtor' in
>> 'db_delete_field_log()' and report if any of these
>> addresses is within a VME window.
>>
>> I have seen similar things (late-reported bus error) but often we
>> would then find the fault address
>> in one of the registers (or a combination of some 'base' + 'offset'
>> in a couple of them) but I can't
>> see that in your register dump. It is still possible that it was in
>> one of the caller-saved registers
>> and got wiped when the call to 'freeListFree()' was set up.
>>
>> Cheers
>> - T.
>>
>>
>> On 6/1/22 14:50, Mark Rivers via Tech-talk wrote:
>>
>> I agree that a debugger would be very helpful. I don't have that
>> tool, but I think others at the APS do. I'll find out when Andrew
>> returns next week. Meanwhile git bisect can help point in the right
>> direction.
>>
>> I tested this commit 85822f3051d2236144bb46dc2c24b7e38143e531
>> with soft records overnight. It ran for 32,000 seconds without
>> failing before I stopped it.
>>
>> The next commit, 56f05d722dee4b8ca2968b8bface2737a3a9b185, fails
>> in less than 10 seconds in each of 3 tests.
>>
>> I just tested with the IPAC hardware again with commit
>> 85822f3051d2236144bb46dc2c24b7e38143e531 (which does not fail with
>> soft records). It failed after 960 seconds with this error:
>>
>> VME Bus Error accessing A32: 0xbffcf67c
>>
>> machine check
>>
>> Exception next instruction address: 0x03326a90
>>
>> Machine Status Register: 0x0008b032
>>
>> Condition Register: 0x42000884
>>
>> Task: 0x3559330 "CAS-event"
>>
>> 0x3559330 (CAS-event): task 0x3559330 has had a failure and has
>> been stopped.
>>
>> 0x3559330 (CAS-event): The task has been terminated because it
>> triggered an exception that raised the signal 10.
>>
>> This is the task trace:
>>
>> iocexample> tt 0x3559330
>>
>> 0x0012489c vxTaskEntry +0x48 : epicsThreadEntry ()
>>
>> 0x0334501c epicsThreadEntry+0x80 : 0x032a17f0 ()
>>
>> 0x032a1acc db_start_events+0x450: db_delete_field_log ()
>>
>> 0x032a0fe4 db_delete_field_log+0x54 : freeListFree ()
>>
>> value = 0 = 0x0
>>
>> This tells us pretty clearly that the failure as in freeListFree,
>> called by db_delete_field_log. That has been a very common stack
>> trace for these errors.
>>
>> Till complained that vxWorks did not provide a register dump, but
>> that is not true, it is available with the “ti” command. I had
>> included that in several previous messages. Here it is for this task.
>>
>> iocexample> ti 0x3559330
>>
>> NAME ENTRY TID PRI STATUS PC SP
>> ERRNO DELAY
>>
>> ---------- ------------ -------- --- ---------- --------
>> -------- ------- -----
>>
>> CAS-event epicsThread> 3559330 133 STOP+I 3326a90
>> 3559240 0 0
>>
>> full task name : CAS-event
>>
>> task entry : epicsThreadEntry
>>
>> process : kernel
>>
>> options : 0x1009001
>>
>> VX_SUPERVISOR_MODE VX_DEALLOC_TCB VX_FP_TASK
>> VX_DEALLOC_EXC_STACK
>>
>> STACK BASE END SP SIZE HIGH MARGIN
>>
>> --------- -------- -------- -------- ------- ------- -------
>>
>> execution 3559330 3556450 3559240 12000 2800 9200
>>
>> exception 2700210 26ff3a0 3696 1344 2352
>>
>> VxWorks Events
>>
>> --------------
>>
>> Events Pended on : Not Pended
>>
>> Received Events : 0x0
>>
>> Options : N/A
>>
>> r0 = 0x40000000 sp = 0x03559240 r2 =
>> 0x002f1ac0
>>
>> r3 = 0x03492ce0 r4 = 0x0357bec8 r5 =
>> 0x00000001
>>
>> r6 = 0x00000001 r7 = 0x00000000 r8 =
>> 0xeefceefd
>>
>> r9 = 0x03326a90 r10 = 0x00000000 r11 =
>> 0x035591b0
>>
>> r12 = 0x20000884 r13 = 0x00321e60 r14 =
>> 0x033b6098
>>
>> r15 = 0x03373c64 r16 = 0x03289478 r17 =
>> 0x033424f0
>>
>> r18 = 0x03373bdc r19 = 0x03373c1c r20 =
>> 0x032a0ff8
>>
>> r21 = 0x032a0f90 r22 = 0x00000000 r23 =
>> 0x00000001
>>
>> r24 = 0x033b609c r25 = 0x03339340 r26 =
>> 0x033390e0
>>
>> r27 = 0x03443b60 r28 = 0x0357e748 r29 =
>> 0x03567d78
>>
>> r30 = 0x032dad44 r31 = 0x0357bec8 msr =
>> 0x0008b032
>>
>> lr = 0x032a0fe4 ctr = 0x03326a90 pc =
>> 0x03326a90
>>
>> cr = 0x42000884 xer = 0x00000000 pgTblPtr =
>> 0x02310000
>>
>> scSrTblPtr = 0x0230fe74 srTblPtr = 0x0230fe74
>>
>> fpcsr = 0
>>
>> fr0 = -1.81018 fr1 = NaN fr2 = NaN fr3
>> = NaN
>>
>> fr4 = NaN fr5 = NaN fr6 = NaN fr7
>> = NaN
>>
>> fr8 = NaN fr9 = NaN fr10 = NaN fr11
>> = NaN
>>
>> fr12 = NaN fr13 = NaN fr14 = NaN fr15
>> = NaN
>>
>> fr16 = NaN fr17 = NaN fr18 = NaN fr19
>> = NaN
>>
>> fr20 = NaN fr21 = NaN fr22 = NaN fr23
>> = NaN
>>
>> fr24 = NaN fr25 = NaN fr26 = NaN fr27
>> = NaN
>>
>> fr28 = NaN fr29 = NaN fr30 = NaN fr31
>> = NaN
>>
>> machine checkvalue =
>>
>> Exception next instruction address: 0x003326a90
>>
>> Machine Status Register: 0x = 0008b032
>>
>> Condition Register: 0x420008840x
>>
>> 0
>>
>> And here is the disassembly for freeListFree, where I have
>> highlighted in red the "next instruction" above.
>>
>> freeListFree:
>>
>> 0x3326a90 9421fff0 stwu r1,-16(r1)
>>
>> 0x3326a94 7c0802a6 mfspr r0,LR
>>
>> 0x3326a98 3d200334 lis r9,0x334 # 820
>>
>> 0x3326a9c 392990e0 addi r9,r9,0x90e0 # -28448
>>
>> 0x3326aa0 7d2903a6 mtspr CTR,r9
>>
>> 0x3326aa4 90010014 stw r0,20(r1)
>>
>> 0x3326aa8 93c10008 stw r30,8(r1)
>>
>> 0x3326aac 7c9e2378 or r30,r4,r4
>>
>> 0x3326ab0 93e1000c stw r31,12(r1)
>>
>> 0x3326ab4 7c7f1b78 or r31,r3,r3
>>
>> 0x3326ab8 80630014 lwz r3,20(r3)
>>
>> 0x3326abc 4e800421 bcctrl 0x14,0
>>
>> 0x3326ac0 2f830000 cmpi crf7,0,r3,0x0 # 0
>>
>> 0x3326ac4 41be002c bc 0xd,30, 0x3326af0 # 0x03326af0
>>
>> 0x3326ac8 3d200334 lis r9,0x334 # 820
>>
>> 0x3326acc 3c600338 lis r3,0x338 # 824
>>
>> 0x3326ad0 3ca00338 lis r5,0x338 # 824
>>
>> 0x3326ad4 392924f0 addi r9,r9,0x24f0 # 9456
>>
>> 0x3326ad8 386335a4 addi r3,r3,0x35a4 # 13732
>>
>> 0x3326adc 38a535c0 addi r5,r5,0x35c0 # 13760
>>
>> 0x3326ae0 7d2903a6 mtspr CTR,r9
>>
>> 0x3326ae4 3880008f li r4,0x8f # 143
>>
>> 0x3326ae8 38c00000 li r6,0x0 # 0
>>
>> 0x3326aec 4e800421 bcctrl 0x14,0
>>
>> 0x3326af0 813f0010 lwz r9,16(r31)
>>
>> 0x3326af4 801f0008 lwz r0,8(r31)
>>
>> 0x3326af8 39290001 addi r9,r9,0x1 # 1
>>
>> 0x3326afc 807f0014 lwz r3,20(r31)
>>
>> 0x3326b00 913f0010 stw r9,16(r31)
>>
>> 0x3326b04 3d200334 lis r9,0x334 # 820
>>
>> 0x3326b08 39299340 addi r9,r9,0x9340 # -27840
>>
>> 0x3326b0c 901e0000 stw r0,0(r30)
>>
>> 0x3326b10 7d2903a6 mtspr CTR,r9
>>
>> 0x3326b14 93df0008 stw r30,8(r31)
>>
>> 0x3326b18 4e800421 bcctrl 0x14,0
>>
>> 0x3326b1c 80010014 lwz r0,20(r1)
>>
>> 0x3326b20 83c10008 lwz r30,8(r1)
>>
>> 0x3326b24 7c0803a6 mtspr LR,r0
>>
>> 0x3326b28 83e1000c lwz r31,12(r1)
>>
>> 0x3326b2c 38210010 addi r1,r1,0x10 # 16
>>
>> 0x3326b30 4e800020 blr
>>
>> This is the source code for freeListFree in this tag of base.
>>
>> LIBCOM_API void epicsStdCall freeListFree(void *pvt,void*pmem)
>>
>> {
>>
>> FREELISTPVT *pfl = pvt;
>>
>> # ifdef EPICS_FREELIST_DEBUG
>>
>> memset ( pmem, 0xdd, pfl->size );
>>
>> free(pmem);
>>
>> # else
>>
>> void **ppnext;
>>
>> VALGRIND_MEMPOOL_FREE(pvt, pmem);
>>
>> VALGRIND_MEMPOOL_ALLOC(pvt, pmem, sizeof(void*));
>>
>> epicsMutexMustLock(pfl->lock);
>>
>> ppnext = pmem;
>>
>> *ppnext = pfl->head;
>>
>> pfl->head = pmem;
>>
>> pfl->nBlocksAvailable++;
>>
>> epicsMutexUnlock(pfl->lock);
>>
>> # endif
>>
>> }
>>
>> The last thing I will try is to git bisect from this commit
>> 85822f3051d2236144bb46dc2c24b7e38143e531 to R7.0.5 to see where the
>> hardware tests first begin to fail.
>>
>> Mark
>>
>> -----Original Message-----
>> From: Michael Davidsaver <mdavidsaver at gmail.com>
>> <mailto:mdavidsaver at gmail.com>
>> Sent: Wednesday, June 1, 2022 1:40 AM
>> To: Torsten Bögershausen <Torsten.Bogershausen at ess.eu>
>> <mailto:Torsten.Bogershausen at ess.eu>; Mark Rivers
>> <rivers at cars.uchicago.edu> <mailto:rivers at cars.uchicago.edu>;
>> Benjamin Franksen <benjamin.franksen at helmholtz-berlin.de>
>> <mailto:benjamin.franksen at helmholtz-berlin.de>
>> Cc: tech-talk at aps.anl.gov <mailto:tech-talk at aps.anl.gov>
>> Subject: Re: Bus errors accessing VME with base 7.0.6.1 and
>> latest synApps modules
>>
>> On 5/31/22 07:00, Torsten Bögershausen wrote:
>>
>> > Hej Mark,
>>
>> >
>>
>> > So R7.0.5 is good, and
>> f9ea6a5bff695c5f88bb95dce38a3fd349738907 is bad ?
>>
>> >
>>
>> > There are some "real" commits, and merges:
>>
>> > git log R7.0.5..f9ea6a5bff695c5f88bb95dce38a3fd349738907
>>
>> >
>>
>> > Then it could make sense, to bisect between those 2?
>>
>> >
>>
>> > Another question:
>>
>> > Coluld it make sense to run the SW (even more stripped may be)
>> under
>>
>> > Linux instead with valgrind ?
>>
>> An RTOS without a debugger is just about the worst situation I
>> can imagine to troubleshoot apparent memory corruption. (excepting
>> maybe having no console, or it being in orbit)
>>
>> I've made an attempt to run the database Mark describes with
>>
>> softIoc+valgrind, and a spam-y python script. Valgrind doesn't
>>
>> flag any access violates.
>>
>> The threading checker ("valgrind --tool=helgrind") does flag a
>> possible race which might be relevant. Or it might be a false
>> positive. I don't feel that I understand dbEvent.c well enough to
>> say anything conclusive this late at night.
>>
>> > Possible data race during write of size 8 at 0x5197520 by
>> thread #27
>>
>> > Locks held: 3, at addresses 0x51413A0 0x5143A60 0x5197A10
>>
>> > at 0x48CA60A: db_queue_event_log (dbEvent.c:824)
>>
>> > by 0x48CA732: db_post_events (dbEvent.c:892)
>>
>> > by 0x48595EE: monitor (aoRecord.c:532)
>>
>> > by 0x48595EE: process (aoRecord.c:232)
>>
>> > by 0x48B2C6B: dbProcess (dbAccess.c:608)
>>
>> > by 0x48B4BFF: dbPutField (dbAccess.c:1278)
>>
>> > by 0x48CE9DD: dbChannel_put (db_access.c:923)
>>
>> > by 0x48F6C9A: write_action (camessage.c:799)
>>
>> > by 0x48F7727: camessage (camessage.c:2546)
>>
>> > by 0x48F351F: camsgtask (camsgtask.c:116)
>>
>> > by 0x495E094: start_routine (osdThread.c:439)
>>
>> > by 0x483F876: mythread_wrapper (hg_intercepts.c:387)
>>
>> > by 0x4F1EEA6: start_thread (pthread_create.c:477)
>>
>> >
>>
>> > This conflicts with a previous read of size 8 by thread #21 Locks
>>
>> > held: none
>>
>> > at 0x48CA99D: event_read (dbEvent.c:999)
>>
>> > by 0x48CA99D: event_task (dbEvent.c:1078)
>>
>> > by 0x495E094: start_routine (osdThread.c:439)
>>
>> > by 0x483F876: mythread_wrapper (hg_intercepts.c:387)
>>
>> > by 0x4F1EEA6: start_thread (pthread_create.c:477)
>>
>> > by 0x4CBADEE: clone (clone.S:95)
>>
>> > Address 0x5197520 is 18,416 bytes inside a block of size
>> 19,528 alloc'd
>>
>> > at 0x48397CF: malloc (vg_replace_malloc.c:307)
>>
>> > by 0x494DBE1: freeListMalloc (freeListLib.c:95)
>>
>> > by 0x494DCE8: freeListCalloc (freeListLib.c:68)
>>
>> > by 0x48C95B2: db_init_events (dbEvent.c:304)
>>
>> > by 0x48D653C: dbContext::subscribe(epicsGuard<epicsMutex>&,
>> dbChannel*, dbChannelIO&, unsigned int, unsigned long, unsigned int,
>> cacStateNotify&, unsigned int*) (dbContext.cpp:221)
>>
>> > by 0x48D7468:
>> dbChannelIO::subscribe(epicsGuard<epicsMutex>&, unsigned int,
>> unsigned long, unsigned int, cacStateNotify&, unsigned int*)
>> (dbChannelIO.cpp:119)
>>
>> > by 0x4EF7771:
>> oldSubscription::oldSubscription(epicsGuard<epicsMutex>&,
>> oldChannelNotify&, cacChannel&, unsigned int, unsigned long, unsigned
>> int, void (*)(event_handler_args), void*, oldSubscription**)
>> (oldSubscription.cpp:43)
>>
>> > by 0x4EF7099: ca_create_subscription
>> (oldChannelNotify.cpp:573)
>>
>> > by 0x48D2BE8: dbCaTask (dbCa.c:1249)
>>
>> > by 0x495E094: start_routine (osdThread.c:439)
>>
>> > by 0x483F876: mythread_wrapper (hg_intercepts.c:387)
>>
>> > by 0x4F1EEA6: start_thread (pthread_create.c:477) Block was
>>
>> > alloc'd by thread #9
>>
>>
https://github.com/epics-base/epics-base/blob/3fadf4a26cfe33dcb9eb9e4620634e1d3a7b9763/modules/database/src/ioc/db/dbEvent.c#L824
>> <https://github.com/epics-base/epics-base/blob/3fadf4a26cfe33dcb9eb9e4620634e1d3a7b9763/modules/database/src/ioc/db/dbEvent.c#L824>
>>
>>
>>
https://github.com/epics-base/epics-base/blob/3fadf4a26cfe33dcb9eb9e4620634e1d3a7b9763/modules/database/src/ioc/db/dbEvent.c#L999
>> <https://github.com/epics-base/epics-base/blob/3fadf4a26cfe33dcb9eb9e4620634e1d3a7b9763/modules/database/src/ioc/db/dbEvent.c#L999>
>>
>>
>> > $ cat mr-regress.db
>>
>> > record(ao,"testAo_0") {}
>>
>> > record(ao,"testAo_1") {}
>>
>> > record(ao,"testAo_2") {}
>>
>> > record(ao,"testAo_3") {}
>>
>> > record(ao,"testAo_4") {}
>>
>> > record(ao,"testAo_5") {}
>>
>> > record(ao,"testAo_6") {}
>>
>> > record(ao,"testAo_7") {}
>>
>> > record(ao,"testAo_8") {}
>>
>> > record(ao,"testAo_9") {}
>>
>> > record(ai,"testAi_0") {field(INP, "testAo_0 CP") }
>>
>> > record(ai,"testAi_1") {field(INP, "testAo_1 CP") }
>>
>> > record(ai,"testAi_2") {field(INP, "testAo_2 CP") }
>>
>> > record(ai,"testAi_3") {field(INP, "testAo_3 CP") }
>>
>> > record(ai,"testAi_4") {field(INP, "testAo_4 CP") }
>>
>> > record(ai,"testAi_5") {field(INP, "testAo_5 CP") }
>>
>> > record(ai,"testAi_6") {field(INP, "testAo_6 CP") }
>>
>> > record(ai,"testAi_7") {field(INP, "testAo_7 CP") }
>>
>> > record(ai,"testAi_8") {field(INP, "testAo_8 CP") }
>>
>> > record(ai,"testAi_9") {field(INP, "testAo_9 CP") }
>>
>> > $ valgrind --tool=helgrind ./bin/linux-x86_64/softIoc -d
>> mr-regress.db
>>
>> > $ ./bin/linux-x86_64/caput testAo_0 1.0
>>
>> > $ ./bin/linux-x86_64/caput testAo_0 2.0
>>
>